@@ -151,6 +151,7 @@
#define MI_BATCH_GTT (2<<6) /* aliased with (1<<7) on gen4 */
#define MI_BATCH_BUFFER_START_GEN8 MI_INSTR(0x31, 1)
#define MI_BATCH_RESOURCE_STREAMER (1<<10)
+#define MI_BATCH_PREDICATE (1 << 15) /* HSW+ on RCS only*/
/*
* 3D instructions used by the kernel
@@ -226,6 +227,29 @@
#define PIPE_CONTROL_DEPTH_CACHE_FLUSH (1<<0)
#define PIPE_CONTROL_GLOBAL_GTT (1<<2) /* in addr dword */
+#define MI_MATH(x) MI_INSTR(0x1a, (x)-1)
+#define MI_ALU_OP(op, src1, src2) (((op) << 20) | ((src1) << 10) | (src2))
+/* operands */
+#define MI_ALU_OP_NOOP 0
+#define MI_ALU_OP_LOAD 128
+#define MI_ALU_OP_LOADINV 1152
+#define MI_ALU_OP_LOAD0 129
+#define MI_ALU_OP_LOAD1 1153
+#define MI_ALU_OP_ADD 256
+#define MI_ALU_OP_SUB 257
+#define MI_ALU_OP_AND 258
+#define MI_ALU_OP_OR 259
+#define MI_ALU_OP_XOR 260
+#define MI_ALU_OP_STORE 384
+#define MI_ALU_OP_STOREINV 1408
+/* sources */
+#define MI_ALU_SRC_REG(x) (x) /* 0 -> 15 */
+#define MI_ALU_SRC_SRCA 32
+#define MI_ALU_SRC_SRCB 33
+#define MI_ALU_SRC_ACCU 49
+#define MI_ALU_SRC_ZF 50
+#define MI_ALU_SRC_CF 51
+
/*
* Commands used only by the command parser
*/
@@ -73,6 +73,11 @@ enum intel_gt_scratch_field {
/* 8 bytes */
INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA = 256,
+ /* 6 * 8 bytes */
+ INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR = 2048,
+
+ /* 4 bytes */
+ INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1 = 2096,
};
#endif /* __INTEL_GT_TYPES_H__ */
@@ -3653,6 +3653,36 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops,
i915_wedged_get, i915_wedged_set,
"%llu\n");
+static int
+i915_perf_noa_delay_set(void *data, u64 val)
+{
+ struct drm_i915_private *i915 = data;
+
+ /* This would lead to infinite waits as we're doing timestamp
+ * difference on the CS with only 32bits.
+ */
+ if (val > ((1ul << 32) - 1) * RUNTIME_INFO(i915)->cs_timestamp_frequency_khz)
+ return -EINVAL;
+
+ atomic64_set(&i915->perf.oa.noa_programming_delay, val);
+ return 0;
+}
+
+static int
+i915_perf_noa_delay_get(void *data, u64 *val)
+{
+ struct drm_i915_private *i915 = data;
+
+ *val = atomic64_read(&i915->perf.oa.noa_programming_delay);
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(i915_perf_noa_delay_fops,
+ i915_perf_noa_delay_get,
+ i915_perf_noa_delay_set,
+ "%llu\n");
+
+
#define DROP_UNBOUND BIT(0)
#define DROP_BOUND BIT(1)
#define DROP_RETIRE BIT(2)
@@ -4418,6 +4448,7 @@ static const struct i915_debugfs_files {
const char *name;
const struct file_operations *fops;
} i915_debugfs_files[] = {
+ {"i915_perf_noa_delay", &i915_perf_noa_delay_fops},
{"i915_wedged", &i915_wedged_fops},
{"i915_cache_sharing", &i915_cache_sharing_fops},
{"i915_gem_drop_caches", &i915_drop_caches_fops},
@@ -1834,6 +1834,14 @@ struct drm_i915_private {
struct i915_oa_ops ops;
const struct i915_oa_format *oa_formats;
+
+ /**
+ * A batch buffer doing a wait on the GPU for the NOA
+ * logic to be reprogrammed.
+ */
+ struct i915_vma *noa_wait;
+
+ atomic64_t noa_programming_delay;
} oa;
} perf;
@@ -197,6 +197,7 @@
#include "gem/i915_gem_context.h"
#include "gem/i915_gem_pm.h"
+#include "gt/intel_gt.h"
#include "gt/intel_lrc_reg.h"
#include "i915_drv.h"
@@ -429,7 +430,7 @@ static int alloc_oa_config_buffer(struct drm_i915_private *i915,
MI_LOAD_REGISTER_IMM_MAX_REGS) * 4;
config_length += oa_config->flex_regs_len * 8;
}
- config_length += 4; /* MI_BATCH_BUFFER_END */
+ config_length += 12; /* MI_BATCH_BUFFER_START into noa_wait loop */
config_length = ALIGN(config_length, I915_GTT_PAGE_SIZE);
bo = i915_gem_object_create_shmem(i915, config_length);
@@ -446,7 +447,12 @@ static int alloc_oa_config_buffer(struct drm_i915_private *i915,
cs = write_cs_mi_lri(cs, oa_config->b_counter_regs, oa_config->b_counter_regs_len);
cs = write_cs_mi_lri(cs, oa_config->flex_regs, oa_config->flex_regs_len);
- *cs++ = MI_BATCH_BUFFER_END;
+
+ /* Jump into the NOA wait busy loop. */
+ *cs++ = (INTEL_GEN(i915) < 8 ?
+ MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8);
+ *cs++ = i915_ggtt_offset(i915->perf.oa.noa_wait);
+ *cs++ = 0;
i915_gem_object_flush_map(bo);
i915_gem_object_unpin_map(bo);
@@ -1467,6 +1473,7 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
mutex_lock(&dev_priv->drm.struct_mutex);
dev_priv->perf.oa.exclusive_stream = NULL;
dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
+ i915_vma_unpin_and_release(&dev_priv->perf.oa.noa_wait, 0);
mutex_unlock(&dev_priv->drm.struct_mutex);
free_oa_buffer(dev_priv);
@@ -1653,6 +1660,201 @@ static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
return ret;
}
+static u32 *save_restore_register(struct drm_i915_private *i915, u32 *cs,
+ bool save, i915_reg_t reg, u32 offset,
+ u32 dword_count)
+{
+ uint32_t d;
+
+ for (d = 0; d < dword_count; d++) {
+ if (save) {
+ *cs++ = INTEL_GEN(i915) >= 8 ?
+ MI_STORE_REGISTER_MEM_GEN8 :
+ MI_STORE_REGISTER_MEM;
+ } else {
+ *cs++ = INTEL_GEN(i915) >= 8 ?
+ MI_LOAD_REGISTER_MEM_GEN8 :
+ MI_LOAD_REGISTER_MEM;
+ }
+ *cs++ = i915_mmio_reg_offset(reg) + 4 * d;
+ *cs++ = intel_gt_scratch_offset(&i915->gt, offset) + 4 * d;
+ *cs++ = 0;
+ }
+
+ return cs;
+}
+
+static int alloc_noa_wait(struct drm_i915_private *i915)
+{
+ struct drm_i915_gem_object *bo;
+ struct i915_vma *vma;
+ const u64 delay_ticks = 0xffffffffffffffff -
+ DIV64_U64_ROUND_UP(
+ atomic64_read(&i915->perf.oa.noa_programming_delay) *
+ RUNTIME_INFO(i915)->cs_timestamp_frequency_khz,
+ 1000000ull);
+ u32 *batch, *ts0, *cs, *jump;
+ int ret, i;
+ enum { START_TS, NOW_TS, DELTA_TS, JUMP_PREDICATE, DELTA_TARGET, N_CS_GPR };
+
+ bo = i915_gem_object_create_internal(i915, 4096);
+ if (IS_ERR(bo)) {
+ DRM_ERROR("Failed to allocate NOA wait batchbuffer\n");
+ return PTR_ERR(bo);
+ }
+
+ /*
+ * We pin in GGTT because we jump into this buffer now because
+ * multiple OA config BOs will have a jump to this address and it
+ * needs to be fixed during the lifetime of the i915/perf stream.
+ */
+ vma = i915_gem_object_ggtt_pin(bo, NULL, 0, 4096, 0);
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ goto err_unref;
+ }
+
+ batch = cs = i915_gem_object_pin_map(bo, I915_MAP_WB);
+ if (IS_ERR(batch)) {
+ ret = PTR_ERR(batch);
+ goto err_unpin;
+ }
+
+ /* Save registers. */
+ for (i = 0; i < N_CS_GPR; i++) {
+ cs = save_restore_register(
+ i915, cs, true /* save */, HSW_CS_GPR(i),
+ INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2);
+ }
+ cs = save_restore_register(
+ i915, cs, true /* save */, MI_PREDICATE_RESULT_1,
+ INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1);
+
+ /* First timestamp snapshot location. */
+ ts0 = cs;
+
+ /*
+ * Initial snapshot of the timestamp register to implement the wait.
+ * We work with 32b values, so clear out the top 32b bits of the
+ * register because the ALU works 64bits.
+ */
+ *cs++ = MI_LOAD_REGISTER_IMM(1);
+ *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(START_TS)) + 4;
+ *cs++ = 0;
+ *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
+ *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(RENDER_RING_BASE));
+ *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(START_TS));
+
+ /*
+ * This is the location we're going to jump back into until the
+ * required amount of time has passed.
+ */
+ jump = cs;
+
+ /*
+ * Take another snapshot of the timestamp register. Take care to clear
+ * up the top 32bits of CS_GPR(1) as we're using it for other
+ * operations below.
+ */
+ *cs++ = MI_LOAD_REGISTER_IMM(1);
+ *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(NOW_TS)) + 4;
+ *cs++ = 0;
+ *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
+ *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(RENDER_RING_BASE));
+ *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(NOW_TS));
+
+ /*
+ * Do a diff between the 2 timestamps and store the result back into
+ * CS_GPR(1).
+ */
+ *cs++ = MI_MATH(5);
+ *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCA, MI_ALU_SRC_REG(NOW_TS));
+ *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCB, MI_ALU_SRC_REG(START_TS));
+ *cs++ = MI_ALU_OP(MI_ALU_OP_SUB, 0, 0);
+ *cs++ = MI_ALU_OP(MI_ALU_OP_STORE, MI_ALU_SRC_REG(DELTA_TS), MI_ALU_SRC_ACCU);
+ *cs++ = MI_ALU_OP(MI_ALU_OP_STORE, MI_ALU_SRC_REG(JUMP_PREDICATE), MI_ALU_SRC_CF);
+
+ /*
+ * Transfer the carry flag (set to 1 if ts1 < ts0, meaning the
+ * timestamp have rolled over the 32bits) into the predicate register
+ * to be used for the predicated jump.
+ */
+ *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
+ *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(JUMP_PREDICATE));
+ *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1);
+
+ /* Restart from the beginning if we had timestamps roll over. */
+ *cs++ = (INTEL_GEN(i915) < 8 ?
+ MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8) |
+ MI_BATCH_PREDICATE;
+ *cs++ = i915_ggtt_offset(vma) + (ts0 - batch) * 4;
+ *cs++ = 0;
+
+ /*
+ * Now add the diff between to previous timestamps and add it to :
+ * (((1 * << 64) - 1) - delay_ns)
+ *
+ * When the Carry Flag contains 1 this means the elapsed time is
+ * longer than the expected delay, and we can exit the wait loop.
+ */
+ *cs++ = MI_LOAD_REGISTER_IMM(2);
+ *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(DELTA_TARGET));
+ *cs++ = lower_32_bits(delay_ticks);
+ *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(DELTA_TARGET)) + 4;
+ *cs++ = upper_32_bits(delay_ticks);
+
+ *cs++ = MI_MATH(4);
+ *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCA, MI_ALU_SRC_REG(DELTA_TS));
+ *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCB, MI_ALU_SRC_REG(DELTA_TARGET));
+ *cs++ = MI_ALU_OP(MI_ALU_OP_ADD, 0, 0);
+ *cs++ = MI_ALU_OP(MI_ALU_OP_STOREINV, MI_ALU_SRC_REG(JUMP_PREDICATE), MI_ALU_SRC_CF);
+
+ /*
+ * Transfer the result into the predicate register to be used for the
+ * predicated jump.
+ */
+ *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
+ *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(JUMP_PREDICATE));
+ *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1);
+
+ /* Predicate the jump. */
+ *cs++ = (INTEL_GEN(i915) < 8 ?
+ MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8) |
+ MI_BATCH_PREDICATE;
+ *cs++ = i915_ggtt_offset(vma) + (jump - batch) * 4;
+ *cs++ = 0;
+
+ /* Restore registers. */
+ for (i = 0; i < N_CS_GPR; i++) {
+ cs = save_restore_register(
+ i915, cs, false /* save */, HSW_CS_GPR(i),
+ INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2);
+ }
+ cs = save_restore_register(
+ i915, cs, false /* save */, MI_PREDICATE_RESULT_1,
+ INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1);
+
+ /* And return to the ring. */
+ *cs++ = MI_BATCH_BUFFER_END;
+
+ GEM_BUG_ON((cs - batch) > (PAGE_SIZE / sizeof(*batch)));
+
+ i915_gem_object_flush_map(bo);
+ i915_gem_object_unpin_map(bo);
+
+ i915->perf.oa.noa_wait = vma;
+
+ return 0;
+
+err_unpin:
+ __i915_vma_unpin(vma);
+
+err_unref:
+ i915_gem_object_put(bo);
+
+ return ret;
+}
+
static void config_oa_regs(struct drm_i915_private *dev_priv,
const struct i915_oa_reg *regs,
u32 n_regs)
@@ -2205,6 +2407,12 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
goto err_config;
}
+ ret = alloc_noa_wait(dev_priv);
+ if (ret) {
+ DRM_DEBUG("Unable to allocate NOA wait batch buffer\n");
+ goto err_noa_wait_alloc;
+ }
+
/* PRM - observability performance counters:
*
* OACONTROL, performance counter enable, note:
@@ -2257,6 +2465,13 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL);
intel_runtime_pm_put(&dev_priv->runtime_pm, stream->wakeref);
+ mutex_lock(&dev_priv->drm.struct_mutex);
+ i915_vma_unpin_and_release(&dev_priv->perf.oa.noa_wait, 0);
+ mutex_unlock(&dev_priv->drm.struct_mutex);
+
+err_noa_wait_alloc:
+ put_oa_config(stream->oa_config);
+
err_config:
if (stream->ctx)
oa_put_render_ctx_id(stream);
@@ -3641,6 +3856,9 @@ void i915_perf_init(struct drm_i915_private *dev_priv)
mutex_init(&dev_priv->perf.metrics_lock);
idr_init(&dev_priv->perf.metrics_idr);
+ atomic64_set(&dev_priv->perf.oa.noa_programming_delay,
+ 500 * 1000 /* 500us */);
+
dev_priv->perf.initialized = true;
}
}
@@ -567,7 +567,9 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
#define MI_PREDICATE_SRC0_UDW _MMIO(0x2400 + 4)
#define MI_PREDICATE_SRC1 _MMIO(0x2408)
#define MI_PREDICATE_SRC1_UDW _MMIO(0x2408 + 4)
-
+#define MI_PREDICATE_DATA _MMIO(0x2410)
+#define MI_PREDICATE_RESULT _MMIO(0x2418)
+#define MI_PREDICATE_RESULT_1 _MMIO(0x241c)
#define MI_PREDICATE_RESULT_2 _MMIO(0x2214)
#define LOWER_SLICE_ENABLED (1 << 0)
#define LOWER_SLICE_DISABLED (0 << 0)