[v9,3/9] drm/i915/perf: implement active wait for noa configurations

Message ID	20190711094638.8022-4-lionel.g.landwerlin@intel.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <intel-gfx-bounces@lists.freedesktop.org> From: Lionel Landwerlin <lionel.g.landwerlin@intel.com> To: intel-gfx@lists.freedesktop.org Date: Thu, 11 Jul 2019 12:46:32 +0300 Message-Id: <20190711094638.8022-4-lionel.g.landwerlin@intel.com> In-Reply-To: <20190711094638.8022-1-lionel.g.landwerlin@intel.com> References: <20190711094638.8022-1-lionel.g.landwerlin@intel.com> MIME-Version: 1.0 Subject: [Intel-gfx] [PATCH v9 3/9] drm/i915/perf: implement active wait for noa configurations Precedence: list Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>
Series	drm/i915: Vulkan performance query support \| expand [v9,0/9] drm/i915: Vulkan performance query support [v9,1/9] drm/i915/perf: introduce a versioning of the i915-perf uapi [v9,2/9] drm/i915/perf: allow for CS OA configs to be created lazily [v9,3/9] drm/i915/perf: implement active wait for noa configurations [v9,4/9] drm/i915: introduce a mechanism to extend execbuf2 [v9,5/9] drm/i915: add syncobj timeline support [v9,6/9] drm/i915: add a new perf configuration execbuf parameter [v9,7/9] drm/i915/perf: allow holding preemption on filtered ctx [v9,8/9] drm/i915/perf: execute OA configuration from command stream [v9,9/9] drm/i915: add support for perf configuration queries

diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h index e7eff9db343e..4a66af38c87b 100644 --- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h +++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h @@ -151,6 +151,7 @@ #define MI_BATCH_GTT (2<<6) /* aliased with (1<<7) on gen4 */ #define MI_BATCH_BUFFER_START_GEN8 MI_INSTR(0x31, 1) #define MI_BATCH_RESOURCE_STREAMER (1<<10) +#define MI_BATCH_PREDICATE (1 << 15) /* HSW+ on RCS only*/ /* * 3D instructions used by the kernel @@ -226,6 +227,29 @@ #define PIPE_CONTROL_DEPTH_CACHE_FLUSH (1<<0) #define PIPE_CONTROL_GLOBAL_GTT (1<<2) /* in addr dword */ +#define MI_MATH(x) MI_INSTR(0x1a, (x)-1) +#define MI_ALU_OP(op, src1, src2) (((op) << 20) | ((src1) << 10) | (src2)) +/* operands */ +#define MI_ALU_OP_NOOP 0 +#define MI_ALU_OP_LOAD 128 +#define MI_ALU_OP_LOADINV 1152 +#define MI_ALU_OP_LOAD0 129 +#define MI_ALU_OP_LOAD1 1153 +#define MI_ALU_OP_ADD 256 +#define MI_ALU_OP_SUB 257 +#define MI_ALU_OP_AND 258 +#define MI_ALU_OP_OR 259 +#define MI_ALU_OP_XOR 260 +#define MI_ALU_OP_STORE 384 +#define MI_ALU_OP_STOREINV 1408 +/* sources */ +#define MI_ALU_SRC_REG(x) (x) /* 0 -> 15 */ +#define MI_ALU_SRC_SRCA 32 +#define MI_ALU_SRC_SRCB 33 +#define MI_ALU_SRC_ACCU 49 +#define MI_ALU_SRC_ZF 50 +#define MI_ALU_SRC_CF 51 + /* * Commands used only by the command parser */ diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h index 3563ce970102..a3141b79d344 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h @@ -73,6 +73,11 @@ enum intel_gt_scratch_field { /* 8 bytes */ INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA = 256, + /* 6 * 8 bytes */ + INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR = 2048, + + /* 4 bytes */ + INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1 = 2096, }; #endif /* __INTEL_GT_TYPES_H__ */ diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 3e4f58f19362..46fca53dfbda 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -3653,6 +3653,36 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops, i915_wedged_get, i915_wedged_set, "%llu\n"); +static int +i915_perf_noa_delay_set(void *data, u64 val) +{ + struct drm_i915_private *i915 = data; + + /* This would lead to infinite waits as we're doing timestamp + * difference on the CS with only 32bits. + */ + if (val > ((1ul << 32) - 1) * RUNTIME_INFO(i915)->cs_timestamp_frequency_khz) + return -EINVAL; + + atomic64_set(&i915->perf.oa.noa_programming_delay, val); + return 0; +} + +static int +i915_perf_noa_delay_get(void *data, u64 *val) +{ + struct drm_i915_private *i915 = data; + + *val = atomic64_read(&i915->perf.oa.noa_programming_delay); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(i915_perf_noa_delay_fops, + i915_perf_noa_delay_get, + i915_perf_noa_delay_set, + "%llu\n"); + + #define DROP_UNBOUND BIT(0) #define DROP_BOUND BIT(1) #define DROP_RETIRE BIT(2) @@ -4418,6 +4448,7 @@ static const struct i915_debugfs_files { const char *name; const struct file_operations *fops; } i915_debugfs_files[] = { + {"i915_perf_noa_delay", &i915_perf_noa_delay_fops}, {"i915_wedged", &i915_wedged_fops}, {"i915_cache_sharing", &i915_cache_sharing_fops}, {"i915_gem_drop_caches", &i915_drop_caches_fops}, diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 0419dfd0dea3..b3c6dd72c7a1 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1834,6 +1834,14 @@ struct drm_i915_private { struct i915_oa_ops ops; const struct i915_oa_format *oa_formats; + + /** + * A batch buffer doing a wait on the GPU for the NOA + * logic to be reprogrammed. + */ + struct i915_vma *noa_wait; + + atomic64_t noa_programming_delay; } oa; } perf; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 3ef7d332c7c6..7ae2011cc98a 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -197,6 +197,7 @@ #include "gem/i915_gem_context.h" #include "gem/i915_gem_pm.h" +#include "gt/intel_gt.h" #include "gt/intel_lrc_reg.h" #include "i915_drv.h" @@ -429,7 +430,7 @@ static int alloc_oa_config_buffer(struct drm_i915_private *i915, MI_LOAD_REGISTER_IMM_MAX_REGS) * 4; config_length += oa_config->flex_regs_len * 8; } - config_length += 4; /* MI_BATCH_BUFFER_END */ + config_length += 12; /* MI_BATCH_BUFFER_START into noa_wait loop */ config_length = ALIGN(config_length, I915_GTT_PAGE_SIZE); bo = i915_gem_object_create_shmem(i915, config_length); @@ -446,7 +447,12 @@ static int alloc_oa_config_buffer(struct drm_i915_private *i915, cs = write_cs_mi_lri(cs, oa_config->b_counter_regs, oa_config->b_counter_regs_len); cs = write_cs_mi_lri(cs, oa_config->flex_regs, oa_config->flex_regs_len); - *cs++ = MI_BATCH_BUFFER_END; + + /* Jump into the NOA wait busy loop. */ + *cs++ = (INTEL_GEN(i915) < 8 ? + MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8); + *cs++ = i915_ggtt_offset(i915->perf.oa.noa_wait); + *cs++ = 0; i915_gem_object_flush_map(bo); i915_gem_object_unpin_map(bo); @@ -1467,6 +1473,7 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream) mutex_lock(&dev_priv->drm.struct_mutex); dev_priv->perf.oa.exclusive_stream = NULL; dev_priv->perf.oa.ops.disable_metric_set(dev_priv); + i915_vma_unpin_and_release(&dev_priv->perf.oa.noa_wait, 0); mutex_unlock(&dev_priv->drm.struct_mutex); free_oa_buffer(dev_priv); @@ -1653,6 +1660,201 @@ static int alloc_oa_buffer(struct drm_i915_private *dev_priv) return ret; } +static u32 *save_restore_register(struct drm_i915_private *i915, u32 *cs, + bool save, i915_reg_t reg, u32 offset, + u32 dword_count) +{ + uint32_t d; + + for (d = 0; d < dword_count; d++) { + if (save) { + *cs++ = INTEL_GEN(i915) >= 8 ? + MI_STORE_REGISTER_MEM_GEN8 : + MI_STORE_REGISTER_MEM; + } else { + *cs++ = INTEL_GEN(i915) >= 8 ? + MI_LOAD_REGISTER_MEM_GEN8 : + MI_LOAD_REGISTER_MEM; + } + *cs++ = i915_mmio_reg_offset(reg) + 4 * d; + *cs++ = intel_gt_scratch_offset(&i915->gt, offset) + 4 * d; + *cs++ = 0; + } + + return cs; +} + +static int alloc_noa_wait(struct drm_i915_private *i915) +{ + struct drm_i915_gem_object *bo; + struct i915_vma *vma; + const u64 delay_ticks = 0xffffffffffffffff - + DIV64_U64_ROUND_UP( + atomic64_read(&i915->perf.oa.noa_programming_delay) * + RUNTIME_INFO(i915)->cs_timestamp_frequency_khz, + 1000000ull); + u32 *batch, *ts0, *cs, *jump; + int ret, i; + enum { START_TS, NOW_TS, DELTA_TS, JUMP_PREDICATE, DELTA_TARGET, N_CS_GPR }; + + bo = i915_gem_object_create_internal(i915, 4096); + if (IS_ERR(bo)) { + DRM_ERROR("Failed to allocate NOA wait batchbuffer\n"); + return PTR_ERR(bo); + } + + /* + * We pin in GGTT because we jump into this buffer now because + * multiple OA config BOs will have a jump to this address and it + * needs to be fixed during the lifetime of the i915/perf stream. + */ + vma = i915_gem_object_ggtt_pin(bo, NULL, 0, 4096, 0); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto err_unref; + } + + batch = cs = i915_gem_object_pin_map(bo, I915_MAP_WB); + if (IS_ERR(batch)) { + ret = PTR_ERR(batch); + goto err_unpin; + } + + /* Save registers. */ + for (i = 0; i < N_CS_GPR; i++) { + cs = save_restore_register( + i915, cs, true /* save */, HSW_CS_GPR(i), + INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2); + } + cs = save_restore_register( + i915, cs, true /* save */, MI_PREDICATE_RESULT_1, + INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1); + + /* First timestamp snapshot location. */ + ts0 = cs; + + /* + * Initial snapshot of the timestamp register to implement the wait. + * We work with 32b values, so clear out the top 32b bits of the + * register because the ALU works 64bits. + */ + *cs++ = MI_LOAD_REGISTER_IMM(1); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(START_TS)) + 4; + *cs++ = 0; + *cs++ = MI_LOAD_REGISTER_REG | (3 - 2); + *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(RENDER_RING_BASE)); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(START_TS)); + + /* + * This is the location we're going to jump back into until the + * required amount of time has passed. + */ + jump = cs; + + /* + * Take another snapshot of the timestamp register. Take care to clear + * up the top 32bits of CS_GPR(1) as we're using it for other + * operations below. + */ + *cs++ = MI_LOAD_REGISTER_IMM(1); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(NOW_TS)) + 4; + *cs++ = 0; + *cs++ = MI_LOAD_REGISTER_REG | (3 - 2); + *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(RENDER_RING_BASE)); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(NOW_TS)); + + /* + * Do a diff between the 2 timestamps and store the result back into + * CS_GPR(1). + */ + *cs++ = MI_MATH(5); + *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCA, MI_ALU_SRC_REG(NOW_TS)); + *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCB, MI_ALU_SRC_REG(START_TS)); + *cs++ = MI_ALU_OP(MI_ALU_OP_SUB, 0, 0); + *cs++ = MI_ALU_OP(MI_ALU_OP_STORE, MI_ALU_SRC_REG(DELTA_TS), MI_ALU_SRC_ACCU); + *cs++ = MI_ALU_OP(MI_ALU_OP_STORE, MI_ALU_SRC_REG(JUMP_PREDICATE), MI_ALU_SRC_CF); + + /* + * Transfer the carry flag (set to 1 if ts1 < ts0, meaning the + * timestamp have rolled over the 32bits) into the predicate register + * to be used for the predicated jump. + */ + *cs++ = MI_LOAD_REGISTER_REG | (3 - 2); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(JUMP_PREDICATE)); + *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1); + + /* Restart from the beginning if we had timestamps roll over. */ + *cs++ = (INTEL_GEN(i915) < 8 ? + MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8) | + MI_BATCH_PREDICATE; + *cs++ = i915_ggtt_offset(vma) + (ts0 - batch) * 4; + *cs++ = 0; + + /* + * Now add the diff between to previous timestamps and add it to : + * (((1 * << 64) - 1) - delay_ns) + * + * When the Carry Flag contains 1 this means the elapsed time is + * longer than the expected delay, and we can exit the wait loop. + */ + *cs++ = MI_LOAD_REGISTER_IMM(2); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(DELTA_TARGET)); + *cs++ = lower_32_bits(delay_ticks); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(DELTA_TARGET)) + 4; + *cs++ = upper_32_bits(delay_ticks); + + *cs++ = MI_MATH(4); + *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCA, MI_ALU_SRC_REG(DELTA_TS)); + *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCB, MI_ALU_SRC_REG(DELTA_TARGET)); + *cs++ = MI_ALU_OP(MI_ALU_OP_ADD, 0, 0); + *cs++ = MI_ALU_OP(MI_ALU_OP_STOREINV, MI_ALU_SRC_REG(JUMP_PREDICATE), MI_ALU_SRC_CF); + + /* + * Transfer the result into the predicate register to be used for the + * predicated jump. + */ + *cs++ = MI_LOAD_REGISTER_REG | (3 - 2); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(JUMP_PREDICATE)); + *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1); + + /* Predicate the jump. */ + *cs++ = (INTEL_GEN(i915) < 8 ? + MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8) | + MI_BATCH_PREDICATE; + *cs++ = i915_ggtt_offset(vma) + (jump - batch) * 4; + *cs++ = 0; + + /* Restore registers. */ + for (i = 0; i < N_CS_GPR; i++) { + cs = save_restore_register( + i915, cs, false /* save */, HSW_CS_GPR(i), + INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2); + } + cs = save_restore_register( + i915, cs, false /* save */, MI_PREDICATE_RESULT_1, + INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1); + + /* And return to the ring. */ + *cs++ = MI_BATCH_BUFFER_END; + + GEM_BUG_ON((cs - batch) > (PAGE_SIZE / sizeof(*batch))); + + i915_gem_object_flush_map(bo); + i915_gem_object_unpin_map(bo); + + i915->perf.oa.noa_wait = vma; + + return 0; + +err_unpin: + __i915_vma_unpin(vma); + +err_unref: + i915_gem_object_put(bo); + + return ret; +} + static void config_oa_regs(struct drm_i915_private *dev_priv, const struct i915_oa_reg *regs, u32 n_regs) @@ -2205,6 +2407,12 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, goto err_config; } + ret = alloc_noa_wait(dev_priv); + if (ret) { + DRM_DEBUG("Unable to allocate NOA wait batch buffer\n"); + goto err_noa_wait_alloc; + } + /* PRM - observability performance counters: * * OACONTROL, performance counter enable, note: @@ -2257,6 +2465,13 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL); intel_runtime_pm_put(&dev_priv->runtime_pm, stream->wakeref); + mutex_lock(&dev_priv->drm.struct_mutex); + i915_vma_unpin_and_release(&dev_priv->perf.oa.noa_wait, 0); + mutex_unlock(&dev_priv->drm.struct_mutex); + +err_noa_wait_alloc: + put_oa_config(stream->oa_config); + err_config: if (stream->ctx) oa_put_render_ctx_id(stream); @@ -3641,6 +3856,9 @@ void i915_perf_init(struct drm_i915_private *dev_priv) mutex_init(&dev_priv->perf.metrics_lock); idr_init(&dev_priv->perf.metrics_idr); + atomic64_set(&dev_priv->perf.oa.noa_programming_delay, + 500 * 1000 /* 500us */); + dev_priv->perf.initialized = true; } } diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 95b9ca1fda2e..67cab234d322 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -567,7 +567,9 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) #define MI_PREDICATE_SRC0_UDW _MMIO(0x2400 + 4) #define MI_PREDICATE_SRC1 _MMIO(0x2408) #define MI_PREDICATE_SRC1_UDW _MMIO(0x2408 + 4) - +#define MI_PREDICATE_DATA _MMIO(0x2410) +#define MI_PREDICATE_RESULT _MMIO(0x2418) +#define MI_PREDICATE_RESULT_1 _MMIO(0x241c) #define MI_PREDICATE_RESULT_2 _MMIO(0x2214) #define LOWER_SLICE_ENABLED (1 << 0) #define LOWER_SLICE_DISABLED (0 << 0)

[v9,3/9] drm/i915/perf: implement active wait for noa configurations

Commit Message

Patch