[v12,07/11] drm/i915/perf: implement active wait for noa configurations

Message ID	20190830144726.18291-8-lionel.g.landwerlin@intel.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <SRS0=SCAt=W2=lists.freedesktop.org=intel-gfx-bounces@kernel.org> DMARC-Filter: OpenDMARC Filter v1.3.2 mail.kernel.org DEAD623426 From: Lionel Landwerlin <lionel.g.landwerlin@intel.com> To: intel-gfx@lists.freedesktop.org Date: Fri, 30 Aug 2019 17:47:22 +0300 Message-Id: <20190830144726.18291-8-lionel.g.landwerlin@intel.com> In-Reply-To: <20190830144726.18291-1-lionel.g.landwerlin@intel.com> References: <20190830144726.18291-1-lionel.g.landwerlin@intel.com> MIME-Version: 1.0 Subject: [Intel-gfx] [PATCH v12 07/11] drm/i915/perf: implement active wait for noa configurations Precedence: list Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>
Series	drm/i915: Vulkan performance query support \| expand [v12,00/11] drm/i915: Vulkan performance query support [v12,01/11] drm/i915: introduce a mechanism to extend execbuf2 [v12,02/11] drm/i915: add syncobj timeline support [v12,03/11] drm/i915/perf: drop list of streams [v12,04/11] drm/i915/perf: store the associated engine of a stream [v12,05/11] drm/i915/perf: introduce a versioning of the i915-perf uapi [v12,06/11] drm/i915/perf: allow for CS OA configs to be created lazily [v12,07/11] drm/i915/perf: implement active wait for noa configurations [v12,08/11] drm/i915/perf: execute OA configuration from command stream [v12,09/11] drm/i915: add a new perf configuration execbuf parameter [v12,10/11] drm/i915/perf: allow holding preemption on filtered ctx [v12,11/11] drm/i915: add support for perf configuration queries

diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h index a7f1377a54a2..f133f8dbacb1 100644 --- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h +++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h @@ -158,6 +158,7 @@ #define MI_BATCH_GTT (2<<6) /* aliased with (1<<7) on gen4 */ #define MI_BATCH_BUFFER_START_GEN8 MI_INSTR(0x31, 1) #define MI_BATCH_RESOURCE_STREAMER (1<<10) +#define MI_BATCH_PREDICATE (1 << 15) /* HSW+ on RCS only*/ /* * 3D instructions used by the kernel @@ -236,6 +237,29 @@ #define PIPE_CONTROL_DEPTH_CACHE_FLUSH (1<<0) #define PIPE_CONTROL_GLOBAL_GTT (1<<2) /* in addr dword */ +#define MI_MATH(x) MI_INSTR(0x1a, (x)-1) +#define MI_ALU_OP(op, src1, src2) (((op) << 20) | ((src1) << 10) | (src2)) +/* operands */ +#define MI_ALU_OP_NOOP 0 +#define MI_ALU_OP_LOAD 128 +#define MI_ALU_OP_LOADINV 1152 +#define MI_ALU_OP_LOAD0 129 +#define MI_ALU_OP_LOAD1 1153 +#define MI_ALU_OP_ADD 256 +#define MI_ALU_OP_SUB 257 +#define MI_ALU_OP_AND 258 +#define MI_ALU_OP_OR 259 +#define MI_ALU_OP_XOR 260 +#define MI_ALU_OP_STORE 384 +#define MI_ALU_OP_STOREINV 1408 +/* sources */ +#define MI_ALU_SRC_REG(x) (x) /* 0 -> 15 */ +#define MI_ALU_SRC_SRCA 32 +#define MI_ALU_SRC_SRCB 33 +#define MI_ALU_SRC_ACCU 49 +#define MI_ALU_SRC_ZF 50 +#define MI_ALU_SRC_CF 51 + /* * Commands used only by the command parser */ diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h index dc295c196d11..f752b6cf9ea1 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h @@ -97,6 +97,11 @@ enum intel_gt_scratch_field { /* 8 bytes */ INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA = 256, + /* 6 * 8 bytes */ + INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR = 2048, + + /* 4 bytes */ + INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1 = 2096, }; #endif /* __INTEL_GT_TYPES_H__ */ diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 5e81c4fc13ae..4234bf133903 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -3577,6 +3577,36 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops, i915_wedged_get, i915_wedged_set, "%llu\n"); +static int +i915_perf_noa_delay_set(void *data, u64 val) +{ + struct drm_i915_private *i915 = data; + + /* This would lead to infinite waits as we're doing timestamp + * difference on the CS with only 32bits. + */ + if (val > mul_u32_u32(U32_MAX, RUNTIME_INFO(i915)->cs_timestamp_frequency_khz)) + return -EINVAL; + + atomic64_set(&i915->perf.noa_programming_delay, val); + return 0; +} + +static int +i915_perf_noa_delay_get(void *data, u64 *val) +{ + struct drm_i915_private *i915 = data; + + *val = atomic64_read(&i915->perf.noa_programming_delay); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(i915_perf_noa_delay_fops, + i915_perf_noa_delay_get, + i915_perf_noa_delay_set, + "%llu\n"); + + #define DROP_UNBOUND BIT(0) #define DROP_BOUND BIT(1) #define DROP_RETIRE BIT(2) @@ -4353,6 +4383,7 @@ static const struct i915_debugfs_files { const char *name; const struct file_operations *fops; } i915_debugfs_files[] = { + {"i915_perf_noa_delay", &i915_perf_noa_delay_fops}, {"i915_wedged", &i915_wedged_fops}, {"i915_cache_sharing", &i915_cache_sharing_fops}, {"i915_gem_drop_caches", &i915_drop_caches_fops}, diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index c0ff6f0fd33e..20b6a7023097 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1223,6 +1223,12 @@ struct i915_perf_stream { */ u32 head; } oa_buffer; + + /** + * A batch buffer doing a wait on the GPU for the NOA logic to be + * reprogrammed. + */ + struct i915_vma *noa_wait; }; /** @@ -1714,6 +1720,8 @@ struct drm_i915_private { struct i915_oa_ops ops; const struct i915_oa_format *oa_formats; + + atomic64_t noa_programming_delay; } perf; /* Abstract the submission mechanism (legacy ringbuffer or execlists) away */ diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 08869660f1f2..d0764852b347 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -197,6 +197,7 @@ #include "gem/i915_gem_context.h" #include "gem/i915_gem_pm.h" +#include "gt/intel_gt.h" #include "gt/intel_lrc_reg.h" #include "i915_drv.h" @@ -408,6 +409,7 @@ static u32 *write_cs_mi_lri(u32 *cs, const struct i915_oa_reg *reg_data, u32 n_r } static struct i915_oa_config_bo* alloc_oa_config_buffer(struct drm_i915_private *i915, + struct i915_vma *noa_wait, struct i915_oa_config *oa_config) { struct i915_oa_config_bo *oa_bo; @@ -436,7 +438,7 @@ static struct i915_oa_config_bo* alloc_oa_config_buffer(struct drm_i915_private MI_LOAD_REGISTER_IMM_MAX_REGS) * 4; config_length += oa_config->flex_regs_len * 8; } - config_length += 4; /* MI_BATCH_BUFFER_END */ + config_length += 12; /* MI_BATCH_BUFFER_START into noa_wait loop */ config_length = ALIGN(config_length, I915_GTT_PAGE_SIZE); oa_bo->bo = i915_gem_object_create_shmem(i915, config_length); @@ -455,7 +457,12 @@ static struct i915_oa_config_bo* alloc_oa_config_buffer(struct drm_i915_private cs = write_cs_mi_lri(cs, oa_config->b_counter_regs, oa_config->b_counter_regs_len); cs = write_cs_mi_lri(cs, oa_config->flex_regs, oa_config->flex_regs_len); - *cs++ = MI_BATCH_BUFFER_END; + + /* Jump into the NOA wait busy loop. */ + *cs++ = (INTEL_GEN(i915) < 8 ? + MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8); + *cs++ = i915_ggtt_offset(noa_wait); + *cs++ = 0; i915_gem_object_flush_map(oa_bo->bo); i915_gem_object_unpin_map(oa_bo->bo); @@ -550,7 +557,9 @@ int i915_perf_get_oa_config_and_bo(struct i915_perf_stream *stream, mutex_unlock(&stream->config_mutex); if (!oa_bo) { - oa_bo = alloc_oa_config_buffer(i915, oa_config); + oa_bo = alloc_oa_config_buffer(i915, + stream->noa_wait, + oa_config); if (IS_ERR(oa_bo)) { err = PTR_ERR(oa_bo); goto err; @@ -1524,6 +1533,16 @@ free_oa_buffer(struct i915_perf_stream *stream) stream->oa_buffer.vaddr = NULL; } +static void +free_noa_wait(struct i915_perf_stream *stream) +{ + struct drm_i915_private *i915 = stream->dev_priv; + + mutex_lock(&i915->drm.struct_mutex); + i915_vma_unpin_and_release(&stream->noa_wait, 0); + mutex_unlock(&i915->drm.struct_mutex); +} + static void free_oa_configs(struct i915_perf_stream *stream) { @@ -1552,6 +1571,7 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream) mutex_unlock(&dev_priv->drm.struct_mutex); free_oa_buffer(stream); + free_noa_wait(stream); intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL); intel_runtime_pm_put(&dev_priv->runtime_pm, stream->wakeref); @@ -1731,6 +1751,202 @@ static int alloc_oa_buffer(struct i915_perf_stream *stream) return ret; } +static u32 *save_restore_register(struct drm_i915_private *i915, u32 *cs, + bool save, i915_reg_t reg, u32 offset, + u32 dword_count) +{ + uint32_t d; + + for (d = 0; d < dword_count; d++) { + if (save) { + *cs++ = INTEL_GEN(i915) >= 8 ? + MI_STORE_REGISTER_MEM_GEN8 : + MI_STORE_REGISTER_MEM; + } else { + *cs++ = INTEL_GEN(i915) >= 8 ? + MI_LOAD_REGISTER_MEM_GEN8 : + MI_LOAD_REGISTER_MEM; + } + *cs++ = i915_mmio_reg_offset(reg) + 4 * d; + *cs++ = intel_gt_scratch_offset(&i915->gt, offset) + 4 * d; + *cs++ = 0; + } + + return cs; +} + +static int alloc_noa_wait(struct i915_perf_stream *stream) +{ + struct drm_i915_private *i915 = stream->dev_priv; + struct drm_i915_gem_object *bo; + struct i915_vma *vma; + const u64 delay_ticks = 0xffffffffffffffff - + DIV64_U64_ROUND_UP( + atomic64_read(&i915->perf.noa_programming_delay) * + RUNTIME_INFO(i915)->cs_timestamp_frequency_khz, + 1000000ull); + u32 *batch, *ts0, *cs, *jump; + int ret, i; + enum { START_TS, NOW_TS, DELTA_TS, JUMP_PREDICATE, DELTA_TARGET, N_CS_GPR }; + + bo = i915_gem_object_create_internal(i915, 4096); + if (IS_ERR(bo)) { + DRM_ERROR("Failed to allocate NOA wait batchbuffer\n"); + return PTR_ERR(bo); + } + + /* + * We pin in GGTT because we jump into this buffer now because + * multiple OA config BOs will have a jump to this address and it + * needs to be fixed during the lifetime of the i915/perf stream. + */ + vma = i915_gem_object_ggtt_pin(bo, NULL, 0, 4096, 0); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto err_unref; + } + + batch = cs = i915_gem_object_pin_map(bo, I915_MAP_WB); + if (IS_ERR(batch)) { + ret = PTR_ERR(batch); + goto err_unpin; + } + + /* Save registers. */ + for (i = 0; i < N_CS_GPR; i++) { + cs = save_restore_register( + i915, cs, true /* save */, HSW_CS_GPR(i), + INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2); + } + cs = save_restore_register( + i915, cs, true /* save */, MI_PREDICATE_RESULT_1, + INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1); + + /* First timestamp snapshot location. */ + ts0 = cs; + + /* + * Initial snapshot of the timestamp register to implement the wait. + * We work with 32b values, so clear out the top 32b bits of the + * register because the ALU works 64bits. + */ + *cs++ = MI_LOAD_REGISTER_IMM(1); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(START_TS)) + 4; + *cs++ = 0; + *cs++ = MI_LOAD_REGISTER_REG | (3 - 2); + *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(RENDER_RING_BASE)); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(START_TS)); + + /* + * This is the location we're going to jump back into until the + * required amount of time has passed. + */ + jump = cs; + + /* + * Take another snapshot of the timestamp register. Take care to clear + * up the top 32bits of CS_GPR(1) as we're using it for other + * operations below. + */ + *cs++ = MI_LOAD_REGISTER_IMM(1); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(NOW_TS)) + 4; + *cs++ = 0; + *cs++ = MI_LOAD_REGISTER_REG | (3 - 2); + *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(RENDER_RING_BASE)); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(NOW_TS)); + + /* + * Do a diff between the 2 timestamps and store the result back into + * CS_GPR(1). + */ + *cs++ = MI_MATH(5); + *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCA, MI_ALU_SRC_REG(NOW_TS)); + *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCB, MI_ALU_SRC_REG(START_TS)); + *cs++ = MI_ALU_OP(MI_ALU_OP_SUB, 0, 0); + *cs++ = MI_ALU_OP(MI_ALU_OP_STORE, MI_ALU_SRC_REG(DELTA_TS), MI_ALU_SRC_ACCU); + *cs++ = MI_ALU_OP(MI_ALU_OP_STORE, MI_ALU_SRC_REG(JUMP_PREDICATE), MI_ALU_SRC_CF); + + /* + * Transfer the carry flag (set to 1 if ts1 < ts0, meaning the + * timestamp have rolled over the 32bits) into the predicate register + * to be used for the predicated jump. + */ + *cs++ = MI_LOAD_REGISTER_REG | (3 - 2); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(JUMP_PREDICATE)); + *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1); + + /* Restart from the beginning if we had timestamps roll over. */ + *cs++ = (INTEL_GEN(i915) < 8 ? + MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8) | + MI_BATCH_PREDICATE; + *cs++ = i915_ggtt_offset(vma) + (ts0 - batch) * 4; + *cs++ = 0; + + /* + * Now add the diff between to previous timestamps and add it to : + * (((1 * << 64) - 1) - delay_ns) + * + * When the Carry Flag contains 1 this means the elapsed time is + * longer than the expected delay, and we can exit the wait loop. + */ + *cs++ = MI_LOAD_REGISTER_IMM(2); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(DELTA_TARGET)); + *cs++ = lower_32_bits(delay_ticks); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(DELTA_TARGET)) + 4; + *cs++ = upper_32_bits(delay_ticks); + + *cs++ = MI_MATH(4); + *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCA, MI_ALU_SRC_REG(DELTA_TS)); + *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCB, MI_ALU_SRC_REG(DELTA_TARGET)); + *cs++ = MI_ALU_OP(MI_ALU_OP_ADD, 0, 0); + *cs++ = MI_ALU_OP(MI_ALU_OP_STOREINV, MI_ALU_SRC_REG(JUMP_PREDICATE), MI_ALU_SRC_CF); + + /* + * Transfer the result into the predicate register to be used for the + * predicated jump. + */ + *cs++ = MI_LOAD_REGISTER_REG | (3 - 2); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(JUMP_PREDICATE)); + *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1); + + /* Predicate the jump. */ + *cs++ = (INTEL_GEN(i915) < 8 ? + MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8) | + MI_BATCH_PREDICATE; + *cs++ = i915_ggtt_offset(vma) + (jump - batch) * 4; + *cs++ = 0; + + /* Restore registers. */ + for (i = 0; i < N_CS_GPR; i++) { + cs = save_restore_register( + i915, cs, false /* save */, HSW_CS_GPR(i), + INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2); + } + cs = save_restore_register( + i915, cs, false /* save */, MI_PREDICATE_RESULT_1, + INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1); + + /* And return to the ring. */ + *cs++ = MI_BATCH_BUFFER_END; + + GEM_BUG_ON((cs - batch) > (PAGE_SIZE / sizeof(*batch))); + + i915_gem_object_flush_map(bo); + i915_gem_object_unpin_map(bo); + + stream->noa_wait = vma; + + return 0; + +err_unpin: + __i915_vma_unpin(vma); + +err_unref: + i915_gem_object_put(bo); + + return ret; +} + static void config_oa_regs(struct drm_i915_private *dev_priv, const struct i915_oa_reg *regs, u32 n_regs) @@ -2403,6 +2619,12 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, } } + ret = alloc_noa_wait(stream); + if (ret) { + DRM_DEBUG("Unable to allocate NOA wait batch buffer\n"); + goto err_noa_wait_alloc; + } + ret = i915_perf_get_oa_config(dev_priv, props->metrics_set, &stream->oa_config); if (ret) { @@ -2471,6 +2693,9 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, free_oa_configs(stream); err_config: + free_noa_wait(stream); + +err_noa_wait_alloc: if (stream->ctx) oa_put_render_ctx_id(stream); @@ -3855,6 +4080,9 @@ void i915_perf_init(struct drm_i915_private *dev_priv) ratelimit_set_flags(&dev_priv->perf.spurious_report_rs, RATELIMIT_MSG_ON_RELEASE); + atomic64_set(&dev_priv->perf.noa_programming_delay, + 500 * 1000 /* 500us */); + dev_priv->perf.initialized = true; } } diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 3cfdab18c0cf..ddb73678b2b7 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -545,7 +545,9 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) #define MI_PREDICATE_SRC0_UDW _MMIO(0x2400 + 4) #define MI_PREDICATE_SRC1 _MMIO(0x2408) #define MI_PREDICATE_SRC1_UDW _MMIO(0x2408 + 4) - +#define MI_PREDICATE_DATA _MMIO(0x2410) +#define MI_PREDICATE_RESULT _MMIO(0x2418) +#define MI_PREDICATE_RESULT_1 _MMIO(0x241c) #define MI_PREDICATE_RESULT_2 _MMIO(0x2214) #define LOWER_SLICE_ENABLED (1 << 0) #define LOWER_SLICE_DISABLED (0 << 0)

[v12,07/11] drm/i915/perf: implement active wait for noa configurations

Commit Message

Patch