[v5,04/10] drm/i915/perf: implement active wait for noa configurations

Message ID	20190627080045.8814-5-lionel.g.landwerlin@intel.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <intel-gfx-bounces@lists.freedesktop.org> From: Lionel Landwerlin <lionel.g.landwerlin@intel.com> To: intel-gfx@lists.freedesktop.org Date: Thu, 27 Jun 2019 11:00:39 +0300 Message-Id: <20190627080045.8814-5-lionel.g.landwerlin@intel.com> In-Reply-To: <20190627080045.8814-1-lionel.g.landwerlin@intel.com> References: <20190627080045.8814-1-lionel.g.landwerlin@intel.com> MIME-Version: 1.0 Subject: [Intel-gfx] [PATCH v5 04/10] drm/i915/perf: implement active wait for noa configurations Precedence: list Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>
Series	drm/i915: Vulkan performance query support \| expand [v5,00/10] drm/i915: Vulkan performance query support [v5,01/10] drm/i915/perf: add missing delay for OA muxes configuration [v5,02/10] drm/i915/perf: introduce a versioning of the i915-perf uapi [v5,03/10] drm/i915/perf: allow for CS OA configs to be created lazily [v5,04/10] drm/i915/perf: implement active wait for noa configurations [v5,05/10] drm/i915: introduce a mechanism to extend execbuf2 [v5,06/10] drm/i915: add syncobj timeline support [v5,07/10] drm/i915: add a new perf configuration execbuf parameter [v5,08/10] drm/i915/perf: allow holding preemption on filtered ctx [v5,09/10] drm/i915/perf: execute OA configuration from command stream [v5,10/10] drm/i915: add support for perf configuration queries

diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h index e7eff9db343e..4a66af38c87b 100644 --- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h +++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h @@ -151,6 +151,7 @@ #define MI_BATCH_GTT (2<<6) /* aliased with (1<<7) on gen4 */ #define MI_BATCH_BUFFER_START_GEN8 MI_INSTR(0x31, 1) #define MI_BATCH_RESOURCE_STREAMER (1<<10) +#define MI_BATCH_PREDICATE (1 << 15) /* HSW+ on RCS only */ /* * 3D instructions used by the kernel @@ -226,6 +227,29 @@ #define PIPE_CONTROL_DEPTH_CACHE_FLUSH (1<<0) #define PIPE_CONTROL_GLOBAL_GTT (1<<2) /* in addr dword */ +#define MI_MATH(x) MI_INSTR(0x1a, (x)-1) +#define MI_ALU_OP(op, src1, src2) (((op) << 20) | ((src1) << 10) | (src2)) +/* ALU opcodes (the op argument of MI_ALU_OP) */ +#define MI_ALU_OP_NOOP 0 +#define MI_ALU_OP_LOAD 128 +#define MI_ALU_OP_LOADINV 1152 +#define MI_ALU_OP_LOAD0 129 +#define MI_ALU_OP_LOAD1 1153 +#define MI_ALU_OP_ADD 256 +#define MI_ALU_OP_SUB 257 +#define MI_ALU_OP_AND 258 +#define MI_ALU_OP_OR 259 +#define MI_ALU_OP_XOR 260 +#define MI_ALU_OP_STORE 384 +#define MI_ALU_OP_STOREINV 1408 +/* values for the src1/src2 arguments of MI_ALU_OP */ +#define MI_ALU_SRC_REG(x) (x) /* 0 -> 15 */ +#define MI_ALU_SRC_SRCA 32 +#define MI_ALU_SRC_SRCB 33 +#define MI_ALU_SRC_ACCU 49 +#define MI_ALU_SRC_ZF 50 +#define MI_ALU_SRC_CF 51 + /* * Commands used only by the command parser */ diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index eeecdad0e3ca..6b49fda145e7 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -3646,6 +3646,30 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops, i915_wedged_get, i915_wedged_set, "%llu\n"); +static int +i915_perf_noa_delay_set(void *data, u64 val) +{ + struct drm_i915_private *i915 = data; + + atomic64_set(&i915->perf.oa.noa_programming_delay, val); + return 0; +} + +static int +i915_perf_noa_delay_get(void *data, u64 *val) +{ + struct drm_i915_private *i915 = data; + + *val = 
atomic64_read(&i915->perf.oa.noa_programming_delay); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(i915_perf_noa_delay_fops, + i915_perf_noa_delay_get, + i915_perf_noa_delay_set, + "%llu\n"); + + #define DROP_UNBOUND BIT(0) #define DROP_BOUND BIT(1) #define DROP_RETIRE BIT(2) @@ -4411,6 +4435,7 @@ static const struct i915_debugfs_files { const char *name; const struct file_operations *fops; } i915_debugfs_files[] = { + {"i915_perf_noa_delay", &i915_perf_noa_delay_fops}, {"i915_wedged", &i915_wedged_fops}, {"i915_cache_sharing", &i915_cache_sharing_fops}, {"i915_gem_drop_caches", &i915_drop_caches_fops}, diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 333cc3c7690c..0323c8182a67 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1835,6 +1835,14 @@ struct drm_i915_private { struct i915_oa_ops ops; const struct i915_oa_format *oa_formats; + + /** + * A batch buffer doing a wait on the GPU for the NOA + * logic to be reprogrammed. 
+ */ + struct i915_vma *noa_wait; + + atomic64_t noa_programming_delay; /* NOA programming delay in ns, consumed by alloc_noa_wait() */ } oa; } perf; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 2f1dc9be3bfb..b2f5ba87921c 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -410,6 +410,8 @@ static int alloc_oa_config_buffer(struct drm_i915_private *i915, size_t config_length = 0; u32 *cs; + lockdep_assert_held(&i915->drm.struct_mutex); + if (oa_config->mux_regs_len > 0) { config_length += DIV_ROUND_UP(oa_config->mux_regs_len, MI_LOAD_REGISTER_IMM_MAX_REGS) * 4; @@ -425,7 +427,7 @@ static int alloc_oa_config_buffer(struct drm_i915_private *i915, MI_LOAD_REGISTER_IMM_MAX_REGS) * 4; config_length += oa_config->flex_regs_len * 8; } - config_length += 4; /* MI_BATCH_BUFFER_END */ + config_length += 12; /* 3 dwords: MI_BATCH_BUFFER_START into noa_wait loop */ config_length = ALIGN(config_length, I915_GTT_PAGE_SIZE); bo = i915_gem_object_create_shmem(i915, config_length); @@ -442,7 +444,12 @@ static int alloc_oa_config_buffer(struct drm_i915_private *i915, cs = write_cs_mi_lri(cs, oa_config->b_counter_regs, oa_config->b_counter_regs_len); cs = write_cs_mi_lri(cs, oa_config->flex_regs, oa_config->flex_regs_len); - *cs++ = MI_BATCH_BUFFER_END; + + /* Jump into the NOA wait busy loop. */ + *cs++ = (INTEL_GEN(i915) < 8 ? 
+ MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8); + *cs++ = i915->perf.oa.noa_wait->node.start; + *cs++ = 0; i915_gem_object_flush_map(bo); i915_gem_object_unpin_map(bo); @@ -1465,6 +1472,7 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream) mutex_lock(&dev_priv->drm.struct_mutex); dev_priv->perf.oa.exclusive_stream = NULL; dev_priv->perf.oa.ops.disable_metric_set(dev_priv); + i915_vma_unpin_and_release(&dev_priv->perf.oa.noa_wait, 0); mutex_unlock(&dev_priv->drm.struct_mutex); free_oa_buffer(dev_priv); @@ -1651,6 +1659,156 @@ static int alloc_oa_buffer(struct drm_i915_private *dev_priv) return ret; } +static int alloc_noa_wait(struct drm_i915_private *i915) +{ + struct drm_i915_gem_object *bo; + struct i915_vma *vma; + u64 delay_ns = atomic64_read(&i915->perf.oa.noa_programming_delay), delay_ticks; + u32 *batch, *cs, *jump; + int ret; + + bo = i915_gem_object_create_shmem(i915, 4096); + if (IS_ERR(bo)) { + DRM_ERROR("Failed to allocate NOA wait batchbuffer\n"); + return PTR_ERR(bo); + } + + /* + * We pin in GGTT because multiple OA config BOs will have a jump to + * this address, so it needs to stay fixed during the lifetime of the + * i915/perf stream. + */ + vma = i915_gem_object_ggtt_pin(bo, NULL, 0, 4096, 0); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto err_unref; + } + + batch = cs = i915_gem_object_pin_map(bo, I915_MAP_WB); + if (IS_ERR(batch)) { + ret = PTR_ERR(batch); + goto err_unpin; + } + + /* + * Initial snapshot of the timestamp register to implement the wait. + * We work with 32bit values, so clear out the top 32bits of the + * register because the ALU works on 64bit values. 
+ */ + *cs++ = MI_LOAD_REGISTER_IMM(1); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(0)) + 4; + *cs++ = 0; + *cs++ = MI_LOAD_REGISTER_REG | (3 - 2); + *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(RENDER_RING_BASE)); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(0)); + + /* + * This is the location we're going to jump back into until the + * required amount of time has passed. + */ + jump = cs; + + /* + * Take another snapshot of the timestamp register. Take care to clear + * up the top 32bits of CS_GPR(1) as we're using it for other + * operations below. + */ + *cs++ = MI_LOAD_REGISTER_IMM(1); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(1)) + 4; + *cs++ = 0; + *cs++ = MI_LOAD_REGISTER_REG | (3 - 2); + *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(RENDER_RING_BASE)); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(1)); + + /* + * Do a diff between the 2 timestamps and store the result into + * CS_GPR(2), with the carry flag going to CS_GPR(3). + */ + *cs++ = MI_MATH(5); + *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCA, MI_ALU_SRC_REG(1)); + *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCB, MI_ALU_SRC_REG(0)); + *cs++ = MI_ALU_OP(MI_ALU_OP_SUB, 0, 0); + *cs++ = MI_ALU_OP(MI_ALU_OP_STORE, MI_ALU_SRC_REG(2), MI_ALU_SRC_ACCU); + *cs++ = MI_ALU_OP(MI_ALU_OP_STORE, MI_ALU_SRC_REG(3), MI_ALU_SRC_CF); + + /* + * Transfer the carry flag (set to 1 if ts1 < ts0, meaning the + * timestamps have rolled over 32bits) into the predicate register + * to be used for the predicated jump. + */ + *cs++ = MI_LOAD_REGISTER_REG | (3 - 2); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(3)); + *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1); + + /* Restart from the beginning if we had timestamps roll over. */ + *cs++ = (INTEL_GEN(i915) < 8 ? 
+ MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8) | + MI_BATCH_PREDICATE; + *cs++ = vma->node.start; + *cs++ = 0; + + /* + * Now take the diff between the two previous timestamps and add it to: + * (((1 << 64) - 1) - delay in timestamp ticks) + * + * When the Carry Flag contains 1 this means the elapsed time is + * longer than the expected delay, and we can exit the wait loop. + */ + delay_ticks = 0xffffffffffffffff - + DIV64_U64_ROUND_UP(delay_ns * + RUNTIME_INFO(i915)->cs_timestamp_frequency_khz, + 1000000ull); + *cs++ = MI_LOAD_REGISTER_IMM(2); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(4)); + *cs++ = lower_32_bits(delay_ticks); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(4)) + 4; + *cs++ = upper_32_bits(delay_ticks); + + *cs++ = MI_MATH(4); + *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCA, MI_ALU_SRC_REG(2)); + *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCB, MI_ALU_SRC_REG(4)); + *cs++ = MI_ALU_OP(MI_ALU_OP_ADD, 0, 0); + *cs++ = MI_ALU_OP(MI_ALU_OP_STOREINV, MI_ALU_SRC_REG(5), MI_ALU_SRC_CF); + + /* + * Transfer the result (CS_GPR(5)) into the predicate register to be + * used for the predicated jump. + */ + *cs++ = MI_LOAD_REGISTER_REG | (3 - 2); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(5)); + *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1); + + /* Predicated jump back to the `jump` location above. */ + *cs++ = (INTEL_GEN(i915) < 8 ? + MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8) | + MI_BATCH_PREDICATE; + *cs++ = vma->node.start + (jump - batch) * 4; + *cs++ = 0; + + /* Clear the predicate register */ + *cs++ = MI_LOAD_REGISTER_IMM(1); + *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1); + *cs++ = 0; + + /* And return to the ring. 
*/ + *cs++ = MI_BATCH_BUFFER_END; + + i915_gem_object_flush_map(bo); + i915_gem_object_unpin_map(bo); + + i915->perf.oa.noa_wait = vma; + + return 0; + +err_unpin: + __i915_vma_unpin(vma); + +err_unref: + i915_gem_object_put(bo); + + return ret; +} + static void config_oa_regs(struct drm_i915_private *dev_priv, const struct i915_oa_reg *regs, u32 n_regs) @@ -2219,6 +2377,12 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, goto err_config; } + ret = alloc_noa_wait(dev_priv); + if (ret) { + DRM_DEBUG("Unable to allocate NOA wait batch buffer\n"); + goto err_noa_wait_alloc; + } + /* PRM - observability performance counters: * * OACONTROL, performance counter enable, note: @@ -2271,6 +2435,13 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL); intel_runtime_pm_put(&dev_priv->runtime_pm, stream->wakeref); + mutex_lock(&dev_priv->drm.struct_mutex); + i915_vma_unpin_and_release(&dev_priv->perf.oa.noa_wait, 0); + mutex_unlock(&dev_priv->drm.struct_mutex); + +err_noa_wait_alloc: + i915_oa_config_put(stream->oa_config); + err_config: if (stream->ctx) oa_put_render_ctx_id(stream); @@ -3642,6 +3813,9 @@ void i915_perf_init(struct drm_i915_private *dev_priv) mutex_init(&dev_priv->perf.metrics_lock); idr_init(&dev_priv->perf.metrics_idr); + atomic64_set(&dev_priv->perf.oa.noa_programming_delay, + 500 * 1000 /* 500us */); + dev_priv->perf.initialized = true; } } diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 7e6009cefb18..e12b2fccef70 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -567,7 +567,9 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) #define MI_PREDICATE_SRC0_UDW _MMIO(0x2400 + 4) #define MI_PREDICATE_SRC1 _MMIO(0x2408) #define MI_PREDICATE_SRC1_UDW _MMIO(0x2408 + 4) - +#define MI_PREDICATE_DATA _MMIO(0x2410) +#define MI_PREDICATE_RESULT _MMIO(0x2418) +#define MI_PREDICATE_RESULT_1 
_MMIO(0x241c) #define MI_PREDICATE_RESULT_2 _MMIO(0x2214) #define LOWER_SLICE_ENABLED (1 << 0) #define LOWER_SLICE_DISABLED (0 << 0)

[v5,04/10] drm/i915/perf: implement active wait for noa configurations

Commit Message

Comments

Patch