@@ -1749,6 +1749,9 @@ struct i915_perf_stream {
/* Whether the OA unit is in use */
bool using_oa;
+ /* CLOCK_MONOTONIC ts of the last forwarded sample, used to enforce
+ * monotonicity of the sample timestamps
+ */
+ u64 last_sample_ts;
+
/* Enables the collection of HW samples, either in response to
* I915_PERF_IOCTL_ENABLE or implicitly called when stream is
* opened without I915_PERF_FLAG_DISABLED.
@@ -2144,6 +2147,14 @@ struct drm_i915_private {
struct i915_perf_stream *ring_stream[I915_NUM_ENGINES];
wait_queue_head_t poll_wq[I915_NUM_ENGINES];
+ /* Timekeeping Info */
+ u64 clk_mono; /* last monotonic clk value */
+ u64 gpu_time; /* last gpu time value */
+ s64 clk_offset; /* offset of gpu time from clk mono (gpu_time - clk_mono) */
+ u32 timestamp_frequency;
+ u32 resync_period; /* in msecs */
+ struct delayed_work clk_sync_work;
+
struct {
u32 specific_ctx_id;
@@ -2152,7 +2163,6 @@ struct drm_i915_private {
bool periodic;
int period_exponent;
- int timestamp_frequency;
int tail_margin;
@@ -61,6 +61,12 @@
#define POLL_FREQUENCY 200
#define POLL_PERIOD (NSEC_PER_SEC / POLL_FREQUENCY)
+/* Max period for clock synchronization, beyond which the sync work is no
+ * longer rescheduled. 25 seconds is seen to give the best results.
+ */
+#define MAX_CLK_SYNC_PERIOD (25 * MSEC_PER_SEC)
+#define INIT_CLK_SYNC_PERIOD (20) /* in msecs */
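+/* The sync work doubles resync_period on each run (see
+ * i915_perf_clock_sync_work()), so resyncs happen roughly 20, 40, 80, ...,
+ * 20480 msecs apart; the next doubling exceeds MAX_CLK_SYNC_PERIOD, giving
+ * about eleven resyncs within the first ~41 seconds of a stream.
+ */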
+
static u32 i915_perf_stream_paranoid = true;
/* The maximum exponent the hardware accepts is 63 (essentially it selects one
@@ -93,7 +99,8 @@ struct sample_data {
u32 ctx_id;
u32 pid;
u32 tag;
- u64 ts;
+ u64 gpu_ts;
+ u64 clk_mono;
const u8 *report;
};
@@ -142,6 +149,7 @@ static struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = {
#define SAMPLE_PID (1<<3)
#define SAMPLE_TAG (1<<4)
#define SAMPLE_TS (1<<5)
+#define SAMPLE_CLK_MONO (1<<6)
struct perf_open_properties {
u32 sample_flags;
@@ -232,7 +240,7 @@ static int insert_perf_entry(struct drm_i915_private *dev_priv,
if (stream->sample_flags & SAMPLE_OA_REPORT)
entry_size += dev_priv->perf.oa.oa_buffer.format_size;
- else if (sample_flags & SAMPLE_TS) {
+ else if (sample_flags & (SAMPLE_TS|SAMPLE_CLK_MONO)) {
/*
* XXX: Since TS data can anyway be derived from the OA report,
* no need to capture it for RCS engine, if capture oa data is
@@ -501,7 +509,7 @@ static void i915_ring_stream_cs_hook(struct i915_perf_stream *stream,
ret = i915_ring_stream_capture_oa(req, entry->oa_offset);
if (ret)
goto err_unref;
- } else if (sample_flags & SAMPLE_TS) {
+ } else if (sample_flags & (SAMPLE_TS|SAMPLE_CLK_MONO)) {
/*
* XXX: Since TS data can anyway be derived from the OA report,
* no need to capture it for RCS engine, if capture oa data is
@@ -758,7 +766,13 @@ static int append_sample(struct i915_perf_stream *stream,
}
if (sample_flags & SAMPLE_TS) {
- if (copy_to_user(buf, &data->ts, I915_PERF_TS_SAMPLE_SIZE))
+ if (copy_to_user(buf, &data->gpu_ts, I915_PERF_TS_SAMPLE_SIZE))
+ return -EFAULT;
+ buf += I915_PERF_TS_SAMPLE_SIZE;
+ }
+
+ if (sample_flags & SAMPLE_CLK_MONO) {
+ if (copy_to_user(buf, &data->clk_mono, I915_PERF_TS_SAMPLE_SIZE))
return -EFAULT;
buf += I915_PERF_TS_SAMPLE_SIZE;
}
@@ -775,6 +789,40 @@ static int append_sample(struct i915_perf_stream *stream,
return 0;
}
+static u64 get_current_gpu_ts(struct drm_i915_private *dev_priv)
+{
+ return ((u64)I915_READ(GT_TIMESTAMP_COUNT_UDW) << 32) |
+ I915_READ(GT_TIMESTAMP_COUNT);
+}
+
+static u64 get_clk_mono_from_gpu_ts(struct i915_perf_stream *stream,
+ u64 gpu_ts)
+{
+ struct drm_i915_private *dev_priv = stream->dev_priv;
+ u64 remainder, ts_interval = NSEC_PER_SEC;
+ u32 gpu_freq = dev_priv->perf.timestamp_frequency;
+ u64 gpu_time, clk_mono;
+
+ /* Compute gpu_ts * (NSEC_PER_SEC / gpu_freq) as separate quotient and
+ * remainder parts, rather than gpu_ts * NSEC_PER_SEC / gpu_freq, which
+ * could overflow 64 bits.
+ */
+ remainder = do_div(ts_interval, gpu_freq);
+ remainder *= gpu_ts;
+ do_div(remainder, gpu_freq);
+ gpu_time = (ts_interval * gpu_ts) + remainder;
+
+ clk_mono = gpu_time - dev_priv->perf.clk_offset;
+
+ /* Ensure monotonicity by clamping the derived time if it would
+ * otherwise go backwards, which can happen after a clock re-sync
+ * when the gpu clock has been running fast.
+ * FIXME: Any other mechanism to ensure monotonicity?
+ */
+ if (clk_mono < stream->last_sample_ts)
+ clk_mono = stream->last_sample_ts;
+
+ stream->last_sample_ts = clk_mono;
+ return clk_mono;
+}
+
static u64 get_gpu_ts_from_oa_report(struct drm_i915_private *dev_priv,
const u8 *report)
{
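For reference, the gpu_ts -> ns arithmetic above can be exercised in
isolation. Below is a hypothetical userspace mirror of the conversion in
get_clk_mono_from_gpu_ts() (12.5 MHz is the non-gen9 value from the
GT_CS_TIMESTAMP_FREQUENCY() macro added later in this patch); it is an
illustrative sketch, not part of the patch:

    #include <stdint.h>

    /* Hypothetical mirror of the kernel's overflow-safe conversion:
     * gpu_ts * (NSEC_PER_SEC / gpu_freq) applied as separate quotient and
     * remainder parts, so no intermediate product overflows 64 bits.
     */
    static uint64_t gpu_ts_to_ns(uint64_t gpu_ts, uint32_t gpu_freq)
    {
        uint64_t quot = 1000000000ULL / gpu_freq; /* whole ns per tick */
        uint64_t rem = 1000000000ULL % gpu_freq;  /* fractional ns part */

        return gpu_ts * quot + (gpu_ts * rem) / gpu_freq;
    }

    /* e.g. gpu_ts_to_ns(12500000, 12500000) == 1000000000 (one second) */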
@@ -831,7 +879,13 @@ static int append_oa_buffer_sample(struct i915_perf_stream *stream,
/* Derive timestamp from OA report */
if (sample_flags & SAMPLE_TS)
- data.ts = get_gpu_ts_from_oa_report(dev_priv, report);
+ data.gpu_ts = get_gpu_ts_from_oa_report(dev_priv, report);
+
+ if (sample_flags & SAMPLE_CLK_MONO) {
+ u64 gpu_ts = get_gpu_ts_from_oa_report(dev_priv, report);
+
+ data.clk_mono = get_clk_mono_from_gpu_ts(stream, gpu_ts);
+ }
if (sample_flags & SAMPLE_OA_REPORT)
data.report = report;
@@ -1261,7 +1315,7 @@ static int append_one_cs_sample(struct i915_perf_stream *stream,
if (ret)
return ret;
- if (sample_flags & SAMPLE_TS)
+ if (sample_flags & (SAMPLE_TS|SAMPLE_CLK_MONO))
gpu_ts = get_gpu_ts_from_oa_report(dev_priv, report);
}
@@ -1283,7 +1337,7 @@ static int append_one_cs_sample(struct i915_perf_stream *stream,
dev_priv->perf.last_tag = node->tag;
}
- if (sample_flags & SAMPLE_TS) {
+ if (sample_flags & (SAMPLE_TS|SAMPLE_CLK_MONO)) {
/* If OA sampling is enabled, derive the ts from OA report.
* Else, forward the timestamp collected via command stream.
*/
@@ -1291,7 +1345,12 @@ static int append_one_cs_sample(struct i915_perf_stream *stream,
gpu_ts = *(u64 *)
(dev_priv->perf.command_stream_buf[id].addr +
node->ts_offset);
- data.ts = gpu_ts;
+
+ if (sample_flags & SAMPLE_TS)
+ data.gpu_ts = gpu_ts;
+ if (sample_flags & SAMPLE_CLK_MONO)
+ data.clk_mono = get_clk_mono_from_gpu_ts(stream,
+ gpu_ts);
}
return append_sample(stream, read_state, &data);
@@ -2039,17 +2098,118 @@ static void gen8_oa_enable(struct drm_i915_private *dev_priv)
GEN8_OA_COUNTER_ENABLE);
}
+static void i915_perf_get_clock(struct drm_i915_private *dev_priv,
+ u64 *clk_mono, u64 *gpu_time, u64 *gpu_ts)
+{
+ u64 remainder, ts_interval = NSEC_PER_SEC;
+ u32 gpu_freq = dev_priv->perf.timestamp_frequency;
+ unsigned long flags;
+
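+ /* Read the two clocks back to back with interrupts disabled, so that
+ * both readings correspond to (nearly) the same instant.
+ */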
+ local_irq_save(flags);
+ *clk_mono = ktime_get_mono_fast_ns();
+ *gpu_ts = get_current_gpu_ts(dev_priv);
+ local_irq_restore(flags);
+
+ remainder = do_div(ts_interval, gpu_freq);
+ remainder *= *gpu_ts;
+ do_div(remainder, gpu_freq);
+
+ *gpu_time = ((*gpu_ts) * ts_interval) + remainder;
+}
+
+static void i915_perf_clock_sync_work(struct work_struct *work)
+{
+ struct drm_i915_private *dev_priv =
+ container_of(work, typeof(*dev_priv), perf.clk_sync_work.work);
+ u64 last_clk_mono = dev_priv->perf.clk_mono;
+ u64 last_gpu_time = dev_priv->perf.gpu_time;
+ u64 clk_mono, clk_mono_offset, gpu_time, gpu_time_offset, gpu_ts;
+ u32 gpu_freq = dev_priv->perf.timestamp_frequency;
+ u64 remainder, ts_interval = NSEC_PER_SEC;
+ s64 delta, freq_delta;
+
+ i915_perf_get_clock(dev_priv, &clk_mono, &gpu_time, &gpu_ts);
+
+ clk_mono_offset = clk_mono - last_clk_mono;
+ gpu_time_offset = gpu_time - last_gpu_time;
+
+ /* delta time in ns */
+ delta = gpu_time_offset - clk_mono_offset;
+
+ /* If time delta < 1 us, we can assume gpu frequency is correct */
+ if (abs(delta) < NSEC_PER_USEC)
+ goto out;
+
+ /* The two clocks shouldn't deviate by more than 1 second over the
+ * resync period. If they do (which may happen across suspend/resume),
+ * skip the frequency correction and just fast forward/rewind the
+ * clocks to resync immediately.
+ */
+ if (abs(delta) > NSEC_PER_SEC)
+ goto out;
+
+ /* Calculate the frequency delta. clk_mono_offset can exceed 32 bits
+ * for the longer resync periods, so use a full 64-bit division here
+ * rather than do_div().
+ */
+ freq_delta = div64_u64(abs(delta) * gpu_freq, clk_mono_offset);
+
+ if (delta < 0)
+ freq_delta = -freq_delta;
+
+ dev_priv->perf.timestamp_frequency += freq_delta;
+
+ /*
+ * Calculate the updated gpu_time based on the corrected frequency.
+ * Note that this may cause the derived gpu time to jump forwards or
+ * backwards, depending on the sign of the frequency delta. The
+ * clamping in get_clk_mono_from_gpu_ts() keeps the sample timestamps
+ * monotonic across such jumps.
+ */
+ gpu_freq = dev_priv->perf.timestamp_frequency;
+ remainder = do_div(ts_interval, gpu_freq);
+
+ remainder *= gpu_ts;
+ do_div(remainder, gpu_freq);
+ gpu_time = (ts_interval * gpu_ts) + remainder;
+
+out:
+ dev_priv->perf.clk_mono = clk_mono;
+ dev_priv->perf.gpu_time = gpu_time;
+ dev_priv->perf.clk_offset = dev_priv->perf.gpu_time -
+ dev_priv->perf.clk_mono;
+
+ /* Schedule the next synchronization at exponentially increasing
+ * intervals, so that the accuracy of our calculated frequency can
+ * improve over time. Once the period exceeds MAX_CLK_SYNC_PERIOD
+ * (25 seconds), the frequency is considered settled and the work is
+ * not rescheduled.
+ */
+ dev_priv->perf.resync_period *= 2;
+ if (dev_priv->perf.resync_period < MAX_CLK_SYNC_PERIOD)
+ schedule_delayed_work(&dev_priv->perf.clk_sync_work,
+ msecs_to_jiffies(dev_priv->perf.resync_period));
+}
+
static void i915_ring_stream_enable(struct i915_perf_stream *stream)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
if (stream->sample_flags & SAMPLE_OA_REPORT) {
- dev_priv->perf.oa.last_gpu_ts =
- ((u64)I915_READ(GT_TIMESTAMP_COUNT_UDW) << 32) |
- I915_READ(GT_TIMESTAMP_COUNT);
+ dev_priv->perf.oa.last_gpu_ts = get_current_gpu_ts(dev_priv);
dev_priv->perf.oa.ops.oa_enable(dev_priv);
}
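+
+ /* Establish the initial gpu time <-> CLOCK_MONOTONIC correlation,
+ * and kick off the periodic clock sync work.
+ */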
+ if (stream->sample_flags & SAMPLE_CLK_MONO) {
+ u64 gpu_ts;
+
+ i915_perf_get_clock(dev_priv, &dev_priv->perf.clk_mono,
+ &dev_priv->perf.gpu_time, &gpu_ts);
+ dev_priv->perf.clk_offset = dev_priv->perf.gpu_time -
+ dev_priv->perf.clk_mono;
+
+ if (dev_priv->perf.resync_period < MAX_CLK_SYNC_PERIOD)
+ schedule_delayed_work(&dev_priv->perf.clk_sync_work,
+ msecs_to_jiffies(dev_priv->perf.resync_period));
+ }
+
if (stream->cs_mode)
stream->command_stream_hook = i915_ring_stream_cs_hook;
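As a sanity check on the correction math in i915_perf_clock_sync_work(),
here is a standalone sketch with made-up numbers: a nominal 12.5 MHz
timestamp clock that actually runs at 12,500,100 Hz, observed over a 20
second resync window. The computed freq_delta recovers the true frequency
exactly:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t gpu_freq = 12500000;  /* nominal frequency, Hz */
        uint64_t true_freq = 12500100; /* actual frequency, Hz (made up) */
        uint64_t clk_mono_offset = 20000000000ULL; /* 20 s wall time, ns */

        /* ticks elapsed at the true rate, converted to gpu time at the
         * nominal rate (1e9 / 12.5e6 = exactly 80 ns per tick)
         */
        uint64_t ticks = true_freq * 20;
        uint64_t gpu_time_offset = ticks * (1000000000ULL / gpu_freq);

        int64_t delta = gpu_time_offset - clk_mono_offset;       /* 160000 ns */
        int64_t freq_delta = delta * gpu_freq / clk_mono_offset; /* 100 Hz */

        /* prints "corrected freq: 12500100 Hz" */
        printf("corrected freq: %llu Hz\n",
               (unsigned long long)(gpu_freq + freq_delta));
        return 0;
    }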
@@ -2073,6 +2233,8 @@ static void i915_ring_stream_disable(struct i915_perf_stream *stream)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
+ cancel_delayed_work_sync(&dev_priv->perf.clk_sync_work);
+
if (stream->cs_mode || dev_priv->perf.oa.periodic)
hrtimer_cancel(&dev_priv->perf.poll_check_timer);
@@ -2089,7 +2251,7 @@ static void i915_ring_stream_disable(struct i915_perf_stream *stream)
static u64 oa_exponent_to_ns(struct drm_i915_private *dev_priv, int exponent)
{
return 1000000000ULL * (2ULL << exponent) /
- dev_priv->perf.oa.timestamp_frequency;
+ dev_priv->perf.timestamp_frequency;
}
static int i915_ring_stream_init(struct i915_perf_stream *stream,
@@ -2102,7 +2264,8 @@ static int i915_ring_stream_init(struct i915_perf_stream *stream,
bool require_cs_mode = props->sample_flags & (SAMPLE_PID |
SAMPLE_TAG);
bool cs_sample_data = props->sample_flags & (SAMPLE_OA_REPORT |
- SAMPLE_TS);
+ SAMPLE_TS |
+ SAMPLE_CLK_MONO);
int ret;
if ((props->sample_flags & SAMPLE_CTX_ID) && !props->cs_mode) {
@@ -2249,6 +2412,19 @@ static int i915_ring_stream_init(struct i915_perf_stream *stream,
require_cs_mode = true;
}
+ if (props->sample_flags & SAMPLE_CLK_MONO) {
+ stream->sample_flags |= SAMPLE_CLK_MONO;
+ stream->sample_size += I915_PERF_TS_SAMPLE_SIZE;
+
+ /*
+ * NB: SAMPLE_CLK_MONO is meaningful with either CS mode or periodic
+ * OA mode sampling, but we don't allow it without at least one of
+ * the two modes
+ */
+ if (!require_oa_unit)
+ require_cs_mode = true;
+ }
+
if (require_cs_mode && !props->cs_mode) {
- DRM_ERROR(
- "PID, TAG or TS sampling require a ring to be specified");
+ DRM_ERROR(
+ "PID, TAG, TS or CLK_MONO sampling requires a ring to be specified");
@@ -2273,11 +2449,13 @@ static int i915_ring_stream_init(struct i915_perf_stream *stream,
/*
* The only time we should allow enabling CS mode if it's not
- * strictly required, is if SAMPLE_CTX_ID or SAMPLE_TS has been
- * requested, as they're usable with periodic OA or CS sampling.
+ * strictly required, is if SAMPLE_CTX_ID, SAMPLE_TS, or
+ * SAMPLE_CLK_MONO has been requested, as they're usable with
+ * periodic OA or CS sampling.
*/
if (!require_cs_mode &&
- !(props->sample_flags & (SAMPLE_CTX_ID|SAMPLE_TS))) {
+ !(props->sample_flags &
+ (SAMPLE_CTX_ID|SAMPLE_TS|SAMPLE_CLK_MONO))) {
DRM_ERROR(
"Ring given without requesting any CS specific property");
ret = -EINVAL;
@@ -2955,6 +3133,9 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv,
case DRM_I915_PERF_PROP_SAMPLE_TS:
props->sample_flags |= SAMPLE_TS;
break;
+ case DRM_I915_PERF_PROP_SAMPLE_CLOCK_MONOTONIC:
+ props->sample_flags |= SAMPLE_CLK_MONO;
+ break;
case DRM_I915_PERF_PROP_MAX:
BUG();
}
@@ -3072,8 +3253,11 @@ void i915_perf_init(struct drm_device *dev)
mutex_init(&dev_priv->perf.streams_lock);
spin_lock_init(&dev_priv->perf.hook_lock);
- dev_priv->perf.oa.timestamp_frequency =
+ dev_priv->perf.timestamp_frequency =
GT_CS_TIMESTAMP_FREQUENCY(dev_priv);
+ dev_priv->perf.resync_period = INIT_CLK_SYNC_PERIOD;
+ INIT_DELAYED_WORK(&dev_priv->perf.clk_sync_work,
+ i915_perf_clock_sync_work);
if (IS_HASWELL(dev)) {
dev_priv->perf.oa.ops.init_oa_buffer = gen7_init_oa_buffer;
@@ -3288,6 +3288,16 @@ enum skl_disp_power_wells {
INTERVAL_1_33_US(us)) : \
INTERVAL_1_28_US(us))
+/* GT CS timestamp frequency */
+#define FREQUENCY_12_5_MHZ (12500000)
+#define FREQUENCY_12_0_MHZ (12000000)
+#define FREQUENCY_19_2_MHZ (19200000)
+#define GT_CS_TIMESTAMP_FREQUENCY(dev_priv) (IS_GEN9(dev_priv) ? \
+ (IS_BROXTON(dev_priv) ? \
+ FREQUENCY_19_2_MHZ : \
+ FREQUENCY_12_0_MHZ) : \
+ FREQUENCY_12_5_MHZ)
+
/*
* Logical Context regs
*/
@@ -1278,6 +1278,12 @@ enum drm_i915_perf_property_id {
*/
DRM_I915_PERF_PROP_SAMPLE_TS,
+ /**
+ * This property requests inclusion of CLOCK_MONOTONIC system time in
+ * the perf sample data.
+ */
+ DRM_I915_PERF_PROP_SAMPLE_CLOCK_MONOTONIC,
+
DRM_I915_PERF_PROP_MAX /* non-ABI */
};
@@ -1346,7 +1352,8 @@ enum drm_i915_perf_record_type {
* { u32 ctx_id; } && DRM_I915_PERF_PROP_SAMPLE_CTX_ID
* { u32 pid; } && DRM_I915_PERF_PROP_SAMPLE_PID
* { u32 tag; } && DRM_I915_PERF_PROP_SAMPLE_TAG
- * { u64 timestamp; } && DRM_I915_PERF_PROP_SAMPLE_TS
+ * { u64 gpu_ts; } && DRM_I915_PERF_PROP_SAMPLE_TS
+ * { u64 clk_mono; } && DRM_I915_PERF_PROP_SAMPLE_CLOCK_MONOTONIC
* { u32 oa_report[]; } && DRM_I915_PERF_PROP_SAMPLE_OA
* };
*/
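For illustration, here is a hedged userspace sketch of how a consumer might
walk these records for a stream opened with only DRM_I915_PERF_PROP_SAMPLE_TS
and DRM_I915_PERF_PROP_SAMPLE_CLOCK_MONOTONIC, so that each sample body is
{ u64 gpu_ts; u64 clk_mono; } per the layout above. The local rec_hdr struct
mirrors struct drm_i915_perf_record_header as an assumption of this sketch:

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    /* Assumed mirror of struct drm_i915_perf_record_header */
    struct rec_hdr {
        uint32_t type;
        uint16_t pad;
        uint16_t size;
    };

    #define REC_SAMPLE 1 /* DRM_I915_PERF_RECORD_SAMPLE */

    /* buf/len are the bytes returned by one read() on the stream fd */
    static void parse_samples(const uint8_t *buf, size_t len)
    {
        const uint8_t *p = buf;

        while ((size_t)(p - buf) + sizeof(struct rec_hdr) <= len) {
            struct rec_hdr hdr;
            uint64_t gpu_ts, clk_mono;

            memcpy(&hdr, p, sizeof(hdr));
            if (hdr.size < sizeof(hdr))
                break; /* malformed record */
            if (hdr.type == REC_SAMPLE) {
                /* sample body: u64 gpu_ts, then u64 clk_mono */
                memcpy(&gpu_ts, p + sizeof(hdr), 8);
                memcpy(&clk_mono, p + sizeof(hdr) + 8, 8);
                printf("gpu_ts=%llu clk_mono=%llu\n",
                       (unsigned long long)gpu_ts,
                       (unsigned long long)clk_mono);
            }
            p += hdr.size; /* hdr.size covers header + payload */
        }
    }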