Message ID | 20180530115539.27556-1-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 30/05/2018 12:55, Chris Wilson wrote: > hrtimer is not reliable enough to assume fixed intervals, and so even > coarse accuracy (in the face of kasan and similar heavy debugging) we > need to measure the actual interval between sample. > > While using a single timestamp to compute the interval does not allow > very fine accuracy (consider the impact of a slow forcewake between > different samples after the timestamp is read) is much better than > assuming the interval. > > v2: Make use of the sample period for estimating the GPU clock cycles, > leaving the frequency calculation (the averaging) to the caller. > Introduce new samplers for reporting cycles instead of Hz. > > Testcase: igt/perf_pmu #ivb > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > --- > drivers/gpu/drm/i915/i915_pmu.c | 44 ++++++++++++++++++++++++++------- > drivers/gpu/drm/i915/i915_pmu.h | 6 +++++ > include/uapi/drm/i915_drm.h | 2 ++ > 3 files changed, 43 insertions(+), 9 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c > index dc87797db500..12033e47e3b4 100644 > --- a/drivers/gpu/drm/i915/i915_pmu.c > +++ b/drivers/gpu/drm/i915/i915_pmu.c > @@ -86,6 +86,8 @@ static bool pmu_needs_timer(struct drm_i915_private *i915, bool gpu_active) > */ > enable &= config_enabled_mask(I915_PMU_ACTUAL_FREQUENCY) | > config_enabled_mask(I915_PMU_REQUESTED_FREQUENCY) | > + config_enabled_mask(I915_PMU_ACTUAL_CLOCK) | > + config_enabled_mask(I915_PMU_REQUESTED_CLOCK) | > ENGINE_SAMPLE_MASK; > > /* > @@ -127,6 +129,7 @@ static void __i915_pmu_maybe_start_timer(struct drm_i915_private *i915) > { > if (!i915->pmu.timer_enabled && pmu_needs_timer(i915, true)) { > i915->pmu.timer_enabled = true; > + i915->pmu.timestamp = ktime_get(); > hrtimer_start_range_ns(&i915->pmu.timer, > ns_to_ktime(PERIOD), 0, > HRTIMER_MODE_REL_PINNED); > @@ -160,7 +163,7 @@ update_sample(struct i915_pmu_sample *sample, u32 unit, u32 val) > sample->cur += mul_u32_u32(val, unit); > } > > -static void engines_sample(struct drm_i915_private *dev_priv) > +static void engines_sample(struct drm_i915_private *dev_priv, u64 period) > { > struct intel_engine_cs *engine; > enum intel_engine_id id; > @@ -183,7 +186,7 @@ static void engines_sample(struct drm_i915_private *dev_priv) > val = !i915_seqno_passed(current_seqno, last_seqno); > > update_sample(&engine->pmu.sample[I915_SAMPLE_BUSY], > - PERIOD, val); > + period, val); > > if (val && (engine->pmu.enable & > (BIT(I915_SAMPLE_WAIT) | BIT(I915_SAMPLE_SEMA)))) { > @@ -195,10 +198,10 @@ static void engines_sample(struct drm_i915_private *dev_priv) > } > > update_sample(&engine->pmu.sample[I915_SAMPLE_WAIT], > - PERIOD, !!(val & RING_WAIT)); > + period, !!(val & RING_WAIT)); > > update_sample(&engine->pmu.sample[I915_SAMPLE_SEMA], > - PERIOD, !!(val & RING_WAIT_SEMAPHORE)); > + period, !!(val & RING_WAIT_SEMAPHORE)); > } > > if (fw) > @@ -207,10 +210,11 @@ static void engines_sample(struct drm_i915_private *dev_priv) > intel_runtime_pm_put(dev_priv); > } > > -static void frequency_sample(struct drm_i915_private *dev_priv) > +static void frequency_sample(struct drm_i915_private *dev_priv, u64 period) > { > if (dev_priv->pmu.enable & > - config_enabled_mask(I915_PMU_ACTUAL_FREQUENCY)) { > + (config_enabled_mask(I915_PMU_ACTUAL_CLOCK) | > + config_enabled_mask(I915_PMU_ACTUAL_FREQUENCY))) { > u32 val; > > val = dev_priv->gt_pm.rps.cur_freq; > @@ -223,13 +227,20 @@ static void frequency_sample(struct drm_i915_private *dev_priv) > > update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_FREQ_ACT], > 1, intel_gpu_freq(dev_priv, val)); > + update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_CLOCK_ACT], > + period, intel_gpu_freq(dev_priv, val)); Cache intel_gpu_freq in a local. > } > > if (dev_priv->pmu.enable & > - config_enabled_mask(I915_PMU_REQUESTED_FREQUENCY)) { > + (config_enabled_mask(I915_PMU_REQUESTED_CLOCK) | > + config_enabled_mask(I915_PMU_REQUESTED_FREQUENCY))) { > update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_FREQ_REQ], 1, > intel_gpu_freq(dev_priv, > dev_priv->gt_pm.rps.cur_freq)); > + update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_CLOCK_REQ], > + period, > + intel_gpu_freq(dev_priv, > + dev_priv->gt_pm.rps.cur_freq)); Same here. > } > } > > @@ -237,12 +248,17 @@ static enum hrtimer_restart i915_sample(struct hrtimer *hrtimer) > { > struct drm_i915_private *i915 = > container_of(hrtimer, struct drm_i915_private, pmu.timer); > + ktime_t now, period; > > if (!READ_ONCE(i915->pmu.timer_enabled)) > return HRTIMER_NORESTART; > > - engines_sample(i915); > - frequency_sample(i915); > + now = ktime_get(); > + period = ktime_sub(now, i915->pmu.timestamp); > + i915->pmu.timestamp = now; > + > + engines_sample(i915, period); > + frequency_sample(i915, period); > > hrtimer_forward_now(hrtimer, ns_to_ktime(PERIOD)); > return HRTIMER_RESTART; > @@ -313,11 +329,13 @@ config_status(struct drm_i915_private *i915, u64 config) > { > switch (config) { > case I915_PMU_ACTUAL_FREQUENCY: > + case I915_PMU_ACTUAL_CLOCK: > if (IS_VALLEYVIEW(i915) || IS_CHERRYVIEW(i915)) > /* Requires a mutex for sampling! */ > return -ENODEV; > /* Fall-through. */ > case I915_PMU_REQUESTED_FREQUENCY: > + case I915_PMU_REQUESTED_CLOCK: > if (INTEL_GEN(i915) < 6) > return -ENODEV; > break; > @@ -526,6 +544,12 @@ static u64 __i915_pmu_event_read(struct perf_event *event) > div_u64(i915->pmu.sample[__I915_SAMPLE_FREQ_REQ].cur, > FREQUENCY); > break; > + case I915_PMU_ACTUAL_CLOCK: > + val = i915->pmu.sample[__I915_SAMPLE_CLOCK_ACT].cur; > + break; > + case I915_PMU_REQUESTED_CLOCK: > + val = i915->pmu.sample[__I915_SAMPLE_CLOCK_REQ].cur; > + break; > case I915_PMU_INTERRUPTS: > val = count_interrupts(i915); > break; > @@ -803,6 +827,8 @@ create_event_attributes(struct drm_i915_private *i915) > __event(I915_PMU_REQUESTED_FREQUENCY, "requested-frequency", "MHz"), > __event(I915_PMU_INTERRUPTS, "interrupts", NULL), > __event(I915_PMU_RC6_RESIDENCY, "rc6-residency", "ns"), > + __event(I915_PMU_ACTUAL_CLOCK, "actual-clock", "Mcycles"), > + __event(I915_PMU_REQUESTED_CLOCK, "requested-clock", "Mcycles"), > }; > static const struct { > enum drm_i915_pmu_engine_sample sample; > diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h > index 2ba735299f7c..9c4252d85b5e 100644 > --- a/drivers/gpu/drm/i915/i915_pmu.h > +++ b/drivers/gpu/drm/i915/i915_pmu.h > @@ -17,6 +17,8 @@ struct drm_i915_private; > enum { > __I915_SAMPLE_FREQ_ACT = 0, > __I915_SAMPLE_FREQ_REQ, > + __I915_SAMPLE_CLOCK_ACT, > + __I915_SAMPLE_CLOCK_REQ, > __I915_SAMPLE_RC6, > __I915_SAMPLE_RC6_ESTIMATED, > __I915_NUM_PMU_SAMPLERS > @@ -52,6 +54,10 @@ struct i915_pmu { > * @timer: Timer for internal i915 PMU sampling. > */ > struct hrtimer timer; > + /** > + * @timestamp: Timestamp of last internal i915 PMU sampling. > + */ > + ktime_t timestamp; > /** > * @enable: Bitmask of all currently enabled events. > * > diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h > index 7f5634ce8e88..61ab71986274 100644 > --- a/include/uapi/drm/i915_drm.h > +++ b/include/uapi/drm/i915_drm.h > @@ -139,6 +139,8 @@ enum drm_i915_pmu_engine_sample { > #define I915_PMU_REQUESTED_FREQUENCY __I915_PMU_OTHER(1) > #define I915_PMU_INTERRUPTS __I915_PMU_OTHER(2) > #define I915_PMU_RC6_RESIDENCY __I915_PMU_OTHER(3) > +#define I915_PMU_ACTUAL_CLOCK __I915_PMU_OTHER(4) > +#define I915_PMU_REQUESTED_CLOCK __I915_PMU_OTHER(5) > > #define I915_PMU_LAST I915_PMU_RC6_RESIDENCY Bump this one. I want to know if we could get away without introducing a new pair of counter. For instance would running average of a period do for frequency readout? It depends on what kind of error we are facing. Or a moving average for some period? I would explore that but don't have an Ivybridge so could you have a look? Regards, Tvrtko
Quoting Tvrtko Ursulin (2018-05-30 15:37:18) > > On 30/05/2018 12:55, Chris Wilson wrote: > > hrtimer is not reliable enough to assume fixed intervals, and so even > > coarse accuracy (in the face of kasan and similar heavy debugging) we > > need to measure the actual interval between sample. > > > > While using a single timestamp to compute the interval does not allow > > very fine accuracy (consider the impact of a slow forcewake between > > different samples after the timestamp is read) is much better than > > assuming the interval. > > > > v2: Make use of the sample period for estimating the GPU clock cycles, > > leaving the frequency calculation (the averaging) to the caller. > > Introduce new samplers for reporting cycles instead of Hz. > > > > Testcase: igt/perf_pmu #ivb > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > > --- > > drivers/gpu/drm/i915/i915_pmu.c | 44 ++++++++++++++++++++++++++------- > > drivers/gpu/drm/i915/i915_pmu.h | 6 +++++ > > include/uapi/drm/i915_drm.h | 2 ++ > > 3 files changed, 43 insertions(+), 9 deletions(-) > > > > diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c > > index dc87797db500..12033e47e3b4 100644 > > --- a/drivers/gpu/drm/i915/i915_pmu.c > > +++ b/drivers/gpu/drm/i915/i915_pmu.c > > @@ -86,6 +86,8 @@ static bool pmu_needs_timer(struct drm_i915_private *i915, bool gpu_active) > > */ > > enable &= config_enabled_mask(I915_PMU_ACTUAL_FREQUENCY) | > > config_enabled_mask(I915_PMU_REQUESTED_FREQUENCY) | > > + config_enabled_mask(I915_PMU_ACTUAL_CLOCK) | > > + config_enabled_mask(I915_PMU_REQUESTED_CLOCK) | > > ENGINE_SAMPLE_MASK; > > > > /* > > @@ -127,6 +129,7 @@ static void __i915_pmu_maybe_start_timer(struct drm_i915_private *i915) > > { > > if (!i915->pmu.timer_enabled && pmu_needs_timer(i915, true)) { > > i915->pmu.timer_enabled = true; > > + i915->pmu.timestamp = ktime_get(); > > hrtimer_start_range_ns(&i915->pmu.timer, > > ns_to_ktime(PERIOD), 0, > > HRTIMER_MODE_REL_PINNED); > > @@ -160,7 +163,7 @@ update_sample(struct i915_pmu_sample *sample, u32 unit, u32 val) > > sample->cur += mul_u32_u32(val, unit); > > } > > > > -static void engines_sample(struct drm_i915_private *dev_priv) > > +static void engines_sample(struct drm_i915_private *dev_priv, u64 period) > > { > > struct intel_engine_cs *engine; > > enum intel_engine_id id; > > @@ -183,7 +186,7 @@ static void engines_sample(struct drm_i915_private *dev_priv) > > val = !i915_seqno_passed(current_seqno, last_seqno); > > > > update_sample(&engine->pmu.sample[I915_SAMPLE_BUSY], > > - PERIOD, val); > > + period, val); > > > > if (val && (engine->pmu.enable & > > (BIT(I915_SAMPLE_WAIT) | BIT(I915_SAMPLE_SEMA)))) { > > @@ -195,10 +198,10 @@ static void engines_sample(struct drm_i915_private *dev_priv) > > } > > > > update_sample(&engine->pmu.sample[I915_SAMPLE_WAIT], > > - PERIOD, !!(val & RING_WAIT)); > > + period, !!(val & RING_WAIT)); > > > > update_sample(&engine->pmu.sample[I915_SAMPLE_SEMA], > > - PERIOD, !!(val & RING_WAIT_SEMAPHORE)); > > + period, !!(val & RING_WAIT_SEMAPHORE)); > > } > > > > if (fw) > > @@ -207,10 +210,11 @@ static void engines_sample(struct drm_i915_private *dev_priv) > > intel_runtime_pm_put(dev_priv); > > } > > > > -static void frequency_sample(struct drm_i915_private *dev_priv) > > +static void frequency_sample(struct drm_i915_private *dev_priv, u64 period) > > { > > if (dev_priv->pmu.enable & > > - config_enabled_mask(I915_PMU_ACTUAL_FREQUENCY)) { > > + (config_enabled_mask(I915_PMU_ACTUAL_CLOCK) | > > + config_enabled_mask(I915_PMU_ACTUAL_FREQUENCY))) { > > u32 val; > > > > val = dev_priv->gt_pm.rps.cur_freq; > > @@ -223,13 +227,20 @@ static void frequency_sample(struct drm_i915_private *dev_priv) > > > > update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_FREQ_ACT], > > 1, intel_gpu_freq(dev_priv, val)); > > + update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_CLOCK_ACT], > > + period, intel_gpu_freq(dev_priv, val)); > > Cache intel_gpu_freq in a local. > > > } > > > > if (dev_priv->pmu.enable & > > - config_enabled_mask(I915_PMU_REQUESTED_FREQUENCY)) { > > + (config_enabled_mask(I915_PMU_REQUESTED_CLOCK) | > > + config_enabled_mask(I915_PMU_REQUESTED_FREQUENCY))) { > > update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_FREQ_REQ], 1, > > intel_gpu_freq(dev_priv, > > dev_priv->gt_pm.rps.cur_freq)); > > + update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_CLOCK_REQ], > > + period, > > + intel_gpu_freq(dev_priv, > > + dev_priv->gt_pm.rps.cur_freq)); > > Same here. > > > } > > } > > > > @@ -237,12 +248,17 @@ static enum hrtimer_restart i915_sample(struct hrtimer *hrtimer) > > { > > struct drm_i915_private *i915 = > > container_of(hrtimer, struct drm_i915_private, pmu.timer); > > + ktime_t now, period; > > > > if (!READ_ONCE(i915->pmu.timer_enabled)) > > return HRTIMER_NORESTART; > > > > - engines_sample(i915); > > - frequency_sample(i915); > > + now = ktime_get(); > > + period = ktime_sub(now, i915->pmu.timestamp); > > + i915->pmu.timestamp = now; > > + > > + engines_sample(i915, period); > > + frequency_sample(i915, period); > > > > hrtimer_forward_now(hrtimer, ns_to_ktime(PERIOD)); > > return HRTIMER_RESTART; > > @@ -313,11 +329,13 @@ config_status(struct drm_i915_private *i915, u64 config) > > { > > switch (config) { > > case I915_PMU_ACTUAL_FREQUENCY: > > + case I915_PMU_ACTUAL_CLOCK: > > if (IS_VALLEYVIEW(i915) || IS_CHERRYVIEW(i915)) > > /* Requires a mutex for sampling! */ > > return -ENODEV; > > /* Fall-through. */ > > case I915_PMU_REQUESTED_FREQUENCY: > > + case I915_PMU_REQUESTED_CLOCK: > > if (INTEL_GEN(i915) < 6) > > return -ENODEV; > > break; > > @@ -526,6 +544,12 @@ static u64 __i915_pmu_event_read(struct perf_event *event) > > div_u64(i915->pmu.sample[__I915_SAMPLE_FREQ_REQ].cur, > > FREQUENCY); > > break; > > + case I915_PMU_ACTUAL_CLOCK: > > + val = i915->pmu.sample[__I915_SAMPLE_CLOCK_ACT].cur; > > + break; > > + case I915_PMU_REQUESTED_CLOCK: > > + val = i915->pmu.sample[__I915_SAMPLE_CLOCK_REQ].cur; > > + break; > > case I915_PMU_INTERRUPTS: > > val = count_interrupts(i915); > > break; > > @@ -803,6 +827,8 @@ create_event_attributes(struct drm_i915_private *i915) > > __event(I915_PMU_REQUESTED_FREQUENCY, "requested-frequency", "MHz"), > > __event(I915_PMU_INTERRUPTS, "interrupts", NULL), > > __event(I915_PMU_RC6_RESIDENCY, "rc6-residency", "ns"), > > + __event(I915_PMU_ACTUAL_CLOCK, "actual-clock", "Mcycles"), > > + __event(I915_PMU_REQUESTED_CLOCK, "requested-clock", "Mcycles"), > > }; > > static const struct { > > enum drm_i915_pmu_engine_sample sample; > > diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h > > index 2ba735299f7c..9c4252d85b5e 100644 > > --- a/drivers/gpu/drm/i915/i915_pmu.h > > +++ b/drivers/gpu/drm/i915/i915_pmu.h > > @@ -17,6 +17,8 @@ struct drm_i915_private; > > enum { > > __I915_SAMPLE_FREQ_ACT = 0, > > __I915_SAMPLE_FREQ_REQ, > > + __I915_SAMPLE_CLOCK_ACT, > > + __I915_SAMPLE_CLOCK_REQ, > > __I915_SAMPLE_RC6, > > __I915_SAMPLE_RC6_ESTIMATED, > > __I915_NUM_PMU_SAMPLERS > > @@ -52,6 +54,10 @@ struct i915_pmu { > > * @timer: Timer for internal i915 PMU sampling. > > */ > > struct hrtimer timer; > > + /** > > + * @timestamp: Timestamp of last internal i915 PMU sampling. > > + */ > > + ktime_t timestamp; > > /** > > * @enable: Bitmask of all currently enabled events. > > * > > diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h > > index 7f5634ce8e88..61ab71986274 100644 > > --- a/include/uapi/drm/i915_drm.h > > +++ b/include/uapi/drm/i915_drm.h > > @@ -139,6 +139,8 @@ enum drm_i915_pmu_engine_sample { > > #define I915_PMU_REQUESTED_FREQUENCY __I915_PMU_OTHER(1) > > #define I915_PMU_INTERRUPTS __I915_PMU_OTHER(2) > > #define I915_PMU_RC6_RESIDENCY __I915_PMU_OTHER(3) > > +#define I915_PMU_ACTUAL_CLOCK __I915_PMU_OTHER(4) > > +#define I915_PMU_REQUESTED_CLOCK __I915_PMU_OTHER(5) > > > > #define I915_PMU_LAST I915_PMU_RC6_RESIDENCY > > Bump this one. > > I want to know if we could get away without introducing a new pair of > counter. I wouldn't mind having my cycle counters back in the api ;) They just seem to be easier for me to work with in userspace. Or do you mean if we can just redefine the existing sampler? > For instance would running average of a period do for frequency > readout? It depends on what kind of error we are facing. Or a moving > average for some period? I would explore that but don't have an > Ivybridge so could you have a look? The crucial part is that we don't define the period, the user does. Heh, once again we have two different ideas about what we want to measure :) I'ld take cycles, with your suggestion we may as well do instantaneous frequency and sample from within perf_event_read (no averaging, or just a short ewma)? -Chris
On 30/05/2018 15:55, Chris Wilson wrote: > Quoting Tvrtko Ursulin (2018-05-30 15:37:18) >> >> On 30/05/2018 12:55, Chris Wilson wrote: >>> hrtimer is not reliable enough to assume fixed intervals, and so even >>> coarse accuracy (in the face of kasan and similar heavy debugging) we >>> need to measure the actual interval between sample. >>> >>> While using a single timestamp to compute the interval does not allow >>> very fine accuracy (consider the impact of a slow forcewake between >>> different samples after the timestamp is read) is much better than >>> assuming the interval. >>> >>> v2: Make use of the sample period for estimating the GPU clock cycles, >>> leaving the frequency calculation (the averaging) to the caller. >>> Introduce new samplers for reporting cycles instead of Hz. >>> >>> Testcase: igt/perf_pmu #ivb >>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> >>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> >>> --- >>> drivers/gpu/drm/i915/i915_pmu.c | 44 ++++++++++++++++++++++++++------- >>> drivers/gpu/drm/i915/i915_pmu.h | 6 +++++ >>> include/uapi/drm/i915_drm.h | 2 ++ >>> 3 files changed, 43 insertions(+), 9 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c >>> index dc87797db500..12033e47e3b4 100644 >>> --- a/drivers/gpu/drm/i915/i915_pmu.c >>> +++ b/drivers/gpu/drm/i915/i915_pmu.c >>> @@ -86,6 +86,8 @@ static bool pmu_needs_timer(struct drm_i915_private *i915, bool gpu_active) >>> */ >>> enable &= config_enabled_mask(I915_PMU_ACTUAL_FREQUENCY) | >>> config_enabled_mask(I915_PMU_REQUESTED_FREQUENCY) | >>> + config_enabled_mask(I915_PMU_ACTUAL_CLOCK) | >>> + config_enabled_mask(I915_PMU_REQUESTED_CLOCK) | >>> ENGINE_SAMPLE_MASK; >>> >>> /* >>> @@ -127,6 +129,7 @@ static void __i915_pmu_maybe_start_timer(struct drm_i915_private *i915) >>> { >>> if (!i915->pmu.timer_enabled && pmu_needs_timer(i915, true)) { >>> i915->pmu.timer_enabled = true; >>> + i915->pmu.timestamp = ktime_get(); >>> hrtimer_start_range_ns(&i915->pmu.timer, >>> ns_to_ktime(PERIOD), 0, >>> HRTIMER_MODE_REL_PINNED); >>> @@ -160,7 +163,7 @@ update_sample(struct i915_pmu_sample *sample, u32 unit, u32 val) >>> sample->cur += mul_u32_u32(val, unit); >>> } >>> >>> -static void engines_sample(struct drm_i915_private *dev_priv) >>> +static void engines_sample(struct drm_i915_private *dev_priv, u64 period) >>> { >>> struct intel_engine_cs *engine; >>> enum intel_engine_id id; >>> @@ -183,7 +186,7 @@ static void engines_sample(struct drm_i915_private *dev_priv) >>> val = !i915_seqno_passed(current_seqno, last_seqno); >>> >>> update_sample(&engine->pmu.sample[I915_SAMPLE_BUSY], >>> - PERIOD, val); >>> + period, val); >>> >>> if (val && (engine->pmu.enable & >>> (BIT(I915_SAMPLE_WAIT) | BIT(I915_SAMPLE_SEMA)))) { >>> @@ -195,10 +198,10 @@ static void engines_sample(struct drm_i915_private *dev_priv) >>> } >>> >>> update_sample(&engine->pmu.sample[I915_SAMPLE_WAIT], >>> - PERIOD, !!(val & RING_WAIT)); >>> + period, !!(val & RING_WAIT)); >>> >>> update_sample(&engine->pmu.sample[I915_SAMPLE_SEMA], >>> - PERIOD, !!(val & RING_WAIT_SEMAPHORE)); >>> + period, !!(val & RING_WAIT_SEMAPHORE)); >>> } >>> >>> if (fw) >>> @@ -207,10 +210,11 @@ static void engines_sample(struct drm_i915_private *dev_priv) >>> intel_runtime_pm_put(dev_priv); >>> } >>> >>> -static void frequency_sample(struct drm_i915_private *dev_priv) >>> +static void frequency_sample(struct drm_i915_private *dev_priv, u64 period) >>> { >>> if (dev_priv->pmu.enable & >>> - config_enabled_mask(I915_PMU_ACTUAL_FREQUENCY)) { >>> + (config_enabled_mask(I915_PMU_ACTUAL_CLOCK) | >>> + config_enabled_mask(I915_PMU_ACTUAL_FREQUENCY))) { >>> u32 val; >>> >>> val = dev_priv->gt_pm.rps.cur_freq; >>> @@ -223,13 +227,20 @@ static void frequency_sample(struct drm_i915_private *dev_priv) >>> >>> update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_FREQ_ACT], >>> 1, intel_gpu_freq(dev_priv, val)); >>> + update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_CLOCK_ACT], >>> + period, intel_gpu_freq(dev_priv, val)); >> >> Cache intel_gpu_freq in a local. >> >>> } >>> >>> if (dev_priv->pmu.enable & >>> - config_enabled_mask(I915_PMU_REQUESTED_FREQUENCY)) { >>> + (config_enabled_mask(I915_PMU_REQUESTED_CLOCK) | >>> + config_enabled_mask(I915_PMU_REQUESTED_FREQUENCY))) { >>> update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_FREQ_REQ], 1, >>> intel_gpu_freq(dev_priv, >>> dev_priv->gt_pm.rps.cur_freq)); >>> + update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_CLOCK_REQ], >>> + period, >>> + intel_gpu_freq(dev_priv, >>> + dev_priv->gt_pm.rps.cur_freq)); >> >> Same here. >> >>> } >>> } >>> >>> @@ -237,12 +248,17 @@ static enum hrtimer_restart i915_sample(struct hrtimer *hrtimer) >>> { >>> struct drm_i915_private *i915 = >>> container_of(hrtimer, struct drm_i915_private, pmu.timer); >>> + ktime_t now, period; >>> >>> if (!READ_ONCE(i915->pmu.timer_enabled)) >>> return HRTIMER_NORESTART; >>> >>> - engines_sample(i915); >>> - frequency_sample(i915); >>> + now = ktime_get(); >>> + period = ktime_sub(now, i915->pmu.timestamp); >>> + i915->pmu.timestamp = now; >>> + >>> + engines_sample(i915, period); >>> + frequency_sample(i915, period); >>> >>> hrtimer_forward_now(hrtimer, ns_to_ktime(PERIOD)); >>> return HRTIMER_RESTART; >>> @@ -313,11 +329,13 @@ config_status(struct drm_i915_private *i915, u64 config) >>> { >>> switch (config) { >>> case I915_PMU_ACTUAL_FREQUENCY: >>> + case I915_PMU_ACTUAL_CLOCK: >>> if (IS_VALLEYVIEW(i915) || IS_CHERRYVIEW(i915)) >>> /* Requires a mutex for sampling! */ >>> return -ENODEV; >>> /* Fall-through. */ >>> case I915_PMU_REQUESTED_FREQUENCY: >>> + case I915_PMU_REQUESTED_CLOCK: >>> if (INTEL_GEN(i915) < 6) >>> return -ENODEV; >>> break; >>> @@ -526,6 +544,12 @@ static u64 __i915_pmu_event_read(struct perf_event *event) >>> div_u64(i915->pmu.sample[__I915_SAMPLE_FREQ_REQ].cur, >>> FREQUENCY); >>> break; >>> + case I915_PMU_ACTUAL_CLOCK: >>> + val = i915->pmu.sample[__I915_SAMPLE_CLOCK_ACT].cur; >>> + break; >>> + case I915_PMU_REQUESTED_CLOCK: >>> + val = i915->pmu.sample[__I915_SAMPLE_CLOCK_REQ].cur; >>> + break; >>> case I915_PMU_INTERRUPTS: >>> val = count_interrupts(i915); >>> break; >>> @@ -803,6 +827,8 @@ create_event_attributes(struct drm_i915_private *i915) >>> __event(I915_PMU_REQUESTED_FREQUENCY, "requested-frequency", "MHz"), >>> __event(I915_PMU_INTERRUPTS, "interrupts", NULL), >>> __event(I915_PMU_RC6_RESIDENCY, "rc6-residency", "ns"), >>> + __event(I915_PMU_ACTUAL_CLOCK, "actual-clock", "Mcycles"), >>> + __event(I915_PMU_REQUESTED_CLOCK, "requested-clock", "Mcycles"), >>> }; >>> static const struct { >>> enum drm_i915_pmu_engine_sample sample; >>> diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h >>> index 2ba735299f7c..9c4252d85b5e 100644 >>> --- a/drivers/gpu/drm/i915/i915_pmu.h >>> +++ b/drivers/gpu/drm/i915/i915_pmu.h >>> @@ -17,6 +17,8 @@ struct drm_i915_private; >>> enum { >>> __I915_SAMPLE_FREQ_ACT = 0, >>> __I915_SAMPLE_FREQ_REQ, >>> + __I915_SAMPLE_CLOCK_ACT, >>> + __I915_SAMPLE_CLOCK_REQ, >>> __I915_SAMPLE_RC6, >>> __I915_SAMPLE_RC6_ESTIMATED, >>> __I915_NUM_PMU_SAMPLERS >>> @@ -52,6 +54,10 @@ struct i915_pmu { >>> * @timer: Timer for internal i915 PMU sampling. >>> */ >>> struct hrtimer timer; >>> + /** >>> + * @timestamp: Timestamp of last internal i915 PMU sampling. >>> + */ >>> + ktime_t timestamp; >>> /** >>> * @enable: Bitmask of all currently enabled events. >>> * >>> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h >>> index 7f5634ce8e88..61ab71986274 100644 >>> --- a/include/uapi/drm/i915_drm.h >>> +++ b/include/uapi/drm/i915_drm.h >>> @@ -139,6 +139,8 @@ enum drm_i915_pmu_engine_sample { >>> #define I915_PMU_REQUESTED_FREQUENCY __I915_PMU_OTHER(1) >>> #define I915_PMU_INTERRUPTS __I915_PMU_OTHER(2) >>> #define I915_PMU_RC6_RESIDENCY __I915_PMU_OTHER(3) >>> +#define I915_PMU_ACTUAL_CLOCK __I915_PMU_OTHER(4) >>> +#define I915_PMU_REQUESTED_CLOCK __I915_PMU_OTHER(5) >>> >>> #define I915_PMU_LAST I915_PMU_RC6_RESIDENCY >> >> Bump this one. >> >> I want to know if we could get away without introducing a new pair of >> counter. > > I wouldn't mind having my cycle counters back in the api ;) > They just seem to be easier for me to work with in userspace. > > Or do you mean if we can just redefine the existing sampler? Keep the existing semantics but improve implementation sufficiently so they survive the IVB hrtimer problem. >> For instance would running average of a period do for frequency >> readout? It depends on what kind of error we are facing. Or a moving >> average for some period? I would explore that but don't have an >> Ivybridge so could you have a look? > > The crucial part is that we don't define the period, the user does. > > Heh, once again we have two different ideas about what we want to > measure :) I'ld take cycles, with your suggestion we may as well do > instantaneous frequency and sample from within perf_event_read (no > averaging, or just a short ewma)? For me the key question is how unstable is the IVB clock? Is it random jitter and by how much, or what. If we keep x ms moving average from frequency_sample, and use that to convert to Hz on the output, would it be good enough? To me it is preferable to adding new counters. Especially if the error is so small that no one notices _and_ only on IVB. Regards, Tvrtko
Quoting Tvrtko Ursulin (2018-05-30 16:27:02) > > On 30/05/2018 15:55, Chris Wilson wrote: > > Quoting Tvrtko Ursulin (2018-05-30 15:37:18) > >> > >> On 30/05/2018 12:55, Chris Wilson wrote: > >>> hrtimer is not reliable enough to assume fixed intervals, and so even > >>> coarse accuracy (in the face of kasan and similar heavy debugging) we > >>> need to measure the actual interval between sample. > >>> > >>> While using a single timestamp to compute the interval does not allow > >>> very fine accuracy (consider the impact of a slow forcewake between > >>> different samples after the timestamp is read) is much better than > >>> assuming the interval. > >>> > >>> v2: Make use of the sample period for estimating the GPU clock cycles, > >>> leaving the frequency calculation (the averaging) to the caller. > >>> Introduce new samplers for reporting cycles instead of Hz. > >>> > >>> Testcase: igt/perf_pmu #ivb > >>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > >>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > >>> --- > >>> drivers/gpu/drm/i915/i915_pmu.c | 44 ++++++++++++++++++++++++++------- > >>> drivers/gpu/drm/i915/i915_pmu.h | 6 +++++ > >>> include/uapi/drm/i915_drm.h | 2 ++ > >>> 3 files changed, 43 insertions(+), 9 deletions(-) > >>> > >>> diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c > >>> index dc87797db500..12033e47e3b4 100644 > >>> --- a/drivers/gpu/drm/i915/i915_pmu.c > >>> +++ b/drivers/gpu/drm/i915/i915_pmu.c > >>> @@ -86,6 +86,8 @@ static bool pmu_needs_timer(struct drm_i915_private *i915, bool gpu_active) > >>> */ > >>> enable &= config_enabled_mask(I915_PMU_ACTUAL_FREQUENCY) | > >>> config_enabled_mask(I915_PMU_REQUESTED_FREQUENCY) | > >>> + config_enabled_mask(I915_PMU_ACTUAL_CLOCK) | > >>> + config_enabled_mask(I915_PMU_REQUESTED_CLOCK) | > >>> ENGINE_SAMPLE_MASK; > >>> > >>> /* > >>> @@ -127,6 +129,7 @@ static void __i915_pmu_maybe_start_timer(struct drm_i915_private *i915) > >>> { > >>> if (!i915->pmu.timer_enabled && pmu_needs_timer(i915, true)) { > >>> i915->pmu.timer_enabled = true; > >>> + i915->pmu.timestamp = ktime_get(); > >>> hrtimer_start_range_ns(&i915->pmu.timer, > >>> ns_to_ktime(PERIOD), 0, > >>> HRTIMER_MODE_REL_PINNED); > >>> @@ -160,7 +163,7 @@ update_sample(struct i915_pmu_sample *sample, u32 unit, u32 val) > >>> sample->cur += mul_u32_u32(val, unit); > >>> } > >>> > >>> -static void engines_sample(struct drm_i915_private *dev_priv) > >>> +static void engines_sample(struct drm_i915_private *dev_priv, u64 period) > >>> { > >>> struct intel_engine_cs *engine; > >>> enum intel_engine_id id; > >>> @@ -183,7 +186,7 @@ static void engines_sample(struct drm_i915_private *dev_priv) > >>> val = !i915_seqno_passed(current_seqno, last_seqno); > >>> > >>> update_sample(&engine->pmu.sample[I915_SAMPLE_BUSY], > >>> - PERIOD, val); > >>> + period, val); > >>> > >>> if (val && (engine->pmu.enable & > >>> (BIT(I915_SAMPLE_WAIT) | BIT(I915_SAMPLE_SEMA)))) { > >>> @@ -195,10 +198,10 @@ static void engines_sample(struct drm_i915_private *dev_priv) > >>> } > >>> > >>> update_sample(&engine->pmu.sample[I915_SAMPLE_WAIT], > >>> - PERIOD, !!(val & RING_WAIT)); > >>> + period, !!(val & RING_WAIT)); > >>> > >>> update_sample(&engine->pmu.sample[I915_SAMPLE_SEMA], > >>> - PERIOD, !!(val & RING_WAIT_SEMAPHORE)); > >>> + period, !!(val & RING_WAIT_SEMAPHORE)); > >>> } > >>> > >>> if (fw) > >>> @@ -207,10 +210,11 @@ static void engines_sample(struct drm_i915_private *dev_priv) > >>> intel_runtime_pm_put(dev_priv); > >>> } > >>> > >>> -static void frequency_sample(struct drm_i915_private *dev_priv) > >>> +static void frequency_sample(struct drm_i915_private *dev_priv, u64 period) > >>> { > >>> if (dev_priv->pmu.enable & > >>> - config_enabled_mask(I915_PMU_ACTUAL_FREQUENCY)) { > >>> + (config_enabled_mask(I915_PMU_ACTUAL_CLOCK) | > >>> + config_enabled_mask(I915_PMU_ACTUAL_FREQUENCY))) { > >>> u32 val; > >>> > >>> val = dev_priv->gt_pm.rps.cur_freq; > >>> @@ -223,13 +227,20 @@ static void frequency_sample(struct drm_i915_private *dev_priv) > >>> > >>> update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_FREQ_ACT], > >>> 1, intel_gpu_freq(dev_priv, val)); > >>> + update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_CLOCK_ACT], > >>> + period, intel_gpu_freq(dev_priv, val)); > >> > >> Cache intel_gpu_freq in a local. > >> > >>> } > >>> > >>> if (dev_priv->pmu.enable & > >>> - config_enabled_mask(I915_PMU_REQUESTED_FREQUENCY)) { > >>> + (config_enabled_mask(I915_PMU_REQUESTED_CLOCK) | > >>> + config_enabled_mask(I915_PMU_REQUESTED_FREQUENCY))) { > >>> update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_FREQ_REQ], 1, > >>> intel_gpu_freq(dev_priv, > >>> dev_priv->gt_pm.rps.cur_freq)); > >>> + update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_CLOCK_REQ], > >>> + period, > >>> + intel_gpu_freq(dev_priv, > >>> + dev_priv->gt_pm.rps.cur_freq)); > >> > >> Same here. > >> > >>> } > >>> } > >>> > >>> @@ -237,12 +248,17 @@ static enum hrtimer_restart i915_sample(struct hrtimer *hrtimer) > >>> { > >>> struct drm_i915_private *i915 = > >>> container_of(hrtimer, struct drm_i915_private, pmu.timer); > >>> + ktime_t now, period; > >>> > >>> if (!READ_ONCE(i915->pmu.timer_enabled)) > >>> return HRTIMER_NORESTART; > >>> > >>> - engines_sample(i915); > >>> - frequency_sample(i915); > >>> + now = ktime_get(); > >>> + period = ktime_sub(now, i915->pmu.timestamp); > >>> + i915->pmu.timestamp = now; > >>> + > >>> + engines_sample(i915, period); > >>> + frequency_sample(i915, period); > >>> > >>> hrtimer_forward_now(hrtimer, ns_to_ktime(PERIOD)); > >>> return HRTIMER_RESTART; > >>> @@ -313,11 +329,13 @@ config_status(struct drm_i915_private *i915, u64 config) > >>> { > >>> switch (config) { > >>> case I915_PMU_ACTUAL_FREQUENCY: > >>> + case I915_PMU_ACTUAL_CLOCK: > >>> if (IS_VALLEYVIEW(i915) || IS_CHERRYVIEW(i915)) > >>> /* Requires a mutex for sampling! */ > >>> return -ENODEV; > >>> /* Fall-through. */ > >>> case I915_PMU_REQUESTED_FREQUENCY: > >>> + case I915_PMU_REQUESTED_CLOCK: > >>> if (INTEL_GEN(i915) < 6) > >>> return -ENODEV; > >>> break; > >>> @@ -526,6 +544,12 @@ static u64 __i915_pmu_event_read(struct perf_event *event) > >>> div_u64(i915->pmu.sample[__I915_SAMPLE_FREQ_REQ].cur, > >>> FREQUENCY); > >>> break; > >>> + case I915_PMU_ACTUAL_CLOCK: > >>> + val = i915->pmu.sample[__I915_SAMPLE_CLOCK_ACT].cur; > >>> + break; > >>> + case I915_PMU_REQUESTED_CLOCK: > >>> + val = i915->pmu.sample[__I915_SAMPLE_CLOCK_REQ].cur; > >>> + break; > >>> case I915_PMU_INTERRUPTS: > >>> val = count_interrupts(i915); > >>> break; > >>> @@ -803,6 +827,8 @@ create_event_attributes(struct drm_i915_private *i915) > >>> __event(I915_PMU_REQUESTED_FREQUENCY, "requested-frequency", "MHz"), > >>> __event(I915_PMU_INTERRUPTS, "interrupts", NULL), > >>> __event(I915_PMU_RC6_RESIDENCY, "rc6-residency", "ns"), > >>> + __event(I915_PMU_ACTUAL_CLOCK, "actual-clock", "Mcycles"), > >>> + __event(I915_PMU_REQUESTED_CLOCK, "requested-clock", "Mcycles"), > >>> }; > >>> static const struct { > >>> enum drm_i915_pmu_engine_sample sample; > >>> diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h > >>> index 2ba735299f7c..9c4252d85b5e 100644 > >>> --- a/drivers/gpu/drm/i915/i915_pmu.h > >>> +++ b/drivers/gpu/drm/i915/i915_pmu.h > >>> @@ -17,6 +17,8 @@ struct drm_i915_private; > >>> enum { > >>> __I915_SAMPLE_FREQ_ACT = 0, > >>> __I915_SAMPLE_FREQ_REQ, > >>> + __I915_SAMPLE_CLOCK_ACT, > >>> + __I915_SAMPLE_CLOCK_REQ, > >>> __I915_SAMPLE_RC6, > >>> __I915_SAMPLE_RC6_ESTIMATED, > >>> __I915_NUM_PMU_SAMPLERS > >>> @@ -52,6 +54,10 @@ struct i915_pmu { > >>> * @timer: Timer for internal i915 PMU sampling. > >>> */ > >>> struct hrtimer timer; > >>> + /** > >>> + * @timestamp: Timestamp of last internal i915 PMU sampling. > >>> + */ > >>> + ktime_t timestamp; > >>> /** > >>> * @enable: Bitmask of all currently enabled events. > >>> * > >>> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h > >>> index 7f5634ce8e88..61ab71986274 100644 > >>> --- a/include/uapi/drm/i915_drm.h > >>> +++ b/include/uapi/drm/i915_drm.h > >>> @@ -139,6 +139,8 @@ enum drm_i915_pmu_engine_sample { > >>> #define I915_PMU_REQUESTED_FREQUENCY __I915_PMU_OTHER(1) > >>> #define I915_PMU_INTERRUPTS __I915_PMU_OTHER(2) > >>> #define I915_PMU_RC6_RESIDENCY __I915_PMU_OTHER(3) > >>> +#define I915_PMU_ACTUAL_CLOCK __I915_PMU_OTHER(4) > >>> +#define I915_PMU_REQUESTED_CLOCK __I915_PMU_OTHER(5) > >>> > >>> #define I915_PMU_LAST I915_PMU_RC6_RESIDENCY > >> > >> Bump this one. > >> > >> I want to know if we could get away without introducing a new pair of > >> counter. > > > > I wouldn't mind having my cycle counters back in the api ;) > > They just seem to be easier for me to work with in userspace. > > > > Or do you mean if we can just redefine the existing sampler? > > Keep the existing semantics but improve implementation sufficiently so > they survive the IVB hrtimer problem. (Now that I've started, bring back the cycle counters!) > >> For instance would running average of a period do for frequency > >> readout? It depends on what kind of error we are facing. Or a moving > >> average for some period? I would explore that but don't have an > >> Ivybridge so could you have a look? > > > > The crucial part is that we don't define the period, the user does. > > > > Heh, once again we have two different ideas about what we want to > > measure :) I'ld take cycles, with your suggestion we may as well do > > instantaneous frequency and sample from within perf_event_read (no > > averaging, or just a short ewma)? > For me the key question is how unstable is the IVB clock? Is it random > jitter and by how much, or what. If we keep x ms moving average from > frequency_sample, and use that to convert to Hz on the output, would it > be good enough? We don't have that sort of test in perf_pmu. The closest we have was for gem_ctx_freq, and there the frequency sampler was not very accurate those (-100/+100 tolerances were not for thermal throttling). We can try doing the same sawtooths -- just the challenge of systematic errors in both the timer, the rps worker and the hw. > To me it is preferable to adding new counters. Especially if the error > is so small that no one notices _and_ only on IVB. I don't think it's fair to say that only IVB has a problem with the hrtimer, it's just where it's most visible. Or to rule it out being a problem for the future. The code is using hrtimer_forward() so it already assumes it can and will miss samples :) -Chris
On 30/05/2018 16:37, Chris Wilson wrote: > Quoting Tvrtko Ursulin (2018-05-30 16:27:02) >> >> On 30/05/2018 15:55, Chris Wilson wrote: >>> Quoting Tvrtko Ursulin (2018-05-30 15:37:18) >>>> >>>> On 30/05/2018 12:55, Chris Wilson wrote: >>>>> hrtimer is not reliable enough to assume fixed intervals, and so even >>>>> coarse accuracy (in the face of kasan and similar heavy debugging) we >>>>> need to measure the actual interval between sample. >>>>> >>>>> While using a single timestamp to compute the interval does not allow >>>>> very fine accuracy (consider the impact of a slow forcewake between >>>>> different samples after the timestamp is read) is much better than >>>>> assuming the interval. >>>>> >>>>> v2: Make use of the sample period for estimating the GPU clock cycles, >>>>> leaving the frequency calculation (the averaging) to the caller. >>>>> Introduce new samplers for reporting cycles instead of Hz. >>>>> >>>>> Testcase: igt/perf_pmu #ivb >>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> >>>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> >>>>> --- >>>>> drivers/gpu/drm/i915/i915_pmu.c | 44 ++++++++++++++++++++++++++------- >>>>> drivers/gpu/drm/i915/i915_pmu.h | 6 +++++ >>>>> include/uapi/drm/i915_drm.h | 2 ++ >>>>> 3 files changed, 43 insertions(+), 9 deletions(-) >>>>> >>>>> diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c >>>>> index dc87797db500..12033e47e3b4 100644 >>>>> --- a/drivers/gpu/drm/i915/i915_pmu.c >>>>> +++ b/drivers/gpu/drm/i915/i915_pmu.c >>>>> @@ -86,6 +86,8 @@ static bool pmu_needs_timer(struct drm_i915_private *i915, bool gpu_active) >>>>> */ >>>>> enable &= config_enabled_mask(I915_PMU_ACTUAL_FREQUENCY) | >>>>> config_enabled_mask(I915_PMU_REQUESTED_FREQUENCY) | >>>>> + config_enabled_mask(I915_PMU_ACTUAL_CLOCK) | >>>>> + config_enabled_mask(I915_PMU_REQUESTED_CLOCK) | >>>>> ENGINE_SAMPLE_MASK; >>>>> >>>>> /* >>>>> @@ -127,6 +129,7 @@ static void __i915_pmu_maybe_start_timer(struct drm_i915_private *i915) >>>>> { >>>>> if (!i915->pmu.timer_enabled && pmu_needs_timer(i915, true)) { >>>>> i915->pmu.timer_enabled = true; >>>>> + i915->pmu.timestamp = ktime_get(); >>>>> hrtimer_start_range_ns(&i915->pmu.timer, >>>>> ns_to_ktime(PERIOD), 0, >>>>> HRTIMER_MODE_REL_PINNED); >>>>> @@ -160,7 +163,7 @@ update_sample(struct i915_pmu_sample *sample, u32 unit, u32 val) >>>>> sample->cur += mul_u32_u32(val, unit); >>>>> } >>>>> >>>>> -static void engines_sample(struct drm_i915_private *dev_priv) >>>>> +static void engines_sample(struct drm_i915_private *dev_priv, u64 period) >>>>> { >>>>> struct intel_engine_cs *engine; >>>>> enum intel_engine_id id; >>>>> @@ -183,7 +186,7 @@ static void engines_sample(struct drm_i915_private *dev_priv) >>>>> val = !i915_seqno_passed(current_seqno, last_seqno); >>>>> >>>>> update_sample(&engine->pmu.sample[I915_SAMPLE_BUSY], >>>>> - PERIOD, val); >>>>> + period, val); >>>>> >>>>> if (val && (engine->pmu.enable & >>>>> (BIT(I915_SAMPLE_WAIT) | BIT(I915_SAMPLE_SEMA)))) { >>>>> @@ -195,10 +198,10 @@ static void engines_sample(struct drm_i915_private *dev_priv) >>>>> } >>>>> >>>>> update_sample(&engine->pmu.sample[I915_SAMPLE_WAIT], >>>>> - PERIOD, !!(val & RING_WAIT)); >>>>> + period, !!(val & RING_WAIT)); >>>>> >>>>> update_sample(&engine->pmu.sample[I915_SAMPLE_SEMA], >>>>> - PERIOD, !!(val & RING_WAIT_SEMAPHORE)); >>>>> + period, !!(val & RING_WAIT_SEMAPHORE)); >>>>> } >>>>> >>>>> if (fw) >>>>> @@ -207,10 +210,11 @@ static void engines_sample(struct drm_i915_private *dev_priv) >>>>> intel_runtime_pm_put(dev_priv); >>>>> } >>>>> >>>>> -static void frequency_sample(struct drm_i915_private *dev_priv) >>>>> +static void frequency_sample(struct drm_i915_private *dev_priv, u64 period) >>>>> { >>>>> if (dev_priv->pmu.enable & >>>>> - config_enabled_mask(I915_PMU_ACTUAL_FREQUENCY)) { >>>>> + (config_enabled_mask(I915_PMU_ACTUAL_CLOCK) | >>>>> + config_enabled_mask(I915_PMU_ACTUAL_FREQUENCY))) { >>>>> u32 val; >>>>> >>>>> val = dev_priv->gt_pm.rps.cur_freq; >>>>> @@ -223,13 +227,20 @@ static void frequency_sample(struct drm_i915_private *dev_priv) >>>>> >>>>> update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_FREQ_ACT], >>>>> 1, intel_gpu_freq(dev_priv, val)); >>>>> + update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_CLOCK_ACT], >>>>> + period, intel_gpu_freq(dev_priv, val)); >>>> >>>> Cache intel_gpu_freq in a local. >>>> >>>>> } >>>>> >>>>> if (dev_priv->pmu.enable & >>>>> - config_enabled_mask(I915_PMU_REQUESTED_FREQUENCY)) { >>>>> + (config_enabled_mask(I915_PMU_REQUESTED_CLOCK) | >>>>> + config_enabled_mask(I915_PMU_REQUESTED_FREQUENCY))) { >>>>> update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_FREQ_REQ], 1, >>>>> intel_gpu_freq(dev_priv, >>>>> dev_priv->gt_pm.rps.cur_freq)); >>>>> + update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_CLOCK_REQ], >>>>> + period, >>>>> + intel_gpu_freq(dev_priv, >>>>> + dev_priv->gt_pm.rps.cur_freq)); >>>> >>>> Same here. >>>> >>>>> } >>>>> } >>>>> >>>>> @@ -237,12 +248,17 @@ static enum hrtimer_restart i915_sample(struct hrtimer *hrtimer) >>>>> { >>>>> struct drm_i915_private *i915 = >>>>> container_of(hrtimer, struct drm_i915_private, pmu.timer); >>>>> + ktime_t now, period; >>>>> >>>>> if (!READ_ONCE(i915->pmu.timer_enabled)) >>>>> return HRTIMER_NORESTART; >>>>> >>>>> - engines_sample(i915); >>>>> - frequency_sample(i915); >>>>> + now = ktime_get(); >>>>> + period = ktime_sub(now, i915->pmu.timestamp); >>>>> + i915->pmu.timestamp = now; >>>>> + >>>>> + engines_sample(i915, period); >>>>> + frequency_sample(i915, period); >>>>> >>>>> hrtimer_forward_now(hrtimer, ns_to_ktime(PERIOD)); >>>>> return HRTIMER_RESTART; >>>>> @@ -313,11 +329,13 @@ config_status(struct drm_i915_private *i915, u64 config) >>>>> { >>>>> switch (config) { >>>>> case I915_PMU_ACTUAL_FREQUENCY: >>>>> + case I915_PMU_ACTUAL_CLOCK: >>>>> if (IS_VALLEYVIEW(i915) || IS_CHERRYVIEW(i915)) >>>>> /* Requires a mutex for sampling! */ >>>>> return -ENODEV; >>>>> /* Fall-through. */ >>>>> case I915_PMU_REQUESTED_FREQUENCY: >>>>> + case I915_PMU_REQUESTED_CLOCK: >>>>> if (INTEL_GEN(i915) < 6) >>>>> return -ENODEV; >>>>> break; >>>>> @@ -526,6 +544,12 @@ static u64 __i915_pmu_event_read(struct perf_event *event) >>>>> div_u64(i915->pmu.sample[__I915_SAMPLE_FREQ_REQ].cur, >>>>> FREQUENCY); >>>>> break; >>>>> + case I915_PMU_ACTUAL_CLOCK: >>>>> + val = i915->pmu.sample[__I915_SAMPLE_CLOCK_ACT].cur; >>>>> + break; >>>>> + case I915_PMU_REQUESTED_CLOCK: >>>>> + val = i915->pmu.sample[__I915_SAMPLE_CLOCK_REQ].cur; >>>>> + break; >>>>> case I915_PMU_INTERRUPTS: >>>>> val = count_interrupts(i915); >>>>> break; >>>>> @@ -803,6 +827,8 @@ create_event_attributes(struct drm_i915_private *i915) >>>>> __event(I915_PMU_REQUESTED_FREQUENCY, "requested-frequency", "MHz"), >>>>> __event(I915_PMU_INTERRUPTS, "interrupts", NULL), >>>>> __event(I915_PMU_RC6_RESIDENCY, "rc6-residency", "ns"), >>>>> + __event(I915_PMU_ACTUAL_CLOCK, "actual-clock", "Mcycles"), >>>>> + __event(I915_PMU_REQUESTED_CLOCK, "requested-clock", "Mcycles"), >>>>> }; >>>>> static const struct { >>>>> enum drm_i915_pmu_engine_sample sample; >>>>> diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h >>>>> index 2ba735299f7c..9c4252d85b5e 100644 >>>>> --- a/drivers/gpu/drm/i915/i915_pmu.h >>>>> +++ b/drivers/gpu/drm/i915/i915_pmu.h >>>>> @@ -17,6 +17,8 @@ struct drm_i915_private; >>>>> enum { >>>>> __I915_SAMPLE_FREQ_ACT = 0, >>>>> __I915_SAMPLE_FREQ_REQ, >>>>> + __I915_SAMPLE_CLOCK_ACT, >>>>> + __I915_SAMPLE_CLOCK_REQ, >>>>> __I915_SAMPLE_RC6, >>>>> __I915_SAMPLE_RC6_ESTIMATED, >>>>> __I915_NUM_PMU_SAMPLERS >>>>> @@ -52,6 +54,10 @@ struct i915_pmu { >>>>> * @timer: Timer for internal i915 PMU sampling. >>>>> */ >>>>> struct hrtimer timer; >>>>> + /** >>>>> + * @timestamp: Timestamp of last internal i915 PMU sampling. >>>>> + */ >>>>> + ktime_t timestamp; >>>>> /** >>>>> * @enable: Bitmask of all currently enabled events. >>>>> * >>>>> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h >>>>> index 7f5634ce8e88..61ab71986274 100644 >>>>> --- a/include/uapi/drm/i915_drm.h >>>>> +++ b/include/uapi/drm/i915_drm.h >>>>> @@ -139,6 +139,8 @@ enum drm_i915_pmu_engine_sample { >>>>> #define I915_PMU_REQUESTED_FREQUENCY __I915_PMU_OTHER(1) >>>>> #define I915_PMU_INTERRUPTS __I915_PMU_OTHER(2) >>>>> #define I915_PMU_RC6_RESIDENCY __I915_PMU_OTHER(3) >>>>> +#define I915_PMU_ACTUAL_CLOCK __I915_PMU_OTHER(4) >>>>> +#define I915_PMU_REQUESTED_CLOCK __I915_PMU_OTHER(5) >>>>> >>>>> #define I915_PMU_LAST I915_PMU_RC6_RESIDENCY >>>> >>>> Bump this one. >>>> >>>> I want to know if we could get away without introducing a new pair of >>>> counter. >>> >>> I wouldn't mind having my cycle counters back in the api ;) >>> They just seem to be easier for me to work with in userspace. >>> >>> Or do you mean if we can just redefine the existing sampler? >> >> Keep the existing semantics but improve implementation sufficiently so >> they survive the IVB hrtimer problem. > > (Now that I've started, bring back the cycle counters!) I don't know when we had them. In some older version of your PMU prototype? Why they are so useful? >>>> For instance would running average of a period do for frequency >>>> readout? It depends on what kind of error we are facing. Or a moving >>>> average for some period? I would explore that but don't have an >>>> Ivybridge so could you have a look? >>> >>> The crucial part is that we don't define the period, the user does. >>> >>> Heh, once again we have two different ideas about what we want to >>> measure :) I'ld take cycles, with your suggestion we may as well do >>> instantaneous frequency and sample from within perf_event_read (no >>> averaging, or just a short ewma)? >> For me the key question is how unstable is the IVB clock? Is it random >> jitter and by how much, or what. If we keep x ms moving average from >> frequency_sample, and use that to convert to Hz on the output, would it >> be good enough? > > We don't have that sort of test in perf_pmu. The closest we have was for > gem_ctx_freq, and there the frequency sampler was not very accurate > those (-100/+100 tolerances were not for thermal throttling). We can try > doing the same sawtooths -- just the challenge of systematic errors in > both the timer, the rps worker and the hw. > >> To me it is preferable to adding new counters. Especially if the error >> is so small that no one notices _and_ only on IVB. > > I don't think it's fair to say that only IVB has a problem with the > hrtimer, it's just where it's most visible. Or to rule it out being a I agree it is a good idea in general to measure the period but am still thinking if we can avoid adding new counters. > problem for the future. The code is using hrtimer_forward() so it > already assumes it can and will miss samples :) Hm yeah, we are drifting by however long it takes to calculate our stuff. It can be improved by taking time at the beginning of the callback and forwarding the timer relative to that (hrtimer_forward). I wonder if that would help on IVB. I'll check if it makes a difference on SKL for existing accuracy tests. But I never considered a complete missed sample. I'll trybot a patch to see if it happens anywhere. Regard, Tvrtko
diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index dc87797db500..12033e47e3b4 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -86,6 +86,8 @@ static bool pmu_needs_timer(struct drm_i915_private *i915, bool gpu_active) */ enable &= config_enabled_mask(I915_PMU_ACTUAL_FREQUENCY) | config_enabled_mask(I915_PMU_REQUESTED_FREQUENCY) | + config_enabled_mask(I915_PMU_ACTUAL_CLOCK) | + config_enabled_mask(I915_PMU_REQUESTED_CLOCK) | ENGINE_SAMPLE_MASK; /* @@ -127,6 +129,7 @@ static void __i915_pmu_maybe_start_timer(struct drm_i915_private *i915) { if (!i915->pmu.timer_enabled && pmu_needs_timer(i915, true)) { i915->pmu.timer_enabled = true; + i915->pmu.timestamp = ktime_get(); hrtimer_start_range_ns(&i915->pmu.timer, ns_to_ktime(PERIOD), 0, HRTIMER_MODE_REL_PINNED); @@ -160,7 +163,7 @@ update_sample(struct i915_pmu_sample *sample, u32 unit, u32 val) sample->cur += mul_u32_u32(val, unit); } -static void engines_sample(struct drm_i915_private *dev_priv) +static void engines_sample(struct drm_i915_private *dev_priv, u64 period) { struct intel_engine_cs *engine; enum intel_engine_id id; @@ -183,7 +186,7 @@ static void engines_sample(struct drm_i915_private *dev_priv) val = !i915_seqno_passed(current_seqno, last_seqno); update_sample(&engine->pmu.sample[I915_SAMPLE_BUSY], - PERIOD, val); + period, val); if (val && (engine->pmu.enable & (BIT(I915_SAMPLE_WAIT) | BIT(I915_SAMPLE_SEMA)))) { @@ -195,10 +198,10 @@ static void engines_sample(struct drm_i915_private *dev_priv) } update_sample(&engine->pmu.sample[I915_SAMPLE_WAIT], - PERIOD, !!(val & RING_WAIT)); + period, !!(val & RING_WAIT)); update_sample(&engine->pmu.sample[I915_SAMPLE_SEMA], - PERIOD, !!(val & RING_WAIT_SEMAPHORE)); + period, !!(val & RING_WAIT_SEMAPHORE)); } if (fw) @@ -207,10 +210,11 @@ static void engines_sample(struct drm_i915_private *dev_priv) intel_runtime_pm_put(dev_priv); } -static void frequency_sample(struct drm_i915_private *dev_priv) +static void frequency_sample(struct drm_i915_private *dev_priv, u64 period) { if (dev_priv->pmu.enable & - config_enabled_mask(I915_PMU_ACTUAL_FREQUENCY)) { + (config_enabled_mask(I915_PMU_ACTUAL_CLOCK) | + config_enabled_mask(I915_PMU_ACTUAL_FREQUENCY))) { u32 val; val = dev_priv->gt_pm.rps.cur_freq; @@ -223,13 +227,20 @@ static void frequency_sample(struct drm_i915_private *dev_priv) update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_FREQ_ACT], 1, intel_gpu_freq(dev_priv, val)); + update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_CLOCK_ACT], + period, intel_gpu_freq(dev_priv, val)); } if (dev_priv->pmu.enable & - config_enabled_mask(I915_PMU_REQUESTED_FREQUENCY)) { + (config_enabled_mask(I915_PMU_REQUESTED_CLOCK) | + config_enabled_mask(I915_PMU_REQUESTED_FREQUENCY))) { update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_FREQ_REQ], 1, intel_gpu_freq(dev_priv, dev_priv->gt_pm.rps.cur_freq)); + update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_CLOCK_REQ], + period, + intel_gpu_freq(dev_priv, + dev_priv->gt_pm.rps.cur_freq)); } } @@ -237,12 +248,17 @@ static enum hrtimer_restart i915_sample(struct hrtimer *hrtimer) { struct drm_i915_private *i915 = container_of(hrtimer, struct drm_i915_private, pmu.timer); + ktime_t now, period; if (!READ_ONCE(i915->pmu.timer_enabled)) return HRTIMER_NORESTART; - engines_sample(i915); - frequency_sample(i915); + now = ktime_get(); + period = ktime_sub(now, i915->pmu.timestamp); + i915->pmu.timestamp = now; + + engines_sample(i915, period); + frequency_sample(i915, period); hrtimer_forward_now(hrtimer, ns_to_ktime(PERIOD)); return HRTIMER_RESTART; @@ -313,11 +329,13 @@ config_status(struct drm_i915_private *i915, u64 config) { switch (config) { case I915_PMU_ACTUAL_FREQUENCY: + case I915_PMU_ACTUAL_CLOCK: if (IS_VALLEYVIEW(i915) || IS_CHERRYVIEW(i915)) /* Requires a mutex for sampling! */ return -ENODEV; /* Fall-through. */ case I915_PMU_REQUESTED_FREQUENCY: + case I915_PMU_REQUESTED_CLOCK: if (INTEL_GEN(i915) < 6) return -ENODEV; break; @@ -526,6 +544,12 @@ static u64 __i915_pmu_event_read(struct perf_event *event) div_u64(i915->pmu.sample[__I915_SAMPLE_FREQ_REQ].cur, FREQUENCY); break; + case I915_PMU_ACTUAL_CLOCK: + val = i915->pmu.sample[__I915_SAMPLE_CLOCK_ACT].cur; + break; + case I915_PMU_REQUESTED_CLOCK: + val = i915->pmu.sample[__I915_SAMPLE_CLOCK_REQ].cur; + break; case I915_PMU_INTERRUPTS: val = count_interrupts(i915); break; @@ -803,6 +827,8 @@ create_event_attributes(struct drm_i915_private *i915) __event(I915_PMU_REQUESTED_FREQUENCY, "requested-frequency", "MHz"), __event(I915_PMU_INTERRUPTS, "interrupts", NULL), __event(I915_PMU_RC6_RESIDENCY, "rc6-residency", "ns"), + __event(I915_PMU_ACTUAL_CLOCK, "actual-clock", "Mcycles"), + __event(I915_PMU_REQUESTED_CLOCK, "requested-clock", "Mcycles"), }; static const struct { enum drm_i915_pmu_engine_sample sample; diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h index 2ba735299f7c..9c4252d85b5e 100644 --- a/drivers/gpu/drm/i915/i915_pmu.h +++ b/drivers/gpu/drm/i915/i915_pmu.h @@ -17,6 +17,8 @@ struct drm_i915_private; enum { __I915_SAMPLE_FREQ_ACT = 0, __I915_SAMPLE_FREQ_REQ, + __I915_SAMPLE_CLOCK_ACT, + __I915_SAMPLE_CLOCK_REQ, __I915_SAMPLE_RC6, __I915_SAMPLE_RC6_ESTIMATED, __I915_NUM_PMU_SAMPLERS @@ -52,6 +54,10 @@ struct i915_pmu { * @timer: Timer for internal i915 PMU sampling. */ struct hrtimer timer; + /** + * @timestamp: Timestamp of last internal i915 PMU sampling. + */ + ktime_t timestamp; /** * @enable: Bitmask of all currently enabled events. * diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 7f5634ce8e88..61ab71986274 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -139,6 +139,8 @@ enum drm_i915_pmu_engine_sample { #define I915_PMU_REQUESTED_FREQUENCY __I915_PMU_OTHER(1) #define I915_PMU_INTERRUPTS __I915_PMU_OTHER(2) #define I915_PMU_RC6_RESIDENCY __I915_PMU_OTHER(3) +#define I915_PMU_ACTUAL_CLOCK __I915_PMU_OTHER(4) +#define I915_PMU_REQUESTED_CLOCK __I915_PMU_OTHER(5) #define I915_PMU_LAST I915_PMU_RC6_RESIDENCY
hrtimer is not reliable enough to assume fixed intervals, and so even coarse accuracy (in the face of kasan and similar heavy debugging) we need to measure the actual interval between sample. While using a single timestamp to compute the interval does not allow very fine accuracy (consider the impact of a slow forcewake between different samples after the timestamp is read) is much better than assuming the interval. v2: Make use of the sample period for estimating the GPU clock cycles, leaving the frequency calculation (the averaging) to the caller. Introduce new samplers for reporting cycles instead of Hz. Testcase: igt/perf_pmu #ivb Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> --- drivers/gpu/drm/i915/i915_pmu.c | 44 ++++++++++++++++++++++++++------- drivers/gpu/drm/i915/i915_pmu.h | 6 +++++ include/uapi/drm/i915_drm.h | 2 ++ 3 files changed, 43 insertions(+), 9 deletions(-)