Message ID | 20170929123500.3186-7-tvrtko.ursulin@linux.intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 29/09/2017 13:34, Tvrtko Ursulin wrote: > From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > > We can use engine busy stats instead of the sampling timer for > better accuracy. > > By doing this we replace the stochastic sampling with busyness > metric derived directly from engine activity. This is context > switch interrupt driven, so as accurate as we can get from > software tracking. To quantify the precision benefit with a graph: https://people.freedesktop.org/~tursulin/sampling-vs-busy-stats.png This represents the render engine busyness on neverball menu screen looping a few times. PMU sampling interval is 100ms. It was generated from a special build with both PMU counters running in parallel so the data is absolutely aligned. Anomaly of busyness going over 100% is caused by me not bothering to use the actual timestamps got from PMU, but using fixed interval (perf stat -I 100). But I don't think it matters for the rough comparison and looking at the noise and error in the sampling version. > As a secondary benefit, we can also not run the sampling timer > in cases only busyness metric is enabled. For this one data is not showing anything significant. I've seen the i915_sample function use <0.1% of CPU time but that's it. Probably with the original version which used MMIO it was much worse. Regards, Tvrtko > v2: Rebase. > v3: > * Rebase, comments. > * Leave engine busyness controls out of workers. > v4: Checkpatch cleanup. > v5: Added comment to pmu_needs_timer change. > v6: > * Rebase. > * Fix style of some comments. (Chris Wilson) > v7: Rebase and commit message update. 
(Chris Wilson) > > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> > --- > drivers/gpu/drm/i915/i915_pmu.c | 37 +++++++++++++++++++++++++++++++-- > drivers/gpu/drm/i915/intel_ringbuffer.h | 5 +++++ > 2 files changed, 40 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c > index f341c904c159..93c0e7ec7d75 100644 > --- a/drivers/gpu/drm/i915/i915_pmu.c > +++ b/drivers/gpu/drm/i915/i915_pmu.c > @@ -90,6 +90,11 @@ static unsigned int event_enabled_bit(struct perf_event *event) > return config_enabled_bit(event->attr.config); > } > > +static bool supports_busy_stats(void) > +{ > + return i915_modparams.enable_execlists; > +} > + > static bool pmu_needs_timer(struct drm_i915_private *i915, bool gpu_active) > { > u64 enable; > @@ -115,6 +120,12 @@ static bool pmu_needs_timer(struct drm_i915_private *i915, bool gpu_active) > */ > if (!gpu_active) > enable &= ~ENGINE_SAMPLE_MASK; > + /* > + * Also there is software busyness tracking available we do not > + * need the timer for I915_SAMPLE_BUSY counter. > + */ > + else if (supports_busy_stats()) > + enable &= ~BIT(I915_SAMPLE_BUSY); > > /* > * If some bits remain it means we need the sampling timer running. 
> @@ -362,6 +373,9 @@ static u64 __i915_pmu_event_read(struct perf_event *event) > > if (WARN_ON_ONCE(!engine)) { > /* Do nothing */ > + } else if (sample == I915_SAMPLE_BUSY && > + engine->pmu.busy_stats) { > + val = ktime_to_ns(intel_engine_get_busy_time(engine)); > } else { > val = engine->pmu.sample[sample].cur; > } > @@ -398,6 +412,12 @@ static void i915_pmu_event_read(struct perf_event *event) > local64_add(new - prev, &event->count); > } > > +static bool engine_needs_busy_stats(struct intel_engine_cs *engine) > +{ > + return supports_busy_stats() && > + (engine->pmu.enable & BIT(I915_SAMPLE_BUSY)); > +} > + > static void i915_pmu_enable(struct perf_event *event) > { > struct drm_i915_private *i915 = > @@ -437,7 +457,14 @@ static void i915_pmu_enable(struct perf_event *event) > > GEM_BUG_ON(sample >= I915_PMU_SAMPLE_BITS); > GEM_BUG_ON(engine->pmu.enable_count[sample] == ~0); > - engine->pmu.enable_count[sample]++; > + if (engine->pmu.enable_count[sample]++ == 0) { > + if (engine_needs_busy_stats(engine) && > + !engine->pmu.busy_stats) { > + engine->pmu.busy_stats = > + intel_enable_engine_stats(engine) == 0; > + WARN_ON_ONCE(!engine->pmu.busy_stats); > + } > + } > } > > /* > @@ -473,8 +500,14 @@ static void i915_pmu_disable(struct perf_event *event) > * Decrement the reference count and clear the enabled > * bitmask when the last listener on an event goes away. 
> */ > - if (--engine->pmu.enable_count[sample] == 0) > + if (--engine->pmu.enable_count[sample] == 0) { > engine->pmu.enable &= ~BIT(sample); > + if (!engine_needs_busy_stats(engine) && > + engine->pmu.busy_stats) { > + engine->pmu.busy_stats = false; > + intel_disable_engine_stats(engine); > + } > + } > } > > GEM_BUG_ON(bit >= I915_PMU_MASK_BITS); > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h > index ee738c7694e5..0f8ccb243407 100644 > --- a/drivers/gpu/drm/i915/intel_ringbuffer.h > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h > @@ -351,6 +351,11 @@ struct intel_engine_cs { > * Our internal timer stores the current counters in this field. > */ > struct i915_pmu_sample sample[I915_ENGINE_SAMPLE_MAX]; > + /** > + * @busy_stats: Has enablement of engine stats tracking been > + * requested. > + */ > + bool busy_stats; > } pmu; > > /* >
Nice data, thank you! -----Original Message----- From: Tvrtko Ursulin [mailto:tvrtko.ursulin@linux.intel.com] Sent: Wednesday, October 4, 2017 10:35 AM To: Tvrtko Ursulin <tursulin@ursulin.net>; Intel-gfx@lists.freedesktop.org Cc: Chris Wilson <chris@chris-wilson.co.uk>; Rogozhkin, Dmitry V <dmitry.v.rogozhkin@intel.com> Subject: Re: [Intel-gfx] [PATCH 06/10] drm/i915/pmu: Wire up engine busy stats to PMU On 29/09/2017 13:34, Tvrtko Ursulin wrote: > From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > > We can use engine busy stats instead of the sampling timer for better > accuracy. > > By doing this we replace the stochastic sampling with busyness metric > derived directly from engine activity. This is context switch > interrupt driven, so as accurate as we can get from software tracking. To quantify the precision benefit with a graph: https://people.freedesktop.org/~tursulin/sampling-vs-busy-stats.png This represents the render engine busyness on neverball menu screen looping a few times. PMU sampling interval is 100ms. It was generated from a special build with both PMU counters running in parallel so the data is absolutely aligned. Anomaly of busyness going over 100% is caused by me not bothering to use the actual timestamps got from PMU, but using fixed interval (perf stat -I 100). But I don't think it matters for the rough comparison and looking at the noise and error in the sampling version. > As a secondary benefit, we can also not run the sampling timer in > cases only busyness metric is enabled. For this one data is not showing anything significant. I've seen the i915_sample function use <0.1% of CPU time but that's it. Probably with the original version which used MMIO it was much worse. Regards, Tvrtko > v2: Rebase. > v3: > * Rebase, comments. > * Leave engine busyness controls out of workers. > v4: Checkpatch cleanup. > v5: Added comment to pmu_needs_timer change. > v6: > * Rebase. > * Fix style of some comments. 
(Chris Wilson) > v7: Rebase and commit message update. (Chris Wilson) > > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> > --- > drivers/gpu/drm/i915/i915_pmu.c | 37 +++++++++++++++++++++++++++++++-- > drivers/gpu/drm/i915/intel_ringbuffer.h | 5 +++++ > 2 files changed, 40 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_pmu.c > b/drivers/gpu/drm/i915/i915_pmu.c index f341c904c159..93c0e7ec7d75 > 100644 > --- a/drivers/gpu/drm/i915/i915_pmu.c > +++ b/drivers/gpu/drm/i915/i915_pmu.c > @@ -90,6 +90,11 @@ static unsigned int event_enabled_bit(struct perf_event *event) > return config_enabled_bit(event->attr.config); > } > > +static bool supports_busy_stats(void) { > + return i915_modparams.enable_execlists; } > + > static bool pmu_needs_timer(struct drm_i915_private *i915, bool gpu_active) > { > u64 enable; > @@ -115,6 +120,12 @@ static bool pmu_needs_timer(struct drm_i915_private *i915, bool gpu_active) > */ > if (!gpu_active) > enable &= ~ENGINE_SAMPLE_MASK; > + /* > + * Also there is software busyness tracking available we do not > + * need the timer for I915_SAMPLE_BUSY counter. > + */ > + else if (supports_busy_stats()) > + enable &= ~BIT(I915_SAMPLE_BUSY); > > /* > * If some bits remain it means we need the sampling timer running. 
> @@ -362,6 +373,9 @@ static u64 __i915_pmu_event_read(struct perf_event > *event) > > if (WARN_ON_ONCE(!engine)) { > /* Do nothing */ > + } else if (sample == I915_SAMPLE_BUSY && > + engine->pmu.busy_stats) { > + val = ktime_to_ns(intel_engine_get_busy_time(engine)); > } else { > val = engine->pmu.sample[sample].cur; > } > @@ -398,6 +412,12 @@ static void i915_pmu_event_read(struct perf_event *event) > local64_add(new - prev, &event->count); > } > > +static bool engine_needs_busy_stats(struct intel_engine_cs *engine) { > + return supports_busy_stats() && > + (engine->pmu.enable & BIT(I915_SAMPLE_BUSY)); } > + > static void i915_pmu_enable(struct perf_event *event) > { > struct drm_i915_private *i915 = > @@ -437,7 +457,14 @@ static void i915_pmu_enable(struct perf_event > *event) > > GEM_BUG_ON(sample >= I915_PMU_SAMPLE_BITS); > GEM_BUG_ON(engine->pmu.enable_count[sample] == ~0); > - engine->pmu.enable_count[sample]++; > + if (engine->pmu.enable_count[sample]++ == 0) { > + if (engine_needs_busy_stats(engine) && > + !engine->pmu.busy_stats) { > + engine->pmu.busy_stats = > + intel_enable_engine_stats(engine) == 0; > + WARN_ON_ONCE(!engine->pmu.busy_stats); > + } > + } > } > > /* > @@ -473,8 +500,14 @@ static void i915_pmu_disable(struct perf_event *event) > * Decrement the reference count and clear the enabled > * bitmask when the last listener on an event goes away. 
> */ > - if (--engine->pmu.enable_count[sample] == 0) > + if (--engine->pmu.enable_count[sample] == 0) { > engine->pmu.enable &= ~BIT(sample); > + if (!engine_needs_busy_stats(engine) && > + engine->pmu.busy_stats) { > + engine->pmu.busy_stats = false; > + intel_disable_engine_stats(engine); > + } > + } > } > > GEM_BUG_ON(bit >= I915_PMU_MASK_BITS); diff --git > a/drivers/gpu/drm/i915/intel_ringbuffer.h > b/drivers/gpu/drm/i915/intel_ringbuffer.h > index ee738c7694e5..0f8ccb243407 100644 > --- a/drivers/gpu/drm/i915/intel_ringbuffer.h > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h > @@ -351,6 +351,11 @@ struct intel_engine_cs { > * Our internal timer stores the current counters in this field. > */ > struct i915_pmu_sample sample[I915_ENGINE_SAMPLE_MAX]; > + /** > + * @busy_stats: Has enablement of engine stats tracking been > + * requested. > + */ > + bool busy_stats; > } pmu; > > /* >
diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index f341c904c159..93c0e7ec7d75 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -90,6 +90,11 @@ static unsigned int event_enabled_bit(struct perf_event *event) return config_enabled_bit(event->attr.config); } +static bool supports_busy_stats(void) +{ + return i915_modparams.enable_execlists; +} + static bool pmu_needs_timer(struct drm_i915_private *i915, bool gpu_active) { u64 enable; @@ -115,6 +120,12 @@ static bool pmu_needs_timer(struct drm_i915_private *i915, bool gpu_active) */ if (!gpu_active) enable &= ~ENGINE_SAMPLE_MASK; + /* + * Also there is software busyness tracking available we do not + * need the timer for I915_SAMPLE_BUSY counter. + */ + else if (supports_busy_stats()) + enable &= ~BIT(I915_SAMPLE_BUSY); /* * If some bits remain it means we need the sampling timer running. @@ -362,6 +373,9 @@ static u64 __i915_pmu_event_read(struct perf_event *event) if (WARN_ON_ONCE(!engine)) { /* Do nothing */ + } else if (sample == I915_SAMPLE_BUSY && + engine->pmu.busy_stats) { + val = ktime_to_ns(intel_engine_get_busy_time(engine)); } else { val = engine->pmu.sample[sample].cur; } @@ -398,6 +412,12 @@ static void i915_pmu_event_read(struct perf_event *event) local64_add(new - prev, &event->count); } +static bool engine_needs_busy_stats(struct intel_engine_cs *engine) +{ + return supports_busy_stats() && + (engine->pmu.enable & BIT(I915_SAMPLE_BUSY)); +} + static void i915_pmu_enable(struct perf_event *event) { struct drm_i915_private *i915 = @@ -437,7 +457,14 @@ static void i915_pmu_enable(struct perf_event *event) GEM_BUG_ON(sample >= I915_PMU_SAMPLE_BITS); GEM_BUG_ON(engine->pmu.enable_count[sample] == ~0); - engine->pmu.enable_count[sample]++; + if (engine->pmu.enable_count[sample]++ == 0) { + if (engine_needs_busy_stats(engine) && + !engine->pmu.busy_stats) { + engine->pmu.busy_stats = + intel_enable_engine_stats(engine) == 0; + 
WARN_ON_ONCE(!engine->pmu.busy_stats); + } + } } /* @@ -473,8 +500,14 @@ static void i915_pmu_disable(struct perf_event *event) * Decrement the reference count and clear the enabled * bitmask when the last listener on an event goes away. */ - if (--engine->pmu.enable_count[sample] == 0) + if (--engine->pmu.enable_count[sample] == 0) { engine->pmu.enable &= ~BIT(sample); + if (!engine_needs_busy_stats(engine) && + engine->pmu.busy_stats) { + engine->pmu.busy_stats = false; + intel_disable_engine_stats(engine); + } + } } GEM_BUG_ON(bit >= I915_PMU_MASK_BITS); diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index ee738c7694e5..0f8ccb243407 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -351,6 +351,11 @@ struct intel_engine_cs { * Our internal timer stores the current counters in this field. */ struct i915_pmu_sample sample[I915_ENGINE_SAMPLE_MAX]; + /** + * @busy_stats: Has enablement of engine stats tracking been + * requested. + */ + bool busy_stats; } pmu; /*