Message ID | 20230922222510.2235213-4-John.C.Harrison@Intel.com (mailing list archive)
---|---
State | New, archived
Series | Engine busyness v2
On 22/09/2023 23:25, John.C.Harrison@Intel.com wrote: > From: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com> > > In new version of GuC engine busyness, GuC provides engine busyness > ticks as a 64 bit counter. Add a new counter to relay this value to the > user as is. > > Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com> > Signed-off-by: John Harrison <John.C.Harrison@Intel.com> > --- > drivers/gpu/drm/i915/gt/intel_engine.h | 1 + > drivers/gpu/drm/i915/gt/intel_engine_cs.c | 16 +++++ > drivers/gpu/drm/i915/gt/intel_engine_types.h | 12 ++++ > drivers/gpu/drm/i915/gt/intel_engine_user.c | 1 + > .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 67 ++++++++++++++----- > drivers/gpu/drm/i915/i915_pmu.c | 25 ++++++- > drivers/gpu/drm/i915/i915_pmu.h | 2 +- > include/uapi/drm/i915_drm.h | 13 +++- > 8 files changed, 116 insertions(+), 21 deletions(-) > > diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h > index b58c30ac8ef02..57af7ec8ecd82 100644 > --- a/drivers/gpu/drm/i915/gt/intel_engine.h > +++ b/drivers/gpu/drm/i915/gt/intel_engine.h > @@ -249,6 +249,7 @@ void intel_engine_dump_active_requests(struct list_head *requests, > > ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, > ktime_t *now); > +u64 intel_engine_get_busy_ticks(struct intel_engine_cs *engine); > > void intel_engine_get_hung_entity(struct intel_engine_cs *engine, > struct intel_context **ce, struct i915_request **rq); > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c > index 84a75c95f3f7d..1c9ffb1ae9889 100644 > --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c > +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c > @@ -2426,6 +2426,22 @@ ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now) > return engine->busyness(engine, now); > } > > +/** > + * intel_engine_get_busy_ticks() - Return current accumulated engine busyness > + * ticks > + * @engine: engine to report on > + * > + * Returns accumulated ticks @engine was busy since engine stats were enabled. 
> + */ > +u64 intel_engine_get_busy_ticks(struct intel_engine_cs *engine) > +{ > + if (!engine->busyness_ticks || > + !(engine->flags & I915_ENGINE_SUPPORTS_TICKS_STATS)) > + return 0; > + > + return engine->busyness_ticks(engine); > +} > + > struct intel_context * > intel_engine_create_virtual(struct intel_engine_cs **siblings, > unsigned int count, unsigned long flags) > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h > index 40fd8f984d64b..a88d40c74d604 100644 > --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h > +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h > @@ -548,6 +548,11 @@ struct intel_engine_cs { > ktime_t (*busyness)(struct intel_engine_cs *engine, > ktime_t *now); > > + /* > + * Get engine busyness ticks > + */ > + u64 (*busyness_ticks)(struct intel_engine_cs *engine); > + > struct intel_engine_execlists execlists; > > /* > @@ -574,6 +579,7 @@ struct intel_engine_cs { > #define I915_ENGINE_HAS_EU_PRIORITY BIT(10) > #define I915_ENGINE_FIRST_RENDER_COMPUTE BIT(11) > #define I915_ENGINE_USES_WA_HOLD_CCS_SWITCHOUT BIT(12) > +#define I915_ENGINE_SUPPORTS_TICKS_STATS BIT(13) > unsigned int flags; > > /* > @@ -649,6 +655,12 @@ intel_engine_supports_stats(const struct intel_engine_cs *engine) > return engine->flags & I915_ENGINE_SUPPORTS_STATS; > } > > +static inline bool > +intel_engine_supports_tick_stats(const struct intel_engine_cs *engine) > +{ > + return engine->flags & I915_ENGINE_SUPPORTS_TICKS_STATS; > +} > + > static inline bool > intel_engine_has_preemption(const struct intel_engine_cs *engine) > { > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c b/drivers/gpu/drm/i915/gt/intel_engine_user.c > index dcedff41a825f..69eb610b5ab0a 100644 > --- a/drivers/gpu/drm/i915/gt/intel_engine_user.c > +++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c > @@ -100,6 +100,7 @@ static void set_scheduler_caps(struct drm_i915_private *i915) > MAP(HAS_PREEMPTION, PREEMPTION), > MAP(HAS_SEMAPHORES, SEMAPHORES), > MAP(SUPPORTS_STATS, ENGINE_BUSY_STATS), > + MAP(SUPPORTS_TICKS_STATS, ENGINE_BUSY_TICKS_STATS), > #undef MAP > }; > struct intel_engine_cs *engine; > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c > index 0c1fee5360777..71749fb9ad35b 100644 > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c > @@ -1289,12 +1289,7 @@ static void busy_v1_guc_update_pm_timestamp(struct intel_guc *guc, ktime_t *now) > guc->busy.v1.gt_stamp = ((u64)gt_stamp_hi << 32) | gt_stamp_lo; > } > > -/* > - * Unlike the execlist mode of submission total and active times are in terms of > - * gt clocks. The *now parameter is retained to return the cpu time at which the > - * busyness was sampled. 
> - */ > -static ktime_t busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now) > +static u64 __busy_v1_guc_engine_busyness_ticks(struct intel_engine_cs *engine, ktime_t *now_out) > { > struct intel_engine_guc_stats_v1 stats_saved, *stats = &engine->stats.guc_v1; > struct i915_gpu_error *gpu_error = &engine->i915->gpu_error; > @@ -1304,6 +1299,7 @@ static ktime_t busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime > unsigned long flags; > u32 reset_count; > bool in_reset; > + ktime_t now; > > spin_lock_irqsave(&guc->busy.v1.lock, flags); > > @@ -1318,7 +1314,7 @@ static ktime_t busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime > reset_count = i915_reset_count(gpu_error); > in_reset = test_bit(I915_RESET_BACKOFF, >->reset.flags); > > - *now = ktime_get(); > + now = ktime_get(); > > /* > * The active busyness depends on start_gt_clk and gt_stamp. > @@ -1334,7 +1330,7 @@ static ktime_t busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime > * start_gt_clk' calculation below for active engines. > */ > busy_v1_guc_update_engine_gt_clks(engine); > - busy_v1_guc_update_pm_timestamp(guc, now); > + busy_v1_guc_update_pm_timestamp(guc, &now); > intel_gt_pm_put_async(gt); > if (i915_reset_count(gpu_error) != reset_count) { > *stats = stats_saved; > @@ -1342,16 +1338,37 @@ static ktime_t busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime > } > } > > - total = intel_gt_clock_interval_to_ns(gt, stats->total_gt_clks); > + total = stats->total_gt_clks; > if (stats->running) { > u64 clk = guc->busy.v1.gt_stamp - stats->start_gt_clk; > > - total += intel_gt_clock_interval_to_ns(gt, clk); > + total += clk; > } > > spin_unlock_irqrestore(&guc->busy.v1.lock, flags); > > - return ns_to_ktime(total); > + if (now_out) > + *now_out = now; > + > + return total; > +} > + > +/* > + * Unlike the execlist mode of submission total and active times are in terms of > + * gt clocks. The *now parameter is retained to return the cpu time at which the > + * busyness was sampled. 
> + */ > +static ktime_t busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now) > +{ > + u64 ticks = __busy_v1_guc_engine_busyness_ticks(engine, now); > + u64 ns = intel_gt_clock_interval_to_ns(engine->gt, ticks); > + > + return ns_to_ktime(ns); > +} > + > +static u64 busy_v1_guc_engine_busyness_ticks(struct intel_engine_cs *engine) > +{ > + return __busy_v1_guc_engine_busyness_ticks(engine, NULL); > } > > static void busy_v1_guc_enable_worker(struct intel_guc *guc) > @@ -1607,6 +1624,16 @@ static ktime_t busy_v2_guc_engine_busyness(struct intel_engine_cs *engine, ktime > return ns_to_ktime(total); > } > > +static u64 busy_v2_guc_engine_busyness_ticks(struct intel_engine_cs *engine) > +{ > + struct intel_guc *guc = &engine->gt->uc.guc; > + u64 ticks_engine; > + > + __busy_v2_get_engine_usage_record(guc, engine, &ticks_engine, NULL, NULL); > + > + return ticks_engine; > +} > + > static u64 busy_v1_intel_guc_total_active_ticks(struct intel_guc *guc) > { > return guc->busy.v1.gt_stamp; > @@ -4480,12 +4507,20 @@ static void guc_default_vfuncs(struct intel_engine_cs *engine) > engine->emit_flush = gen12_emit_flush_xcs; > } > engine->set_default_submission = guc_set_default_submission; > - if (GUC_SUBMIT_VER(&engine->gt->uc.guc) < MAKE_GUC_VER(1, 3, 1)) > - engine->busyness = busy_v1_guc_engine_busyness; > - else > - engine->busyness = busy_v2_guc_engine_busyness; > + if (GUC_SUBMIT_VER(&engine->gt->uc.guc) < MAKE_GUC_VER(1, 3, 1)) { > + if (GRAPHICS_VER_FULL(engine->i915) < IP_VER(12, 70)) > + engine->busyness = busy_v1_guc_engine_busyness; > + engine->busyness_ticks = busy_v1_guc_engine_busyness_ticks; > + } else { > + if (GRAPHICS_VER_FULL(engine->i915) < IP_VER(12, 70)) > + engine->busyness = busy_v2_guc_engine_busyness; > + engine->busyness_ticks = busy_v2_guc_engine_busyness_ticks; > + } > + > + if (engine->busyness) > + engine->flags |= I915_ENGINE_SUPPORTS_STATS; > > - engine->flags |= I915_ENGINE_SUPPORTS_STATS; > + engine->flags |= I915_ENGINE_SUPPORTS_TICKS_STATS; > engine->flags |= I915_ENGINE_HAS_PREEMPTION; > engine->flags |= I915_ENGINE_HAS_TIMESLICES; > > diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c > index 4f52636eb4a80..1b859556644f6 100644 > --- a/drivers/gpu/drm/i915/i915_pmu.c > +++ b/drivers/gpu/drm/i915/i915_pmu.c > @@ -26,7 +26,8 @@ > #define ENGINE_SAMPLE_MASK \ > (BIT(I915_SAMPLE_BUSY) | \ > BIT(I915_SAMPLE_WAIT) | \ > - BIT(I915_SAMPLE_SEMA)) > + BIT(I915_SAMPLE_SEMA) | \ > + BIT(I915_SAMPLE_BUSY_TICKS)) This mask is for counters which may need sampling and AFAIU the new method should not apply. > > static cpumask_t i915_pmu_cpumask; > static unsigned int i915_pmu_target_cpu = -1; > @@ -161,8 +162,11 @@ static bool pmu_needs_timer(struct i915_pmu *pmu) > * Also there is software busyness tracking available we do not > * need the timer for I915_SAMPLE_BUSY counter. > */ > - if (i915->caps.scheduler & I915_SCHEDULER_CAP_ENGINE_BUSY_STATS) > + if ((i915->caps.scheduler & I915_SCHEDULER_CAP_ENGINE_BUSY_STATS) || > + (i915->caps.scheduler & I915_SCHEDULER_CAP_ENGINE_BUSY_TICKS_STATS)) { > enable &= ~BIT(I915_SAMPLE_BUSY); > + enable &= ~BIT(I915_SAMPLE_BUSY_TICKS); > + } > > /* > * If some bits remain it means we need the sampling timer running. > @@ -362,7 +366,8 @@ static void engine_sample(struct intel_engine_cs *engine, unsigned int period_ns > add_sample(&pmu->sample[I915_SAMPLE_SEMA], period_ns); > > /* No need to sample when busy stats are supported. 
*/ > - if (intel_engine_supports_stats(engine)) > + if (intel_engine_supports_stats(engine) || > + intel_engine_supports_tick_stats(engine)) > return; > > /* > @@ -519,6 +524,13 @@ engine_event_status(struct intel_engine_cs *engine, > { > switch (sample) { > case I915_SAMPLE_BUSY: > + if (!intel_engine_supports_stats(engine)) > + return -ENODEV; > + break; Why is this changing the logic for the existing counter? It is not breaking it for all gen < 8? > + case I915_SAMPLE_BUSY_TICKS: > + if (!intel_engine_supports_tick_stats(engine)) > + return -ENODEV; > + break; > case I915_SAMPLE_WAIT: > break; > case I915_SAMPLE_SEMA: > @@ -651,6 +663,9 @@ static u64 __i915_pmu_event_read(struct perf_event *event) > > val = ktime_to_ns(intel_engine_get_busy_time(engine, > &unused)); > + } else if (sample == I915_SAMPLE_BUSY_TICKS && > + intel_engine_supports_tick_stats(engine)) { > + val = intel_engine_get_busy_ticks(engine); > } else { > val = engine->pmu.sample[sample].cur; > } > @@ -1000,6 +1015,7 @@ create_event_attributes(struct i915_pmu *pmu) > __engine_event(I915_SAMPLE_BUSY, "busy"), > __engine_event(I915_SAMPLE_SEMA, "sema"), > __engine_event(I915_SAMPLE_WAIT, "wait"), > + __engine_event(I915_SAMPLE_BUSY_TICKS, "busy-ticks"), > }; > unsigned int count = 0; > struct perf_pmu_events_attr *pmu_attr = NULL, *pmu_iter; > @@ -1103,6 +1119,9 @@ create_event_attributes(struct i915_pmu *pmu) > engine->uabi_instance, > engine_events[i].sample)); > > + if (engine_events[i].sample == I915_SAMPLE_BUSY_TICKS) > + continue; Okay, or add the unit explicitly in the descriptor array as global eventts do. Not much real difference. > + > str = kasprintf(GFP_KERNEL, "%s-%s.unit", > engine->name, engine_events[i].name); > if (!str) > diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h > index 41af038c37388..72a9c71944f8d 100644 > --- a/drivers/gpu/drm/i915/i915_pmu.h > +++ b/drivers/gpu/drm/i915/i915_pmu.h > @@ -49,7 +49,7 @@ enum { > (I915_ENGINE_SAMPLE_COUNT + \ > I915_PMU_MAX_GT * __I915_PMU_TRACKED_EVENT_COUNT) > > -#define I915_ENGINE_SAMPLE_COUNT (I915_SAMPLE_SEMA + 1) > +#define I915_ENGINE_SAMPLE_COUNT (I915_SAMPLE_BUSY_TICKS + 1) Are we sure the enabled status tracking is needed for a non-sampling event? Regards, Tvrtko > struct i915_pmu_sample { > u64 cur; > diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h > index e26dd27ff4a5f..8ae98c1bda0ea 100644 > --- a/include/uapi/drm/i915_drm.h > +++ b/include/uapi/drm/i915_drm.h > @@ -257,7 +257,8 @@ struct i915_engine_class_instance { > enum drm_i915_pmu_engine_sample { > I915_SAMPLE_BUSY = 0, > I915_SAMPLE_WAIT = 1, > - I915_SAMPLE_SEMA = 2 > + I915_SAMPLE_SEMA = 2, > + I915_SAMPLE_BUSY_TICKS = 3, > }; > > #define I915_PMU_SAMPLE_BITS (4) > @@ -274,6 +275,9 @@ enum drm_i915_pmu_engine_sample { > #define I915_PMU_ENGINE_BUSY(class, instance) \ > __I915_PMU_ENGINE(class, instance, I915_SAMPLE_BUSY) > > +#define I915_PMU_ENGINE_BUSY_TICKS(class, instance) \ > + __I915_PMU_ENGINE(class, instance, I915_SAMPLE_BUSY_TICKS) > + > #define I915_PMU_ENGINE_WAIT(class, instance) \ > __I915_PMU_ENGINE(class, instance, I915_SAMPLE_WAIT) > > @@ -651,7 +655,14 @@ typedef struct drm_i915_irq_wait { > #define I915_SCHEDULER_CAP_PRIORITY (1ul << 1) > #define I915_SCHEDULER_CAP_PREEMPTION (1ul << 2) > #define I915_SCHEDULER_CAP_SEMAPHORES (1ul << 3) > +/* > + * BUSY_STATS is deprecated on platforms with GuC based submission and will nt > + * be available at all on newer platforms. 
It has accuracy issues due to the > + * conversions from tick counts to wall time. > + * BUSY_TICKS_STATS should be used instead. > + */ > #define I915_SCHEDULER_CAP_ENGINE_BUSY_STATS (1ul << 4) > +#define I915_SCHEDULER_CAP_ENGINE_BUSY_TICKS_STATS (1ul << 5) > /* > * Indicates the 2k user priority levels are statically mapped into 3 buckets as > * follows:
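
For readers not familiar with the i915 PMU interface, here is a minimal userspace sketch (not part of the patch) of how the proposed counter would be consumed: it opens the new I915_PMU_ENGINE_BUSY_TICKS config for rcs0 through perf_event_open() and reads back the raw 64-bit tick count. The macro is the one added by the i915_drm.h hunk above and is not in released headers yet; the sysfs type lookup and the pid = -1 / cpu = 0 convention follow existing i915 PMU usage, and error handling is kept minimal.

/* Hypothetical usage sketch for the proposed busy-ticks event (rcs0). */
#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <drm/i915_drm.h>	/* with this patch applied: I915_PMU_ENGINE_BUSY_TICKS */

static int i915_pmu_type(void)
{
	int type = -1;
	FILE *f = fopen("/sys/bus/event_source/devices/i915/type", "r");

	if (f) {
		if (fscanf(f, "%d", &type) != 1)
			type = -1;
		fclose(f);
	}
	return type;
}

int main(void)
{
	struct perf_event_attr attr = {
		.type = i915_pmu_type(),
		.size = sizeof(attr),
		/* class 0 (render), instance 0 -> rcs0 */
		.config = I915_PMU_ENGINE_BUSY_TICKS(I915_ENGINE_CLASS_RENDER, 0),
	};
	uint64_t ticks;
	/* i915 is an uncore PMU: open system-wide (pid = -1) on one CPU. */
	int fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);

	if (fd < 0 || read(fd, &ticks, sizeof(ticks)) != sizeof(ticks))
		return 1;

	printf("rcs0 busy ticks: %llu\n", (unsigned long long)ticks);
	close(fd);
	return 0;
}

With the event-attribute hunk above, the same counter should also be reachable from the perf tool as i915/rcs0-busy-ticks/, assuming it follows the existing rcs0-busy naming.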
See comments inline below. Zhanjun On 2023-09-22 6:25 p.m., John.C.Harrison@Intel.com wrote: > From: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com> > > In new version of GuC engine busyness, GuC provides engine busyness > ticks as a 64 bit counter. Add a new counter to relay this value to the > user as is. > > Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com> > Signed-off-by: John Harrison <John.C.Harrison@Intel.com> > --- > drivers/gpu/drm/i915/gt/intel_engine.h | 1 + > drivers/gpu/drm/i915/gt/intel_engine_cs.c | 16 +++++ > drivers/gpu/drm/i915/gt/intel_engine_types.h | 12 ++++ > drivers/gpu/drm/i915/gt/intel_engine_user.c | 1 + > .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 67 ++++++++++++++----- > drivers/gpu/drm/i915/i915_pmu.c | 25 ++++++- > drivers/gpu/drm/i915/i915_pmu.h | 2 +- > include/uapi/drm/i915_drm.h | 13 +++- > 8 files changed, 116 insertions(+), 21 deletions(-) > > diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h > index b58c30ac8ef02..57af7ec8ecd82 100644 > --- a/drivers/gpu/drm/i915/gt/intel_engine.h > +++ b/drivers/gpu/drm/i915/gt/intel_engine.h > @@ -249,6 +249,7 @@ void intel_engine_dump_active_requests(struct list_head *requests, > > ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, > ktime_t *now); > +u64 intel_engine_get_busy_ticks(struct intel_engine_cs *engine); > > void intel_engine_get_hung_entity(struct intel_engine_cs *engine, > struct intel_context **ce, struct i915_request **rq); > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c > index 84a75c95f3f7d..1c9ffb1ae9889 100644 > --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c > +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c > @@ -2426,6 +2426,22 @@ ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now) > return engine->busyness(engine, now); > } > > +/** > + * intel_engine_get_busy_ticks() - Return current accumulated engine busyness > + * ticks > + * @engine: engine to report on > + * > + * Returns accumulated ticks @engine was busy since engine stats were enabled. 
> + */ > +u64 intel_engine_get_busy_ticks(struct intel_engine_cs *engine) > +{ > + if (!engine->busyness_ticks || > + !(engine->flags & I915_ENGINE_SUPPORTS_TICKS_STATS)) > + return 0; > + > + return engine->busyness_ticks(engine); > +} > + > struct intel_context * > intel_engine_create_virtual(struct intel_engine_cs **siblings, > unsigned int count, unsigned long flags) > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h > index 40fd8f984d64b..a88d40c74d604 100644 > --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h > +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h > @@ -548,6 +548,11 @@ struct intel_engine_cs { > ktime_t (*busyness)(struct intel_engine_cs *engine, > ktime_t *now); > > + /* > + * Get engine busyness ticks > + */ > + u64 (*busyness_ticks)(struct intel_engine_cs *engine); > + > struct intel_engine_execlists execlists; > > /* > @@ -574,6 +579,7 @@ struct intel_engine_cs { > #define I915_ENGINE_HAS_EU_PRIORITY BIT(10) > #define I915_ENGINE_FIRST_RENDER_COMPUTE BIT(11) > #define I915_ENGINE_USES_WA_HOLD_CCS_SWITCHOUT BIT(12) > +#define I915_ENGINE_SUPPORTS_TICKS_STATS BIT(13) > unsigned int flags; > > /* > @@ -649,6 +655,12 @@ intel_engine_supports_stats(const struct intel_engine_cs *engine) > return engine->flags & I915_ENGINE_SUPPORTS_STATS; > } > > +static inline bool > +intel_engine_supports_tick_stats(const struct intel_engine_cs *engine) > +{ > + return engine->flags & I915_ENGINE_SUPPORTS_TICKS_STATS; > +} > + > static inline bool > intel_engine_has_preemption(const struct intel_engine_cs *engine) > { > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c b/drivers/gpu/drm/i915/gt/intel_engine_user.c > index dcedff41a825f..69eb610b5ab0a 100644 > --- a/drivers/gpu/drm/i915/gt/intel_engine_user.c > +++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c > @@ -100,6 +100,7 @@ static void set_scheduler_caps(struct drm_i915_private *i915) > MAP(HAS_PREEMPTION, PREEMPTION), > MAP(HAS_SEMAPHORES, SEMAPHORES), > MAP(SUPPORTS_STATS, ENGINE_BUSY_STATS), > + MAP(SUPPORTS_TICKS_STATS, ENGINE_BUSY_TICKS_STATS), > #undef MAP > }; > struct intel_engine_cs *engine; > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c > index 0c1fee5360777..71749fb9ad35b 100644 > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c > @@ -1289,12 +1289,7 @@ static void busy_v1_guc_update_pm_timestamp(struct intel_guc *guc, ktime_t *now) > guc->busy.v1.gt_stamp = ((u64)gt_stamp_hi << 32) | gt_stamp_lo; > } > > -/* > - * Unlike the execlist mode of submission total and active times are in terms of > - * gt clocks. The *now parameter is retained to return the cpu time at which the > - * busyness was sampled. 
> - */ > -static ktime_t busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now) > +static u64 __busy_v1_guc_engine_busyness_ticks(struct intel_engine_cs *engine, ktime_t *now_out) > { > struct intel_engine_guc_stats_v1 stats_saved, *stats = &engine->stats.guc_v1; > struct i915_gpu_error *gpu_error = &engine->i915->gpu_error; > @@ -1304,6 +1299,7 @@ static ktime_t busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime > unsigned long flags; > u32 reset_count; > bool in_reset; > + ktime_t now; > > spin_lock_irqsave(&guc->busy.v1.lock, flags); > > @@ -1318,7 +1314,7 @@ static ktime_t busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime > reset_count = i915_reset_count(gpu_error); > in_reset = test_bit(I915_RESET_BACKOFF, >->reset.flags); > > - *now = ktime_get(); > + now = ktime_get(); > > /* > * The active busyness depends on start_gt_clk and gt_stamp. > @@ -1334,7 +1330,7 @@ static ktime_t busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime > * start_gt_clk' calculation below for active engines. > */ > busy_v1_guc_update_engine_gt_clks(engine); > - busy_v1_guc_update_pm_timestamp(guc, now); > + busy_v1_guc_update_pm_timestamp(guc, &now); > intel_gt_pm_put_async(gt); > if (i915_reset_count(gpu_error) != reset_count) { > *stats = stats_saved; > @@ -1342,16 +1338,37 @@ static ktime_t busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime > } > } > > - total = intel_gt_clock_interval_to_ns(gt, stats->total_gt_clks); > + total = stats->total_gt_clks; > if (stats->running) { > u64 clk = guc->busy.v1.gt_stamp - stats->start_gt_clk; > > - total += intel_gt_clock_interval_to_ns(gt, clk); > + total += clk; > } > > spin_unlock_irqrestore(&guc->busy.v1.lock, flags); > > - return ns_to_ktime(total); > + if (now_out) > + *now_out = now; > + > + return total; > +} > + > +/* > + * Unlike the execlist mode of submission total and active times are in terms of > + * gt clocks. The *now parameter is retained to return the cpu time at which the > + * busyness was sampled. 
> + */ > +static ktime_t busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now) > +{ > + u64 ticks = __busy_v1_guc_engine_busyness_ticks(engine, now); > + u64 ns = intel_gt_clock_interval_to_ns(engine->gt, ticks); > + > + return ns_to_ktime(ns); > +} > + > +static u64 busy_v1_guc_engine_busyness_ticks(struct intel_engine_cs *engine) > +{ > + return __busy_v1_guc_engine_busyness_ticks(engine, NULL); > } > > static void busy_v1_guc_enable_worker(struct intel_guc *guc) > @@ -1607,6 +1624,16 @@ static ktime_t busy_v2_guc_engine_busyness(struct intel_engine_cs *engine, ktime > return ns_to_ktime(total); > } > > +static u64 busy_v2_guc_engine_busyness_ticks(struct intel_engine_cs *engine) > +{ > + struct intel_guc *guc = &engine->gt->uc.guc; > + u64 ticks_engine; > + > + __busy_v2_get_engine_usage_record(guc, engine, &ticks_engine, NULL, NULL); > + > + return ticks_engine; > +} > + > static u64 busy_v1_intel_guc_total_active_ticks(struct intel_guc *guc) > { > return guc->busy.v1.gt_stamp; > @@ -4480,12 +4507,20 @@ static void guc_default_vfuncs(struct intel_engine_cs *engine) > engine->emit_flush = gen12_emit_flush_xcs; > } > engine->set_default_submission = guc_set_default_submission; > - if (GUC_SUBMIT_VER(&engine->gt->uc.guc) < MAKE_GUC_VER(1, 3, 1)) > - engine->busyness = busy_v1_guc_engine_busyness; > - else > - engine->busyness = busy_v2_guc_engine_busyness; > + if (GUC_SUBMIT_VER(&engine->gt->uc.guc) < MAKE_GUC_VER(1, 3, 1)) { > + if (GRAPHICS_VER_FULL(engine->i915) < IP_VER(12, 70)) > + engine->busyness = busy_v1_guc_engine_busyness; > + engine->busyness_ticks = busy_v1_guc_engine_busyness_ticks; > + } else { > + if (GRAPHICS_VER_FULL(engine->i915) < IP_VER(12, 70)) > + engine->busyness = busy_v2_guc_engine_busyness; Do we leave engine->busyness not initialized when >= IP_VER(12, 70)? > + engine->busyness_ticks = busy_v2_guc_engine_busyness_ticks; > + } > + > + if (engine->busyness) > + engine->flags |= I915_ENGINE_SUPPORTS_STATS; > > - engine->flags |= I915_ENGINE_SUPPORTS_STATS; > + engine->flags |= I915_ENGINE_SUPPORTS_TICKS_STATS; > engine->flags |= I915_ENGINE_HAS_PREEMPTION; > engine->flags |= I915_ENGINE_HAS_TIMESLICES; > > diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c > index 4f52636eb4a80..1b859556644f6 100644 > --- a/drivers/gpu/drm/i915/i915_pmu.c > +++ b/drivers/gpu/drm/i915/i915_pmu.c > @@ -26,7 +26,8 @@ > #define ENGINE_SAMPLE_MASK \ > (BIT(I915_SAMPLE_BUSY) | \ > BIT(I915_SAMPLE_WAIT) | \ > - BIT(I915_SAMPLE_SEMA)) > + BIT(I915_SAMPLE_SEMA) | \ > + BIT(I915_SAMPLE_BUSY_TICKS)) > > static cpumask_t i915_pmu_cpumask; > static unsigned int i915_pmu_target_cpu = -1; > @@ -161,8 +162,11 @@ static bool pmu_needs_timer(struct i915_pmu *pmu) > * Also there is software busyness tracking available we do not > * need the timer for I915_SAMPLE_BUSY counter. > */ > - if (i915->caps.scheduler & I915_SCHEDULER_CAP_ENGINE_BUSY_STATS) > + if ((i915->caps.scheduler & I915_SCHEDULER_CAP_ENGINE_BUSY_STATS) || > + (i915->caps.scheduler & I915_SCHEDULER_CAP_ENGINE_BUSY_TICKS_STATS)) { > enable &= ~BIT(I915_SAMPLE_BUSY); > + enable &= ~BIT(I915_SAMPLE_BUSY_TICKS); > + } > > /* > * If some bits remain it means we need the sampling timer running. > @@ -362,7 +366,8 @@ static void engine_sample(struct intel_engine_cs *engine, unsigned int period_ns > add_sample(&pmu->sample[I915_SAMPLE_SEMA], period_ns); > > /* No need to sample when busy stats are supported. 
*/ > - if (intel_engine_supports_stats(engine)) > + if (intel_engine_supports_stats(engine) || > + intel_engine_supports_tick_stats(engine)) > return; > > /* > @@ -519,6 +524,13 @@ engine_event_status(struct intel_engine_cs *engine, > { > switch (sample) { > case I915_SAMPLE_BUSY: > + if (!intel_engine_supports_stats(engine)) > + return -ENODEV; > + break; > + case I915_SAMPLE_BUSY_TICKS: > + if (!intel_engine_supports_tick_stats(engine)) > + return -ENODEV; > + break; > case I915_SAMPLE_WAIT: > break; > case I915_SAMPLE_SEMA: > @@ -651,6 +663,9 @@ static u64 __i915_pmu_event_read(struct perf_event *event) > > val = ktime_to_ns(intel_engine_get_busy_time(engine, > &unused)); > + } else if (sample == I915_SAMPLE_BUSY_TICKS && > + intel_engine_supports_tick_stats(engine)) { > + val = intel_engine_get_busy_ticks(engine); > } else { > val = engine->pmu.sample[sample].cur; > } > @@ -1000,6 +1015,7 @@ create_event_attributes(struct i915_pmu *pmu) > __engine_event(I915_SAMPLE_BUSY, "busy"), > __engine_event(I915_SAMPLE_SEMA, "sema"), > __engine_event(I915_SAMPLE_WAIT, "wait"), > + __engine_event(I915_SAMPLE_BUSY_TICKS, "busy-ticks"), > }; > unsigned int count = 0; > struct perf_pmu_events_attr *pmu_attr = NULL, *pmu_iter; > @@ -1103,6 +1119,9 @@ create_event_attributes(struct i915_pmu *pmu) > engine->uabi_instance, > engine_events[i].sample)); > > + if (engine_events[i].sample == I915_SAMPLE_BUSY_TICKS) > + continue; > + > str = kasprintf(GFP_KERNEL, "%s-%s.unit", > engine->name, engine_events[i].name); > if (!str) > diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h > index 41af038c37388..72a9c71944f8d 100644 > --- a/drivers/gpu/drm/i915/i915_pmu.h > +++ b/drivers/gpu/drm/i915/i915_pmu.h > @@ -49,7 +49,7 @@ enum { > (I915_ENGINE_SAMPLE_COUNT + \ > I915_PMU_MAX_GT * __I915_PMU_TRACKED_EVENT_COUNT) > > -#define I915_ENGINE_SAMPLE_COUNT (I915_SAMPLE_SEMA + 1) > +#define I915_ENGINE_SAMPLE_COUNT (I915_SAMPLE_BUSY_TICKS + 1) > > struct i915_pmu_sample { > u64 cur; > diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h > index e26dd27ff4a5f..8ae98c1bda0ea 100644 > --- a/include/uapi/drm/i915_drm.h > +++ b/include/uapi/drm/i915_drm.h > @@ -257,7 +257,8 @@ struct i915_engine_class_instance { > enum drm_i915_pmu_engine_sample { > I915_SAMPLE_BUSY = 0, > I915_SAMPLE_WAIT = 1, > - I915_SAMPLE_SEMA = 2 > + I915_SAMPLE_SEMA = 2, > + I915_SAMPLE_BUSY_TICKS = 3, > }; > > #define I915_PMU_SAMPLE_BITS (4) > @@ -274,6 +275,9 @@ enum drm_i915_pmu_engine_sample { > #define I915_PMU_ENGINE_BUSY(class, instance) \ > __I915_PMU_ENGINE(class, instance, I915_SAMPLE_BUSY) > > +#define I915_PMU_ENGINE_BUSY_TICKS(class, instance) \ > + __I915_PMU_ENGINE(class, instance, I915_SAMPLE_BUSY_TICKS) > + > #define I915_PMU_ENGINE_WAIT(class, instance) \ > __I915_PMU_ENGINE(class, instance, I915_SAMPLE_WAIT) > > @@ -651,7 +655,14 @@ typedef struct drm_i915_irq_wait { > #define I915_SCHEDULER_CAP_PRIORITY (1ul << 1) > #define I915_SCHEDULER_CAP_PREEMPTION (1ul << 2) > #define I915_SCHEDULER_CAP_SEMAPHORES (1ul << 3) > +/* > + * BUSY_STATS is deprecated on platforms with GuC based submission and will nt > + * be available at all on newer platforms. It has accuracy issues due to the > + * conversions from tick counts to wall time. > + * BUSY_TICKS_STATS should be used instead. 
> + */ > #define I915_SCHEDULER_CAP_ENGINE_BUSY_STATS (1ul << 4) > +#define I915_SCHEDULER_CAP_ENGINE_BUSY_TICKS_STATS (1ul << 5) > /* > * Indicates the 2k user priority levels are statically mapped into 3 buckets as > * follows:
On 10/19/2023 09:21, Dong, Zhanjun wrote: > See comments inline below. > > Zhanjun > > On 2023-09-22 6:25 p.m., John.C.Harrison@Intel.com wrote: >> From: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com> >> >> In new version of GuC engine busyness, GuC provides engine busyness >> ticks as a 64 bit counter. Add a new counter to relay this value to the >> user as is. >> >> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com> >> Signed-off-by: John Harrison <John.C.Harrison@Intel.com> >> --- >> drivers/gpu/drm/i915/gt/intel_engine.h | 1 + >> drivers/gpu/drm/i915/gt/intel_engine_cs.c | 16 +++++ >> drivers/gpu/drm/i915/gt/intel_engine_types.h | 12 ++++ >> drivers/gpu/drm/i915/gt/intel_engine_user.c | 1 + >> .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 67 ++++++++++++++----- >> drivers/gpu/drm/i915/i915_pmu.c | 25 ++++++- >> drivers/gpu/drm/i915/i915_pmu.h | 2 +- >> include/uapi/drm/i915_drm.h | 13 +++- >> 8 files changed, 116 insertions(+), 21 deletions(-) >> >> diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h >> b/drivers/gpu/drm/i915/gt/intel_engine.h >> index b58c30ac8ef02..57af7ec8ecd82 100644 >> --- a/drivers/gpu/drm/i915/gt/intel_engine.h >> +++ b/drivers/gpu/drm/i915/gt/intel_engine.h >> @@ -249,6 +249,7 @@ void intel_engine_dump_active_requests(struct >> list_head *requests, >> ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, >> ktime_t *now); >> +u64 intel_engine_get_busy_ticks(struct intel_engine_cs *engine); >> void intel_engine_get_hung_entity(struct intel_engine_cs *engine, >> struct intel_context **ce, struct i915_request >> **rq); >> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c >> b/drivers/gpu/drm/i915/gt/intel_engine_cs.c >> index 84a75c95f3f7d..1c9ffb1ae9889 100644 >> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c >> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c >> @@ -2426,6 +2426,22 @@ ktime_t intel_engine_get_busy_time(struct >> intel_engine_cs *engine, ktime_t *now) >> return engine->busyness(engine, now); >> } >> +/** >> + * intel_engine_get_busy_ticks() - Return current accumulated engine >> busyness >> + * ticks >> + * @engine: engine to report on >> + * >> + * Returns accumulated ticks @engine was busy since engine stats >> were enabled. 
>> + */ >> +u64 intel_engine_get_busy_ticks(struct intel_engine_cs *engine) >> +{ >> + if (!engine->busyness_ticks || >> + !(engine->flags & I915_ENGINE_SUPPORTS_TICKS_STATS)) >> + return 0; >> + >> + return engine->busyness_ticks(engine); >> +} >> + >> struct intel_context * >> intel_engine_create_virtual(struct intel_engine_cs **siblings, >> unsigned int count, unsigned long flags) >> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h >> b/drivers/gpu/drm/i915/gt/intel_engine_types.h >> index 40fd8f984d64b..a88d40c74d604 100644 >> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h >> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h >> @@ -548,6 +548,11 @@ struct intel_engine_cs { >> ktime_t (*busyness)(struct intel_engine_cs *engine, >> ktime_t *now); >> + /* >> + * Get engine busyness ticks >> + */ >> + u64 (*busyness_ticks)(struct intel_engine_cs *engine); >> + >> struct intel_engine_execlists execlists; >> /* >> @@ -574,6 +579,7 @@ struct intel_engine_cs { >> #define I915_ENGINE_HAS_EU_PRIORITY BIT(10) >> #define I915_ENGINE_FIRST_RENDER_COMPUTE BIT(11) >> #define I915_ENGINE_USES_WA_HOLD_CCS_SWITCHOUT BIT(12) >> +#define I915_ENGINE_SUPPORTS_TICKS_STATS BIT(13) >> unsigned int flags; >> /* >> @@ -649,6 +655,12 @@ intel_engine_supports_stats(const struct >> intel_engine_cs *engine) >> return engine->flags & I915_ENGINE_SUPPORTS_STATS; >> } >> +static inline bool >> +intel_engine_supports_tick_stats(const struct intel_engine_cs *engine) >> +{ >> + return engine->flags & I915_ENGINE_SUPPORTS_TICKS_STATS; >> +} >> + >> static inline bool >> intel_engine_has_preemption(const struct intel_engine_cs *engine) >> { >> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c >> b/drivers/gpu/drm/i915/gt/intel_engine_user.c >> index dcedff41a825f..69eb610b5ab0a 100644 >> --- a/drivers/gpu/drm/i915/gt/intel_engine_user.c >> +++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c >> @@ -100,6 +100,7 @@ static void set_scheduler_caps(struct >> drm_i915_private *i915) >> MAP(HAS_PREEMPTION, PREEMPTION), >> MAP(HAS_SEMAPHORES, SEMAPHORES), >> MAP(SUPPORTS_STATS, ENGINE_BUSY_STATS), >> + MAP(SUPPORTS_TICKS_STATS, ENGINE_BUSY_TICKS_STATS), >> #undef MAP >> }; >> struct intel_engine_cs *engine; >> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c >> b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c >> index 0c1fee5360777..71749fb9ad35b 100644 >> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c >> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c >> @@ -1289,12 +1289,7 @@ static void >> busy_v1_guc_update_pm_timestamp(struct intel_guc *guc, ktime_t *now) >> guc->busy.v1.gt_stamp = ((u64)gt_stamp_hi << 32) | gt_stamp_lo; >> } >> -/* >> - * Unlike the execlist mode of submission total and active times are >> in terms of >> - * gt clocks. The *now parameter is retained to return the cpu time >> at which the >> - * busyness was sampled. 
>> - */ >> -static ktime_t busy_v1_guc_engine_busyness(struct intel_engine_cs >> *engine, ktime_t *now) >> +static u64 __busy_v1_guc_engine_busyness_ticks(struct >> intel_engine_cs *engine, ktime_t *now_out) >> { >> struct intel_engine_guc_stats_v1 stats_saved, *stats = >> &engine->stats.guc_v1; >> struct i915_gpu_error *gpu_error = &engine->i915->gpu_error; >> @@ -1304,6 +1299,7 @@ static ktime_t >> busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime >> unsigned long flags; >> u32 reset_count; >> bool in_reset; >> + ktime_t now; >> spin_lock_irqsave(&guc->busy.v1.lock, flags); >> @@ -1318,7 +1314,7 @@ static ktime_t >> busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime >> reset_count = i915_reset_count(gpu_error); >> in_reset = test_bit(I915_RESET_BACKOFF, >->reset.flags); >> - *now = ktime_get(); >> + now = ktime_get(); >> /* >> * The active busyness depends on start_gt_clk and gt_stamp. >> @@ -1334,7 +1330,7 @@ static ktime_t >> busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime >> * start_gt_clk' calculation below for active engines. >> */ >> busy_v1_guc_update_engine_gt_clks(engine); >> - busy_v1_guc_update_pm_timestamp(guc, now); >> + busy_v1_guc_update_pm_timestamp(guc, &now); >> intel_gt_pm_put_async(gt); >> if (i915_reset_count(gpu_error) != reset_count) { >> *stats = stats_saved; >> @@ -1342,16 +1338,37 @@ static ktime_t >> busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime >> } >> } >> - total = intel_gt_clock_interval_to_ns(gt, stats->total_gt_clks); >> + total = stats->total_gt_clks; >> if (stats->running) { >> u64 clk = guc->busy.v1.gt_stamp - stats->start_gt_clk; >> - total += intel_gt_clock_interval_to_ns(gt, clk); >> + total += clk; >> } >> spin_unlock_irqrestore(&guc->busy.v1.lock, flags); >> - return ns_to_ktime(total); >> + if (now_out) >> + *now_out = now; >> + >> + return total; >> +} >> + >> +/* >> + * Unlike the execlist mode of submission total and active times are >> in terms of >> + * gt clocks. The *now parameter is retained to return the cpu time >> at which the >> + * busyness was sampled. 
>> + */ >> +static ktime_t busy_v1_guc_engine_busyness(struct intel_engine_cs >> *engine, ktime_t *now) >> +{ >> + u64 ticks = __busy_v1_guc_engine_busyness_ticks(engine, now); >> + u64 ns = intel_gt_clock_interval_to_ns(engine->gt, ticks); >> + >> + return ns_to_ktime(ns); >> +} >> + >> +static u64 busy_v1_guc_engine_busyness_ticks(struct intel_engine_cs >> *engine) >> +{ >> + return __busy_v1_guc_engine_busyness_ticks(engine, NULL); >> } >> static void busy_v1_guc_enable_worker(struct intel_guc *guc) >> @@ -1607,6 +1624,16 @@ static ktime_t >> busy_v2_guc_engine_busyness(struct intel_engine_cs *engine, ktime >> return ns_to_ktime(total); >> } >> +static u64 busy_v2_guc_engine_busyness_ticks(struct >> intel_engine_cs *engine) >> +{ >> + struct intel_guc *guc = &engine->gt->uc.guc; >> + u64 ticks_engine; >> + >> + __busy_v2_get_engine_usage_record(guc, engine, &ticks_engine, >> NULL, NULL); >> + >> + return ticks_engine; >> +} >> + >> static u64 busy_v1_intel_guc_total_active_ticks(struct intel_guc *guc) >> { >> return guc->busy.v1.gt_stamp; >> @@ -4480,12 +4507,20 @@ static void guc_default_vfuncs(struct >> intel_engine_cs *engine) >> engine->emit_flush = gen12_emit_flush_xcs; >> } >> engine->set_default_submission = guc_set_default_submission; >> - if (GUC_SUBMIT_VER(&engine->gt->uc.guc) < MAKE_GUC_VER(1, 3, 1)) >> - engine->busyness = busy_v1_guc_engine_busyness; >> - else >> - engine->busyness = busy_v2_guc_engine_busyness; >> + if (GUC_SUBMIT_VER(&engine->gt->uc.guc) < MAKE_GUC_VER(1, 3, 1)) { >> + if (GRAPHICS_VER_FULL(engine->i915) < IP_VER(12, 70)) >> + engine->busyness = busy_v1_guc_engine_busyness; >> + engine->busyness_ticks = busy_v1_guc_engine_busyness_ticks; >> + } else { >> + if (GRAPHICS_VER_FULL(engine->i915) < IP_VER(12, 70)) >> + engine->busyness = busy_v2_guc_engine_busyness; > > Do we leave engine->busyness not initialized when >= IP_VER(12, 70)? These vfuncs are all zero initialised and checked for null before being called. And the intention was that MTL would not support the old API as it not a good fit for current hardware. However, that plan was rejected and the whole v2 implementation is currently shelved for upstream due to power implications that need to be resolved. John. > >> + engine->busyness_ticks = busy_v2_guc_engine_busyness_ticks; >> + } >> + >> + if (engine->busyness) >> + engine->flags |= I915_ENGINE_SUPPORTS_STATS; >> - engine->flags |= I915_ENGINE_SUPPORTS_STATS; >> + engine->flags |= I915_ENGINE_SUPPORTS_TICKS_STATS; >> engine->flags |= I915_ENGINE_HAS_PREEMPTION; >> engine->flags |= I915_ENGINE_HAS_TIMESLICES; >> diff --git a/drivers/gpu/drm/i915/i915_pmu.c >> b/drivers/gpu/drm/i915/i915_pmu.c >> index 4f52636eb4a80..1b859556644f6 100644 >> --- a/drivers/gpu/drm/i915/i915_pmu.c >> +++ b/drivers/gpu/drm/i915/i915_pmu.c >> @@ -26,7 +26,8 @@ >> #define ENGINE_SAMPLE_MASK \ >> (BIT(I915_SAMPLE_BUSY) | \ >> BIT(I915_SAMPLE_WAIT) | \ >> - BIT(I915_SAMPLE_SEMA)) >> + BIT(I915_SAMPLE_SEMA) | \ >> + BIT(I915_SAMPLE_BUSY_TICKS)) >> static cpumask_t i915_pmu_cpumask; >> static unsigned int i915_pmu_target_cpu = -1; >> @@ -161,8 +162,11 @@ static bool pmu_needs_timer(struct i915_pmu *pmu) >> * Also there is software busyness tracking available we do not >> * need the timer for I915_SAMPLE_BUSY counter. 
>> */ >> - if (i915->caps.scheduler & I915_SCHEDULER_CAP_ENGINE_BUSY_STATS) >> + if ((i915->caps.scheduler & >> I915_SCHEDULER_CAP_ENGINE_BUSY_STATS) || >> + (i915->caps.scheduler & >> I915_SCHEDULER_CAP_ENGINE_BUSY_TICKS_STATS)) { >> enable &= ~BIT(I915_SAMPLE_BUSY); >> + enable &= ~BIT(I915_SAMPLE_BUSY_TICKS); >> + } >> /* >> * If some bits remain it means we need the sampling timer >> running. >> @@ -362,7 +366,8 @@ static void engine_sample(struct intel_engine_cs >> *engine, unsigned int period_ns >> add_sample(&pmu->sample[I915_SAMPLE_SEMA], period_ns); >> /* No need to sample when busy stats are supported. */ >> - if (intel_engine_supports_stats(engine)) >> + if (intel_engine_supports_stats(engine) || >> + intel_engine_supports_tick_stats(engine)) >> return; >> /* >> @@ -519,6 +524,13 @@ engine_event_status(struct intel_engine_cs *engine, >> { >> switch (sample) { >> case I915_SAMPLE_BUSY: >> + if (!intel_engine_supports_stats(engine)) >> + return -ENODEV; >> + break; >> + case I915_SAMPLE_BUSY_TICKS: >> + if (!intel_engine_supports_tick_stats(engine)) >> + return -ENODEV; >> + break; >> case I915_SAMPLE_WAIT: >> break; >> case I915_SAMPLE_SEMA: >> @@ -651,6 +663,9 @@ static u64 __i915_pmu_event_read(struct >> perf_event *event) >> val = ktime_to_ns(intel_engine_get_busy_time(engine, >> &unused)); >> + } else if (sample == I915_SAMPLE_BUSY_TICKS && >> + intel_engine_supports_tick_stats(engine)) { >> + val = intel_engine_get_busy_ticks(engine); >> } else { >> val = engine->pmu.sample[sample].cur; >> } >> @@ -1000,6 +1015,7 @@ create_event_attributes(struct i915_pmu *pmu) >> __engine_event(I915_SAMPLE_BUSY, "busy"), >> __engine_event(I915_SAMPLE_SEMA, "sema"), >> __engine_event(I915_SAMPLE_WAIT, "wait"), >> + __engine_event(I915_SAMPLE_BUSY_TICKS, "busy-ticks"), >> }; >> unsigned int count = 0; >> struct perf_pmu_events_attr *pmu_attr = NULL, *pmu_iter; >> @@ -1103,6 +1119,9 @@ create_event_attributes(struct i915_pmu *pmu) >> engine->uabi_instance, >> engine_events[i].sample)); >> + if (engine_events[i].sample == I915_SAMPLE_BUSY_TICKS) >> + continue; >> + >> str = kasprintf(GFP_KERNEL, "%s-%s.unit", >> engine->name, engine_events[i].name); >> if (!str) >> diff --git a/drivers/gpu/drm/i915/i915_pmu.h >> b/drivers/gpu/drm/i915/i915_pmu.h >> index 41af038c37388..72a9c71944f8d 100644 >> --- a/drivers/gpu/drm/i915/i915_pmu.h >> +++ b/drivers/gpu/drm/i915/i915_pmu.h >> @@ -49,7 +49,7 @@ enum { >> (I915_ENGINE_SAMPLE_COUNT + \ >> I915_PMU_MAX_GT * __I915_PMU_TRACKED_EVENT_COUNT) >> -#define I915_ENGINE_SAMPLE_COUNT (I915_SAMPLE_SEMA + 1) >> +#define I915_ENGINE_SAMPLE_COUNT (I915_SAMPLE_BUSY_TICKS + 1) >> struct i915_pmu_sample { >> u64 cur; >> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h >> index e26dd27ff4a5f..8ae98c1bda0ea 100644 >> --- a/include/uapi/drm/i915_drm.h >> +++ b/include/uapi/drm/i915_drm.h >> @@ -257,7 +257,8 @@ struct i915_engine_class_instance { >> enum drm_i915_pmu_engine_sample { >> I915_SAMPLE_BUSY = 0, >> I915_SAMPLE_WAIT = 1, >> - I915_SAMPLE_SEMA = 2 >> + I915_SAMPLE_SEMA = 2, >> + I915_SAMPLE_BUSY_TICKS = 3, >> }; >> #define I915_PMU_SAMPLE_BITS (4) >> @@ -274,6 +275,9 @@ enum drm_i915_pmu_engine_sample { >> #define I915_PMU_ENGINE_BUSY(class, instance) \ >> __I915_PMU_ENGINE(class, instance, I915_SAMPLE_BUSY) >> +#define I915_PMU_ENGINE_BUSY_TICKS(class, instance) \ >> + __I915_PMU_ENGINE(class, instance, I915_SAMPLE_BUSY_TICKS) >> + >> #define I915_PMU_ENGINE_WAIT(class, instance) \ >> __I915_PMU_ENGINE(class, instance, I915_SAMPLE_WAIT) 
>> @@ -651,7 +655,14 @@ typedef struct drm_i915_irq_wait { >> #define I915_SCHEDULER_CAP_PRIORITY (1ul << 1) >> #define I915_SCHEDULER_CAP_PREEMPTION (1ul << 2) >> #define I915_SCHEDULER_CAP_SEMAPHORES (1ul << 3) >> +/* >> + * BUSY_STATS is deprecated on platforms with GuC based submission >> and will nt >> + * be available at all on newer platforms. It has accuracy issues >> due to the >> + * conversions from tick counts to wall time. >> + * BUSY_TICKS_STATS should be used instead. >> + */ >> #define I915_SCHEDULER_CAP_ENGINE_BUSY_STATS (1ul << 4) >> +#define I915_SCHEDULER_CAP_ENGINE_BUSY_TICKS_STATS (1ul << 5) >> /* >> * Indicates the 2k user priority levels are statically mapped into >> 3 buckets as >> * follows:
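
To make the accuracy point above concrete: with tick-based counters a monitor can compute utilisation purely from deltas of two tick counters sampled over the same interval, with no tick-to-wall-time conversion at all. A hedged sketch follows; the "total" counter stands for whatever GT timestamp / total-active-ticks value the rest of this series exposes, and the helper name is made up for illustration.

#include <stdint.h>

/*
 * Illustrative only: utilisation over a sampling interval as a ratio of
 * two tick deltas. busy_* come from the busy-ticks event added by this
 * patch; total_* stand for a matching GT timestamp / total-active-ticks
 * counter provided elsewhere in the series.
 */
static double engine_utilisation(uint64_t busy_prev, uint64_t busy_now,
				 uint64_t total_prev, uint64_t total_now)
{
	uint64_t busy = busy_now - busy_prev;
	uint64_t total = total_now - total_prev;

	return total ? 100.0 * (double)busy / (double)total : 0.0;
}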
diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h index b58c30ac8ef02..57af7ec8ecd82 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine.h +++ b/drivers/gpu/drm/i915/gt/intel_engine.h @@ -249,6 +249,7 @@ void intel_engine_dump_active_requests(struct list_head *requests, ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now); +u64 intel_engine_get_busy_ticks(struct intel_engine_cs *engine); void intel_engine_get_hung_entity(struct intel_engine_cs *engine, struct intel_context **ce, struct i915_request **rq); diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index 84a75c95f3f7d..1c9ffb1ae9889 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -2426,6 +2426,22 @@ ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now) return engine->busyness(engine, now); } +/** + * intel_engine_get_busy_ticks() - Return current accumulated engine busyness + * ticks + * @engine: engine to report on + * + * Returns accumulated ticks @engine was busy since engine stats were enabled. + */ +u64 intel_engine_get_busy_ticks(struct intel_engine_cs *engine) +{ + if (!engine->busyness_ticks || + !(engine->flags & I915_ENGINE_SUPPORTS_TICKS_STATS)) + return 0; + + return engine->busyness_ticks(engine); +} + struct intel_context * intel_engine_create_virtual(struct intel_engine_cs **siblings, unsigned int count, unsigned long flags) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h index 40fd8f984d64b..a88d40c74d604 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h @@ -548,6 +548,11 @@ struct intel_engine_cs { ktime_t (*busyness)(struct intel_engine_cs *engine, ktime_t *now); + /* + * Get engine busyness ticks + */ + u64 (*busyness_ticks)(struct intel_engine_cs *engine); + struct intel_engine_execlists execlists; /* @@ -574,6 +579,7 @@ struct intel_engine_cs { #define I915_ENGINE_HAS_EU_PRIORITY BIT(10) #define I915_ENGINE_FIRST_RENDER_COMPUTE BIT(11) #define I915_ENGINE_USES_WA_HOLD_CCS_SWITCHOUT BIT(12) +#define I915_ENGINE_SUPPORTS_TICKS_STATS BIT(13) unsigned int flags; /* @@ -649,6 +655,12 @@ intel_engine_supports_stats(const struct intel_engine_cs *engine) return engine->flags & I915_ENGINE_SUPPORTS_STATS; } +static inline bool +intel_engine_supports_tick_stats(const struct intel_engine_cs *engine) +{ + return engine->flags & I915_ENGINE_SUPPORTS_TICKS_STATS; +} + static inline bool intel_engine_has_preemption(const struct intel_engine_cs *engine) { diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c b/drivers/gpu/drm/i915/gt/intel_engine_user.c index dcedff41a825f..69eb610b5ab0a 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_user.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c @@ -100,6 +100,7 @@ static void set_scheduler_caps(struct drm_i915_private *i915) MAP(HAS_PREEMPTION, PREEMPTION), MAP(HAS_SEMAPHORES, SEMAPHORES), MAP(SUPPORTS_STATS, ENGINE_BUSY_STATS), + MAP(SUPPORTS_TICKS_STATS, ENGINE_BUSY_TICKS_STATS), #undef MAP }; struct intel_engine_cs *engine; diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 0c1fee5360777..71749fb9ad35b 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1289,12 +1289,7 @@ static void busy_v1_guc_update_pm_timestamp(struct 
intel_guc *guc, ktime_t *now) guc->busy.v1.gt_stamp = ((u64)gt_stamp_hi << 32) | gt_stamp_lo; } -/* - * Unlike the execlist mode of submission total and active times are in terms of - * gt clocks. The *now parameter is retained to return the cpu time at which the - * busyness was sampled. - */ -static ktime_t busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now) +static u64 __busy_v1_guc_engine_busyness_ticks(struct intel_engine_cs *engine, ktime_t *now_out) { struct intel_engine_guc_stats_v1 stats_saved, *stats = &engine->stats.guc_v1; struct i915_gpu_error *gpu_error = &engine->i915->gpu_error; @@ -1304,6 +1299,7 @@ static ktime_t busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime unsigned long flags; u32 reset_count; bool in_reset; + ktime_t now; spin_lock_irqsave(&guc->busy.v1.lock, flags); @@ -1318,7 +1314,7 @@ static ktime_t busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime reset_count = i915_reset_count(gpu_error); in_reset = test_bit(I915_RESET_BACKOFF, >->reset.flags); - *now = ktime_get(); + now = ktime_get(); /* * The active busyness depends on start_gt_clk and gt_stamp. @@ -1334,7 +1330,7 @@ static ktime_t busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime * start_gt_clk' calculation below for active engines. */ busy_v1_guc_update_engine_gt_clks(engine); - busy_v1_guc_update_pm_timestamp(guc, now); + busy_v1_guc_update_pm_timestamp(guc, &now); intel_gt_pm_put_async(gt); if (i915_reset_count(gpu_error) != reset_count) { *stats = stats_saved; @@ -1342,16 +1338,37 @@ static ktime_t busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime } } - total = intel_gt_clock_interval_to_ns(gt, stats->total_gt_clks); + total = stats->total_gt_clks; if (stats->running) { u64 clk = guc->busy.v1.gt_stamp - stats->start_gt_clk; - total += intel_gt_clock_interval_to_ns(gt, clk); + total += clk; } spin_unlock_irqrestore(&guc->busy.v1.lock, flags); - return ns_to_ktime(total); + if (now_out) + *now_out = now; + + return total; +} + +/* + * Unlike the execlist mode of submission total and active times are in terms of + * gt clocks. The *now parameter is retained to return the cpu time at which the + * busyness was sampled. 
+ */ +static ktime_t busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now) +{ + u64 ticks = __busy_v1_guc_engine_busyness_ticks(engine, now); + u64 ns = intel_gt_clock_interval_to_ns(engine->gt, ticks); + + return ns_to_ktime(ns); +} + +static u64 busy_v1_guc_engine_busyness_ticks(struct intel_engine_cs *engine) +{ + return __busy_v1_guc_engine_busyness_ticks(engine, NULL); } static void busy_v1_guc_enable_worker(struct intel_guc *guc) @@ -1607,6 +1624,16 @@ static ktime_t busy_v2_guc_engine_busyness(struct intel_engine_cs *engine, ktime return ns_to_ktime(total); } +static u64 busy_v2_guc_engine_busyness_ticks(struct intel_engine_cs *engine) +{ + struct intel_guc *guc = &engine->gt->uc.guc; + u64 ticks_engine; + + __busy_v2_get_engine_usage_record(guc, engine, &ticks_engine, NULL, NULL); + + return ticks_engine; +} + static u64 busy_v1_intel_guc_total_active_ticks(struct intel_guc *guc) { return guc->busy.v1.gt_stamp; @@ -4480,12 +4507,20 @@ static void guc_default_vfuncs(struct intel_engine_cs *engine) engine->emit_flush = gen12_emit_flush_xcs; } engine->set_default_submission = guc_set_default_submission; - if (GUC_SUBMIT_VER(&engine->gt->uc.guc) < MAKE_GUC_VER(1, 3, 1)) - engine->busyness = busy_v1_guc_engine_busyness; - else - engine->busyness = busy_v2_guc_engine_busyness; + if (GUC_SUBMIT_VER(&engine->gt->uc.guc) < MAKE_GUC_VER(1, 3, 1)) { + if (GRAPHICS_VER_FULL(engine->i915) < IP_VER(12, 70)) + engine->busyness = busy_v1_guc_engine_busyness; + engine->busyness_ticks = busy_v1_guc_engine_busyness_ticks; + } else { + if (GRAPHICS_VER_FULL(engine->i915) < IP_VER(12, 70)) + engine->busyness = busy_v2_guc_engine_busyness; + engine->busyness_ticks = busy_v2_guc_engine_busyness_ticks; + } + + if (engine->busyness) + engine->flags |= I915_ENGINE_SUPPORTS_STATS; - engine->flags |= I915_ENGINE_SUPPORTS_STATS; + engine->flags |= I915_ENGINE_SUPPORTS_TICKS_STATS; engine->flags |= I915_ENGINE_HAS_PREEMPTION; engine->flags |= I915_ENGINE_HAS_TIMESLICES; diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index 4f52636eb4a80..1b859556644f6 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -26,7 +26,8 @@ #define ENGINE_SAMPLE_MASK \ (BIT(I915_SAMPLE_BUSY) | \ BIT(I915_SAMPLE_WAIT) | \ - BIT(I915_SAMPLE_SEMA)) + BIT(I915_SAMPLE_SEMA) | \ + BIT(I915_SAMPLE_BUSY_TICKS)) static cpumask_t i915_pmu_cpumask; static unsigned int i915_pmu_target_cpu = -1; @@ -161,8 +162,11 @@ static bool pmu_needs_timer(struct i915_pmu *pmu) * Also there is software busyness tracking available we do not * need the timer for I915_SAMPLE_BUSY counter. */ - if (i915->caps.scheduler & I915_SCHEDULER_CAP_ENGINE_BUSY_STATS) + if ((i915->caps.scheduler & I915_SCHEDULER_CAP_ENGINE_BUSY_STATS) || + (i915->caps.scheduler & I915_SCHEDULER_CAP_ENGINE_BUSY_TICKS_STATS)) { enable &= ~BIT(I915_SAMPLE_BUSY); + enable &= ~BIT(I915_SAMPLE_BUSY_TICKS); + } /* * If some bits remain it means we need the sampling timer running. @@ -362,7 +366,8 @@ static void engine_sample(struct intel_engine_cs *engine, unsigned int period_ns add_sample(&pmu->sample[I915_SAMPLE_SEMA], period_ns); /* No need to sample when busy stats are supported. 
*/ - if (intel_engine_supports_stats(engine)) + if (intel_engine_supports_stats(engine) || + intel_engine_supports_tick_stats(engine)) return; /* @@ -519,6 +524,13 @@ engine_event_status(struct intel_engine_cs *engine, { switch (sample) { case I915_SAMPLE_BUSY: + if (!intel_engine_supports_stats(engine)) + return -ENODEV; + break; + case I915_SAMPLE_BUSY_TICKS: + if (!intel_engine_supports_tick_stats(engine)) + return -ENODEV; + break; case I915_SAMPLE_WAIT: break; case I915_SAMPLE_SEMA: @@ -651,6 +663,9 @@ static u64 __i915_pmu_event_read(struct perf_event *event) val = ktime_to_ns(intel_engine_get_busy_time(engine, &unused)); + } else if (sample == I915_SAMPLE_BUSY_TICKS && + intel_engine_supports_tick_stats(engine)) { + val = intel_engine_get_busy_ticks(engine); } else { val = engine->pmu.sample[sample].cur; } @@ -1000,6 +1015,7 @@ create_event_attributes(struct i915_pmu *pmu) __engine_event(I915_SAMPLE_BUSY, "busy"), __engine_event(I915_SAMPLE_SEMA, "sema"), __engine_event(I915_SAMPLE_WAIT, "wait"), + __engine_event(I915_SAMPLE_BUSY_TICKS, "busy-ticks"), }; unsigned int count = 0; struct perf_pmu_events_attr *pmu_attr = NULL, *pmu_iter; @@ -1103,6 +1119,9 @@ create_event_attributes(struct i915_pmu *pmu) engine->uabi_instance, engine_events[i].sample)); + if (engine_events[i].sample == I915_SAMPLE_BUSY_TICKS) + continue; + str = kasprintf(GFP_KERNEL, "%s-%s.unit", engine->name, engine_events[i].name); if (!str) diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h index 41af038c37388..72a9c71944f8d 100644 --- a/drivers/gpu/drm/i915/i915_pmu.h +++ b/drivers/gpu/drm/i915/i915_pmu.h @@ -49,7 +49,7 @@ enum { (I915_ENGINE_SAMPLE_COUNT + \ I915_PMU_MAX_GT * __I915_PMU_TRACKED_EVENT_COUNT) -#define I915_ENGINE_SAMPLE_COUNT (I915_SAMPLE_SEMA + 1) +#define I915_ENGINE_SAMPLE_COUNT (I915_SAMPLE_BUSY_TICKS + 1) struct i915_pmu_sample { u64 cur; diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index e26dd27ff4a5f..8ae98c1bda0ea 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -257,7 +257,8 @@ struct i915_engine_class_instance { enum drm_i915_pmu_engine_sample { I915_SAMPLE_BUSY = 0, I915_SAMPLE_WAIT = 1, - I915_SAMPLE_SEMA = 2 + I915_SAMPLE_SEMA = 2, + I915_SAMPLE_BUSY_TICKS = 3, }; #define I915_PMU_SAMPLE_BITS (4) @@ -274,6 +275,9 @@ enum drm_i915_pmu_engine_sample { #define I915_PMU_ENGINE_BUSY(class, instance) \ __I915_PMU_ENGINE(class, instance, I915_SAMPLE_BUSY) +#define I915_PMU_ENGINE_BUSY_TICKS(class, instance) \ + __I915_PMU_ENGINE(class, instance, I915_SAMPLE_BUSY_TICKS) + #define I915_PMU_ENGINE_WAIT(class, instance) \ __I915_PMU_ENGINE(class, instance, I915_SAMPLE_WAIT) @@ -651,7 +655,14 @@ typedef struct drm_i915_irq_wait { #define I915_SCHEDULER_CAP_PRIORITY (1ul << 1) #define I915_SCHEDULER_CAP_PREEMPTION (1ul << 2) #define I915_SCHEDULER_CAP_SEMAPHORES (1ul << 3) +/* + * BUSY_STATS is deprecated on platforms with GuC based submission and will nt + * be available at all on newer platforms. It has accuracy issues due to the + * conversions from tick counts to wall time. + * BUSY_TICKS_STATS should be used instead. + */ #define I915_SCHEDULER_CAP_ENGINE_BUSY_STATS (1ul << 4) +#define I915_SCHEDULER_CAP_ENGINE_BUSY_TICKS_STATS (1ul << 5) /* * Indicates the 2k user priority levels are statically mapped into 3 buckets as * follows:
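
As a complement to the create_event_attributes() hunk, a userspace tool that does not want to hard-code the uapi macro could resolve the event config from the sysfs attribute the PMU already exports. The sketch below assumes the new event ends up named rcs0-busy-ticks and that the attribute keeps the existing "config=0x..." format printed by the i915 PMU; both are assumptions about this series rather than guaranteed uapi.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Sketch: look up the perf config value for an i915 event by name from
 * the sysfs attributes that create_event_attributes() exports, e.g.
 * "rcs0-busy-ticks". Returns 0 on success, -1 otherwise.
 */
static int i915_event_config(const char *event, uint64_t *config)
{
	char path[256];
	FILE *f;
	int ret = -1;

	snprintf(path, sizeof(path),
		 "/sys/bus/event_source/devices/i915/events/%s", event);
	f = fopen(path, "r");
	if (!f)
		return -1;
	/* Attribute content is expected to look like "config=0x...". */
	if (fscanf(f, "config=%" SCNx64, config) == 1)
		ret = 0;
	fclose(f);
	return ret;
}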