@@ -386,7 +386,7 @@ intel_context_init(struct intel_context *ce, struct intel_engine_cs *engine)
ce->ring = NULL;
ce->ring_size = SZ_4K;
- ewma_runtime_init(&ce->runtime.avg);
+ ewma_runtime_init(&ce->stats.runtime.avg);
ce->vm = i915_vm_get(engine->gt->vm);
@@ -576,6 +576,31 @@ void intel_context_bind_parent_child(struct intel_context *parent,
child->parallel.parent = parent;
}
+u64 intel_context_get_total_runtime_ns(const struct intel_context *ce)
+{
+ u64 total, active;
+
+ total = ce->stats.runtime.total;
+ if (ce->ops->flags & COPS_RUNTIME_CYCLES)
+ total *= ce->engine->gt->clock_period_ns;
+
+ active = READ_ONCE(ce->stats.active);
+ if (active)
+ active = intel_context_clock() - active;
+
+ return total + active;
+}
+
+u64 intel_context_get_avg_runtime_ns(struct intel_context *ce)
+{
+ u64 avg = ewma_runtime_read(&ce->stats.runtime.avg);
+
+ if (ce->ops->flags & COPS_RUNTIME_CYCLES)
+ avg *= ce->engine->gt->clock_period_ns;
+
+ return avg;
+}
+
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_context.c"
#endif
@@ -351,18 +351,13 @@ intel_context_clear_nopreempt(struct intel_context *ce)
clear_bit(CONTEXT_NOPREEMPT, &ce->flags);
}
-static inline u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
-{
- const u32 period = ce->engine->gt->clock_period_ns;
-
- return READ_ONCE(ce->runtime.total) * period;
-}
+u64 intel_context_get_total_runtime_ns(const struct intel_context *ce);
+u64 intel_context_get_avg_runtime_ns(struct intel_context *ce);
-static inline u64 intel_context_get_avg_runtime_ns(struct intel_context *ce)
+static inline u64 intel_context_clock(void)
{
- const u32 period = ce->engine->gt->clock_period_ns;
-
- return mul_u32_u32(ewma_runtime_read(&ce->runtime.avg), period);
+ /* As we mix CS cycles with CPU clocks, use the raw monotonic clock. */
+ return ktime_get_raw_fast_ns();
}
#endif /* __INTEL_CONTEXT_H__ */
@@ -35,6 +35,9 @@ struct intel_context_ops {
#define COPS_HAS_INFLIGHT_BIT 0
#define COPS_HAS_INFLIGHT BIT(COPS_HAS_INFLIGHT_BIT)
+#define COPS_RUNTIME_CYCLES_BIT 1
+#define COPS_RUNTIME_CYCLES BIT(COPS_RUNTIME_CYCLES_BIT)
+
int (*alloc)(struct intel_context *ce);
void (*ban)(struct intel_context *ce, struct i915_request *rq);
@@ -134,14 +137,19 @@ struct intel_context {
} lrc;
u32 tag; /* cookie passed to HW to track this context on submission */
- /* Time on GPU as tracked by the hw. */
- struct {
- struct ewma_runtime avg;
- u64 total;
- u32 last;
- I915_SELFTEST_DECLARE(u32 num_underflow);
- I915_SELFTEST_DECLARE(u32 max_underflow);
- } runtime;
+ /** @stats: Context GPU engine busyness tracking. */
+ struct intel_context_stats {
+ u64 active;
+
+ /* Time on GPU as tracked by the hw. */
+ struct {
+ struct ewma_runtime avg;
+ u64 total;
+ u32 last;
+ I915_SELFTEST_DECLARE(u32 num_underflow);
+ I915_SELFTEST_DECLARE(u32 max_underflow);
+ } runtime;
+ } stats;
unsigned int active_count; /* protected by timeline->mutex */
@@ -625,8 +625,6 @@ static void __execlists_schedule_out(struct i915_request * const rq,
GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag));
__set_bit(ccid - 1, &engine->context_tag);
}
-
- lrc_update_runtime(ce);
intel_engine_context_out(engine);
execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
if (engine->fw_domain && !--engine->fw_active)
@@ -2005,8 +2003,23 @@ process_csb(struct intel_engine_cs *engine, struct i915_request **inactive)
* and merits a fresh timeslice. We reinstall the timer after
* inspecting the queue to see if we need to resumbit.
*/
- if (*prev != *execlists->active) /* elide lite-restores */
+ if (*prev != *execlists->active) { /* elide lite-restores */
+ /*
+ * Note the inherent discrepancy between the HW runtime,
+ * recorded as part of the context switch, and the CPU
+ * adjustment for active contexts. We have to hope that
+ * the delay in processing the CS event is very small
+ * and consistent. It works to our advantage to have
+ * the CPU adjustment _undershoot_ (i.e. start later than)
+ * the CS timestamp so we never overreport the runtime
+ * and correct ourselves later when updating from HW.
+ */
+ if (*prev)
+ lrc_runtime_stop((*prev)->context);
+ if (*execlists->active)
+ lrc_runtime_start((*execlists->active)->context);
new_timeslice(execlists);
+ }
return inactive;
}
@@ -2638,7 +2651,7 @@ execlists_create_parallel(struct intel_engine_cs **engines,
}
static const struct intel_context_ops execlists_context_ops = {
- .flags = COPS_HAS_INFLIGHT,
+ .flags = COPS_HAS_INFLIGHT | COPS_RUNTIME_CYCLES,
.alloc = execlists_context_alloc,
@@ -3696,7 +3709,7 @@ virtual_get_sibling(struct intel_engine_cs *engine, unsigned int sibling)
}
static const struct intel_context_ops virtual_context_ops = {
- .flags = COPS_HAS_INFLIGHT,
+ .flags = COPS_HAS_INFLIGHT | COPS_RUNTIME_CYCLES,
.alloc = virtual_context_alloc,
@@ -161,6 +161,10 @@ void intel_gt_init_clock_frequency(struct intel_gt *gt)
if (gt->clock_frequency)
gt->clock_period_ns = intel_gt_clock_interval_to_ns(gt, 1);
+ /* Icelake appears to use another fixed frequency for CTX_TIMESTAMP */
+ if (GRAPHICS_VER(gt->i915) == 11)
+ gt->clock_period_ns = NSEC_PER_SEC / 13750000;
+
GT_TRACE(gt,
"Using clock frequency: %dkHz, period: %dns, wrap: %lldms\n",
gt->clock_frequency / 1000,
@@ -778,7 +778,7 @@ static void init_common_regs(u32 * const regs,
CTX_CTRL_RS_CTX_ENABLE);
regs[CTX_CONTEXT_CONTROL] = ctl;
- regs[CTX_TIMESTAMP] = ce->runtime.last;
+ regs[CTX_TIMESTAMP] = ce->stats.runtime.last;
}
static void init_wa_bb_regs(u32 * const regs,
@@ -1734,11 +1734,12 @@ void lrc_init_wa_ctx(struct intel_engine_cs *engine)
}
}
-static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
+static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt)
{
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
- ce->runtime.num_underflow++;
- ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
+ stats->runtime.num_underflow++;
+ stats->runtime.max_underflow =
+ max_t(u32, stats->runtime.max_underflow, -dt);
#endif
}
@@ -1755,25 +1756,25 @@ static u32 lrc_get_runtime(const struct intel_context *ce)
void lrc_update_runtime(struct intel_context *ce)
{
+ struct intel_context_stats *stats = &ce->stats;
u32 old;
s32 dt;
- if (intel_context_is_barrier(ce))
+ old = stats->runtime.last;
+ stats->runtime.last = lrc_get_runtime(ce);
+ dt = stats->runtime.last - old;
+ if (!dt)
return;
- old = ce->runtime.last;
- ce->runtime.last = lrc_get_runtime(ce);
- dt = ce->runtime.last - old;
-
if (unlikely(dt < 0)) {
CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
- old, ce->runtime.last, dt);
- st_update_runtime_underflow(ce, dt);
+ old, stats->runtime.last, dt);
+ st_runtime_underflow(stats, dt);
return;
}
- ewma_runtime_add(&ce->runtime.avg, dt);
- ce->runtime.total += dt;
+ ewma_runtime_add(&stats->runtime.avg, dt);
+ stats->runtime.total += dt;
}
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
@@ -11,9 +11,10 @@
#include <linux/bitfield.h>
#include <linux/types.h>
+#include "intel_context.h"
+
struct drm_i915_gem_object;
struct i915_gem_ww_ctx;
-struct intel_context;
struct intel_engine_cs;
struct intel_ring;
struct kref;
@@ -120,4 +121,28 @@ static inline u32 lrc_desc_priority(int prio)
return GEN12_CTX_PRIORITY_NORMAL;
}
+static inline void lrc_runtime_start(struct intel_context *ce)
+{
+ struct intel_context_stats *stats = &ce->stats;
+
+ if (intel_context_is_barrier(ce))
+ return;
+
+ if (stats->active)
+ return;
+
+ WRITE_ONCE(stats->active, intel_context_clock());
+}
+
+static inline void lrc_runtime_stop(struct intel_context *ce)
+{
+ struct intel_context_stats *stats = &ce->stats;
+
+ if (!stats->active)
+ return;
+
+ lrc_update_runtime(ce);
+ WRITE_ONCE(stats->active, 0);
+}
+
#endif /* __INTEL_LRC_H__ */
@@ -1753,8 +1753,8 @@ static int __live_pphwsp_runtime(struct intel_engine_cs *engine)
if (IS_ERR(ce))
return PTR_ERR(ce);
- ce->runtime.num_underflow = 0;
- ce->runtime.max_underflow = 0;
+ ce->stats.runtime.num_underflow = 0;
+ ce->stats.runtime.max_underflow = 0;
do {
unsigned int loop = 1024;
@@ -1792,11 +1792,11 @@ static int __live_pphwsp_runtime(struct intel_engine_cs *engine)
intel_context_get_avg_runtime_ns(ce));
err = 0;
- if (ce->runtime.num_underflow) {
+ if (ce->stats.runtime.num_underflow) {
pr_err("%s: pphwsp underflow %u time(s), max %u cycles!\n",
engine->name,
- ce->runtime.num_underflow,
- ce->runtime.max_underflow);
+ ce->stats.runtime.num_underflow,
+ ce->stats.runtime.max_underflow);
GEM_TRACE_DUMP();
err = -EOVERFLOW;
}
@@ -512,13 +512,10 @@ static void error_print_context(struct drm_i915_error_state_buf *m,
const char *header,
const struct i915_gem_context_coredump *ctx)
{
- const u32 period = to_gt(m->i915)->clock_period_ns;
-
err_printf(m, "%s%s[%d] prio %d, guilty %d active %d, runtime total %lluns, avg %lluns\n",
header, ctx->comm, ctx->pid, ctx->sched_attr.priority,
ctx->guilty, ctx->active,
- ctx->total_runtime * period,
- mul_u32_u32(ctx->avg_runtime, period));
+ ctx->total_runtime, ctx->avg_runtime);
}
static struct i915_vma_coredump *
@@ -1367,8 +1364,8 @@ static bool record_context(struct i915_gem_context_coredump *e,
e->guilty = atomic_read(&ctx->guilty_count);
e->active = atomic_read(&ctx->active_count);
- e->total_runtime = rq->context->runtime.total;
- e->avg_runtime = ewma_runtime_read(&rq->context->runtime.avg);
+ e->total_runtime = intel_context_get_total_runtime_ns(rq->context);
+ e->avg_runtime = intel_context_get_avg_runtime_ns(rq->context);
simulated = i915_gem_context_no_error_capture(ctx);
@@ -94,7 +94,7 @@ struct intel_engine_coredump {
char comm[TASK_COMM_LEN];
u64 total_runtime;
- u32 avg_runtime;
+ u64 avg_runtime;
pid_t pid;
int active;