From patchwork Fri Sep 22 22:25:08 2023
X-Patchwork-Submitter: John Harrison
X-Patchwork-Id: 13396419
From: John.C.Harrison@Intel.com
To: Intel-GFX@Lists.FreeDesktop.Org
Cc: DRI-Devel@Lists.FreeDesktop.Org
Date: Fri, 22 Sep 2023 15:25:08 -0700
Message-ID: <20230922222510.2235213-2-John.C.Harrison@Intel.com>
In-Reply-To: <20230922222510.2235213-1-John.C.Harrison@Intel.com>
References: <20230922222510.2235213-1-John.C.Harrison@Intel.com>
Organization: Intel Corporation (UK) Ltd. - Co. Reg. #1134945 - Pipers Way, Swindon SN3 1RJ
Subject: [Intel-gfx] [PATCH 1/3] drm/i915/guc: Support new and improved engine busyness

From: John Harrison

The GuC has been extended to support a much more friendly engine busyness
interface. So partition the old interface into a 'busy_v1' space and add
'busy_v2' support alongside. And if v2 is available, use that in preference
to v1.

Note that v2 provides extra features over and above v1 which will be
exposed via PMU in subsequent patches.
Signed-off-by: John Harrison --- drivers/gpu/drm/i915/gt/intel_engine_types.h | 4 +- .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h | 4 +- drivers/gpu/drm/i915/gt/uc/intel_guc.h | 82 ++-- drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c | 55 ++- drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h | 9 +- drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h | 23 +- .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 381 ++++++++++++++---- 7 files changed, 427 insertions(+), 131 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h index a7e6775980043..40fd8f984d64b 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h @@ -323,7 +323,7 @@ struct intel_engine_execlists_stats { ktime_t start; }; -struct intel_engine_guc_stats { +struct intel_engine_guc_stats_v1 { /** * @running: Active state of the engine when busyness was last sampled. */ @@ -603,7 +603,7 @@ struct intel_engine_cs { struct { union { struct intel_engine_execlists_stats execlists; - struct intel_engine_guc_stats guc; + struct intel_engine_guc_stats_v1 guc_v1; }; /** diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h index f359bef046e0b..c190a99a36c38 100644 --- a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h +++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h @@ -137,7 +137,9 @@ enum intel_guc_action { INTEL_GUC_ACTION_DEREGISTER_CONTEXT_DONE = 0x4600, INTEL_GUC_ACTION_REGISTER_CONTEXT_MULTI_LRC = 0x4601, INTEL_GUC_ACTION_CLIENT_SOFT_RESET = 0x5507, - INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF = 0x550A, + INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF_V1 = 0x550A, + INTEL_GUC_ACTION_SET_DEVICE_ENGINE_UTILIZATION_V2 = 0x550C, + INTEL_GUC_ACTION_SET_FUNCTION_ENGINE_UTILIZATION_V2 = 0x550D, INTEL_GUC_ACTION_STATE_CAPTURE_NOTIFICATION = 0x8002, INTEL_GUC_ACTION_NOTIFY_FLUSH_LOG_BUFFER_TO_FILE = 0x8003, INTEL_GUC_ACTION_NOTIFY_CRASH_DUMP_POSTED = 0x8004, diff --git 
a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h index 6c392bad29c19..e6502ab5f049f 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h @@ -226,45 +226,61 @@ struct intel_guc { struct mutex send_mutex; /** - * @timestamp: GT timestamp object that stores a copy of the timestamp - * and adjusts it for overflow using a worker. + * @busy: Data used by the different versions of engine busyness implementations. */ - struct { - /** - * @lock: Lock protecting the below fields and the engine stats. - */ - spinlock_t lock; - - /** - * @gt_stamp: 64 bit extended value of the GT timestamp. - */ - u64 gt_stamp; - - /** - * @ping_delay: Period for polling the GT timestamp for - * overflow. - */ - unsigned long ping_delay; - - /** - * @work: Periodic work to adjust GT timestamp, engine and - * context usage for overflows. - */ - struct delayed_work work; - + union { /** - * @shift: Right shift value for the gpm timestamp + * @v1: Data used by v1 engine busyness implementation. Mostly a copy + * of the GT timestamp extended to 64 bits and the worker for maintaining it. */ - u32 shift; + struct { + /** + * @lock: Lock protecting the below fields and the engine stats. + */ + spinlock_t lock; + + /** + * @gt_stamp: 64 bit extended value of the GT timestamp. + */ + u64 gt_stamp; + + /** + * @ping_delay: Period for polling the GT timestamp for + * overflow. + */ + unsigned long ping_delay; + + /** + * @work: Periodic work to adjust GT timestamp, engine and + * context usage for overflows. + */ + struct delayed_work work; + + /** + * @shift: Right shift value for the gpm timestamp + */ + u32 shift; + + /** + * @last_stat_jiffies: jiffies at last actual stats collection time + * We use this timestamp to ensure we don't oversample the + * stats because runtime power management events can trigger + * stats collection at much higher rates than required. 
+ */ + unsigned long last_stat_jiffies; + } v1; /** - * @last_stat_jiffies: jiffies at last actual stats collection time - * We use this timestamp to ensure we don't oversample the - * stats because runtime power management events can trigger - * stats collection at much higher rates than required. + * @v2: Data used by v2 engine busyness implementation - a memory object + * that is filled in by the GuC and read by the driver. */ - unsigned long last_stat_jiffies; - } timestamp; + struct { + /** @device_vma: object allocated to hold the device level busyness data */ + struct i915_vma *device_vma; + /** @device_map: access object for @device_vma */ + struct iosys_map device_map; + } v2; + } busy; /** * @dead_guc_worker: Asynchronous worker thread for forcing a GuC reset. diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c index 63724e17829a7..1ce595d6816f7 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c @@ -59,7 +59,10 @@ struct __guc_ads_blob { struct guc_ads ads; struct guc_policies policies; struct guc_gt_system_info system_info; - struct guc_engine_usage engine_usage; + union { + struct guc_engine_usage v1; + struct guc_function_observation_data v2; + } engine_usage; /* From here on, location is dynamic! Refer to above diagram. 
*/ struct guc_mmio_reg regset[]; } __packed; @@ -948,18 +951,62 @@ void intel_guc_ads_reset(struct intel_guc *guc) guc_ads_private_data_reset(guc); } -u32 intel_guc_engine_usage_offset(struct intel_guc *guc) +u32 intel_guc_engine_usage_offset_pf(struct intel_guc *guc) { return intel_guc_ggtt_offset(guc, guc->ads_vma) + offsetof(struct __guc_ads_blob, engine_usage); } -struct iosys_map intel_guc_engine_usage_record_map(struct intel_engine_cs *engine) +struct iosys_map intel_guc_engine_usage_record_map_v1(struct intel_engine_cs *engine) { struct intel_guc *guc = &engine->gt->uc.guc; u8 guc_class = engine_class_to_guc_class(engine->class); size_t offset = offsetof(struct __guc_ads_blob, - engine_usage.engines[guc_class][ilog2(engine->logical_mask)]); + engine_usage.v1.engines[guc_class][ilog2(engine->logical_mask)]); return IOSYS_MAP_INIT_OFFSET(&guc->ads_map, offset); } + +int intel_guc_engine_usage_record_map_v2(struct intel_guc *guc, + struct intel_engine_cs *engine, + u32 guc_vf, + struct iosys_map *engine_map, + struct iosys_map *global_map) +{ + size_t offset_global, offset_engine; + struct iosys_map *map; + u32 instance; + u8 guc_class; + + if (engine) { + guc_class = engine_class_to_guc_class(engine->class); + instance = ilog2(engine->logical_mask); + } + + if (guc_vf >= GUC_MAX_VF_COUNT) { + if (guc_vf != ~0U) { + guc_err(guc, "Out of range VF in busyness query: 0x%X\n", guc_vf); + return -EINVAL; + } + + map = &guc->busy.v2.device_map; + offset_global = 0; + + if (engine) + offset_engine = offsetof(struct guc_engine_observation_data, + engine_data[guc_class][instance]); + } else { + map = &guc->ads_map; + offset_global = offsetof(struct __guc_ads_blob, + engine_usage.v2.function_data[guc_vf]); + if (engine) + offset_engine = offsetof(struct __guc_ads_blob, + engine_usage.v2.function_data[guc_vf].engine_data[guc_class][instance]); + } + + *global_map = IOSYS_MAP_INIT_OFFSET(map, offset_global); + if (engine) + *engine_map = IOSYS_MAP_INIT_OFFSET(map, 
offset_engine); + + return 0; +} diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h index 1c64f4d6ea21e..75c9916c96ed4 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h @@ -19,7 +19,12 @@ void intel_guc_ads_init_late(struct intel_guc *guc); void intel_guc_ads_reset(struct intel_guc *guc); void intel_guc_ads_print_policy_info(struct intel_guc *guc, struct drm_printer *p); -struct iosys_map intel_guc_engine_usage_record_map(struct intel_engine_cs *engine); -u32 intel_guc_engine_usage_offset(struct intel_guc *guc); +struct iosys_map intel_guc_engine_usage_record_map_v1(struct intel_engine_cs *engine); +int intel_guc_engine_usage_record_map_v2(struct intel_guc *guc, + struct intel_engine_cs *engine, + u32 vf_idx, + struct iosys_map *engine_map, + struct iosys_map *global_map); +u32 intel_guc_engine_usage_offset_pf(struct intel_guc *guc); #endif diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h index b4d56eccfb1f0..91bbda8cf5370 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h @@ -44,9 +44,12 @@ #define GUC_LAST_ENGINE_CLASS GUC_GSC_OTHER_CLASS #define GUC_MAX_ENGINE_CLASSES 16 #define GUC_MAX_INSTANCES_PER_CLASS 32 +#define GUC_MAX_OAG_COUNTERS 8 #define GUC_DOORBELL_INVALID 256 +#define GUC_MAX_VF_COUNT 64 + /* * Work queue item header definitions * @@ -431,7 +434,7 @@ struct guc_ads { u32 reserved[14]; } __packed; -/* Engine usage stats */ +/* Engine usage stats - v1 */ struct guc_engine_usage_record { u32 current_context_index; u32 last_switch_in_stamp; @@ -444,6 +447,24 @@ struct guc_engine_usage { struct guc_engine_usage_record engines[GUC_MAX_ENGINE_CLASSES][GUC_MAX_INSTANCES_PER_CLASS]; } __packed; +/* Engine usage stats - v2 */ +struct guc_engine_data { + u64 total_execution_ticks; + u64 reserved; +} __packed; + +struct guc_engine_observation_data { 
+ struct guc_engine_data engine_data[GUC_MAX_ENGINE_CLASSES][GUC_MAX_INSTANCES_PER_CLASS]; + u64 oag_busy_data[GUC_MAX_OAG_COUNTERS]; + u64 total_active_ticks; + u64 gt_timestamp; + u64 reserved1; +} __packed; + +struct guc_function_observation_data { + struct guc_engine_observation_data function_data[GUC_MAX_VF_COUNT]; +} __packed; + /* GuC logging structures */ enum guc_log_buffer_type { diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index cabdc645fcddb..88465d701c278 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1139,7 +1139,7 @@ static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc) } /* - * GuC stores busyness stats for each engine at context in/out boundaries. A + * GuC < 70.11.1 stores busyness stats for each engine at context in/out boundaries. A * context 'in' logs execution start time, 'out' adds in -> out delta to total. * i915/kmd accesses 'start', 'total' and 'context id' from memory shared with * GuC. @@ -1161,23 +1161,23 @@ static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc) * 27 seconds for a gt clock frequency of 19.2 MHz). 
*/ -#define WRAP_TIME_CLKS U32_MAX -#define POLL_TIME_CLKS (WRAP_TIME_CLKS >> 3) +#define BUSY_V1_WRAP_TIME_CLKS U32_MAX +#define BUSY_V1_POLL_TIME_CLKS (BUSY_V1_WRAP_TIME_CLKS >> 3) static void -__extend_last_switch(struct intel_guc *guc, u64 *prev_start, u32 new_start) +__busy_v1_extend_last_switch(struct intel_guc *guc, u64 *prev_start, u32 new_start) { - u32 gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp); - u32 gt_stamp_last = lower_32_bits(guc->timestamp.gt_stamp); + u32 gt_stamp_hi = upper_32_bits(guc->busy.v1.gt_stamp); + u32 gt_stamp_last = lower_32_bits(guc->busy.v1.gt_stamp); if (new_start == lower_32_bits(*prev_start)) return; /* * When gt is unparked, we update the gt timestamp and start the ping - * worker that updates the gt_stamp every POLL_TIME_CLKS. As long as gt + * worker that updates the gt_stamp every BUSY_V1_POLL_TIME_CLKS. As long as gt * is unparked, all switched in contexts will have a start time that is - * within +/- POLL_TIME_CLKS of the most recent gt_stamp. + * within +/- BUSY_V1_POLL_TIME_CLKS of the most recent gt_stamp. * * If neither gt_stamp nor new_start has rolled over, then the * gt_stamp_hi does not need to be adjusted, however if one of them has @@ -1187,19 +1187,16 @@ __extend_last_switch(struct intel_guc *guc, u64 *prev_start, u32 new_start) * gt_stamp_last rollover respectively. */ if (new_start < gt_stamp_last && - (new_start - gt_stamp_last) <= POLL_TIME_CLKS) + (new_start - gt_stamp_last) <= BUSY_V1_POLL_TIME_CLKS) gt_stamp_hi++; if (new_start > gt_stamp_last && - (gt_stamp_last - new_start) <= POLL_TIME_CLKS && gt_stamp_hi) + (gt_stamp_last - new_start) <= BUSY_V1_POLL_TIME_CLKS && gt_stamp_hi) gt_stamp_hi--; *prev_start = ((u64)gt_stamp_hi << 32) | new_start; } -#define record_read(map_, field_) \ - iosys_map_rd_field(map_, 0, struct guc_engine_usage_record, field_) - /* * GuC updates shared memory and KMD reads it. Since this is not synchronized, * we run into a race where the value read is inconsistent. 
Sometimes the @@ -1211,12 +1208,15 @@ __extend_last_switch(struct intel_guc *guc, u64 *prev_start, u32 new_start) * values. The upper bound is set to 6 attempts and may need to be tuned as per * any new occurences. */ -static void __get_engine_usage_record(struct intel_engine_cs *engine, - u32 *last_in, u32 *id, u32 *total) +static void __busy_v1_get_engine_usage_record(struct intel_engine_cs *engine, + u32 *last_in, u32 *id, u32 *total) { - struct iosys_map rec_map = intel_guc_engine_usage_record_map(engine); + struct iosys_map rec_map = intel_guc_engine_usage_record_map_v1(engine); int i = 0; +#define record_read(map_, field_) \ + iosys_map_rd_field(map_, 0, struct guc_engine_usage_record, field_) + do { *last_in = record_read(&rec_map, last_switch_in_stamp); *id = record_read(&rec_map, current_context_index); @@ -1227,21 +1227,23 @@ static void __get_engine_usage_record(struct intel_engine_cs *engine, record_read(&rec_map, total_runtime) == *total) break; } while (++i < 6); + +#undef record_read } -static void guc_update_engine_gt_clks(struct intel_engine_cs *engine) +static void busy_v1_guc_update_engine_gt_clks(struct intel_engine_cs *engine) { - struct intel_engine_guc_stats *stats = &engine->stats.guc; + struct intel_engine_guc_stats_v1 *stats = &engine->stats.guc_v1; struct intel_guc *guc = &engine->gt->uc.guc; u32 last_switch, ctx_id, total; - lockdep_assert_held(&guc->timestamp.lock); + lockdep_assert_held(&guc->busy.v1.lock); - __get_engine_usage_record(engine, &last_switch, &ctx_id, &total); + __busy_v1_get_engine_usage_record(engine, &last_switch, &ctx_id, &total); stats->running = ctx_id != ~0U && last_switch; if (stats->running) - __extend_last_switch(guc, &stats->start_gt_clk, last_switch); + __busy_v1_extend_last_switch(guc, &stats->start_gt_clk, last_switch); /* * Instead of adjusting the total for overflow, just add the @@ -1253,7 +1255,7 @@ static void guc_update_engine_gt_clks(struct intel_engine_cs *engine) } } -static u32 
gpm_timestamp_shift(struct intel_gt *gt) +static u32 busy_v1_gpm_timestamp_shift(struct intel_gt *gt) { intel_wakeref_t wakeref; u32 reg, shift; @@ -1267,24 +1269,24 @@ static u32 gpm_timestamp_shift(struct intel_gt *gt) return 3 - shift; } -static void guc_update_pm_timestamp(struct intel_guc *guc, ktime_t *now) +static void busy_v1_guc_update_pm_timestamp(struct intel_guc *guc, ktime_t *now) { struct intel_gt *gt = guc_to_gt(guc); u32 gt_stamp_lo, gt_stamp_hi; u64 gpm_ts; - lockdep_assert_held(&guc->timestamp.lock); + lockdep_assert_held(&guc->busy.v1.lock); - gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp); + gt_stamp_hi = upper_32_bits(guc->busy.v1.gt_stamp); gpm_ts = intel_uncore_read64_2x32(gt->uncore, MISC_STATUS0, - MISC_STATUS1) >> guc->timestamp.shift; + MISC_STATUS1) >> guc->busy.v1.shift; gt_stamp_lo = lower_32_bits(gpm_ts); *now = ktime_get(); - if (gt_stamp_lo < lower_32_bits(guc->timestamp.gt_stamp)) + if (gt_stamp_lo < lower_32_bits(guc->busy.v1.gt_stamp)) gt_stamp_hi++; - guc->timestamp.gt_stamp = ((u64)gt_stamp_hi << 32) | gt_stamp_lo; + guc->busy.v1.gt_stamp = ((u64)gt_stamp_hi << 32) | gt_stamp_lo; } /* @@ -1292,9 +1294,9 @@ static void guc_update_pm_timestamp(struct intel_guc *guc, ktime_t *now) * gt clocks. The *now parameter is retained to return the cpu time at which the * busyness was sampled. 
*/ -static ktime_t guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now) +static ktime_t busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now) { - struct intel_engine_guc_stats stats_saved, *stats = &engine->stats.guc; + struct intel_engine_guc_stats_v1 stats_saved, *stats = &engine->stats.guc_v1; struct i915_gpu_error *gpu_error = &engine->i915->gpu_error; struct intel_gt *gt = engine->gt; struct intel_guc *guc = >->uc.guc; @@ -1303,7 +1305,7 @@ static ktime_t guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now) u32 reset_count; bool in_reset; - spin_lock_irqsave(&guc->timestamp.lock, flags); + spin_lock_irqsave(&guc->busy.v1.lock, flags); /* * If a reset happened, we risk reading partially updated engine @@ -1326,43 +1328,43 @@ static ktime_t guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now) */ if (!in_reset && intel_gt_pm_get_if_awake(gt)) { stats_saved = *stats; - gt_stamp_saved = guc->timestamp.gt_stamp; + gt_stamp_saved = guc->busy.v1.gt_stamp; /* * Update gt_clks, then gt timestamp to simplify the 'gt_stamp - * start_gt_clk' calculation below for active engines. 
*/ - guc_update_engine_gt_clks(engine); - guc_update_pm_timestamp(guc, now); + busy_v1_guc_update_engine_gt_clks(engine); + busy_v1_guc_update_pm_timestamp(guc, now); intel_gt_pm_put_async(gt); if (i915_reset_count(gpu_error) != reset_count) { *stats = stats_saved; - guc->timestamp.gt_stamp = gt_stamp_saved; + guc->busy.v1.gt_stamp = gt_stamp_saved; } } total = intel_gt_clock_interval_to_ns(gt, stats->total_gt_clks); if (stats->running) { - u64 clk = guc->timestamp.gt_stamp - stats->start_gt_clk; + u64 clk = guc->busy.v1.gt_stamp - stats->start_gt_clk; total += intel_gt_clock_interval_to_ns(gt, clk); } - spin_unlock_irqrestore(&guc->timestamp.lock, flags); + spin_unlock_irqrestore(&guc->busy.v1.lock, flags); return ns_to_ktime(total); } -static void guc_enable_busyness_worker(struct intel_guc *guc) +static void busy_v1_guc_enable_worker(struct intel_guc *guc) { - mod_delayed_work(system_highpri_wq, &guc->timestamp.work, guc->timestamp.ping_delay); + mod_delayed_work(system_highpri_wq, &guc->busy.v1.work, guc->busy.v1.ping_delay); } -static void guc_cancel_busyness_worker(struct intel_guc *guc) +static void busy_v1_guc_cancel_worker(struct intel_guc *guc) { - cancel_delayed_work_sync(&guc->timestamp.work); + cancel_delayed_work_sync(&guc->busy.v1.work); } -static void __reset_guc_busyness_stats(struct intel_guc *guc) +static void __busy_v1_reset_guc_busyness_stats(struct intel_guc *guc) { struct intel_gt *gt = guc_to_gt(guc); struct intel_engine_cs *engine; @@ -1370,20 +1372,20 @@ static void __reset_guc_busyness_stats(struct intel_guc *guc) unsigned long flags; ktime_t unused; - guc_cancel_busyness_worker(guc); + busy_v1_guc_cancel_worker(guc); - spin_lock_irqsave(&guc->timestamp.lock, flags); + spin_lock_irqsave(&guc->busy.v1.lock, flags); - guc_update_pm_timestamp(guc, &unused); + busy_v1_guc_update_pm_timestamp(guc, &unused); for_each_engine(engine, gt, id) { - guc_update_engine_gt_clks(engine); - engine->stats.guc.prev_total = 0; + 
busy_v1_guc_update_engine_gt_clks(engine); + engine->stats.guc_v1.prev_total = 0; } - spin_unlock_irqrestore(&guc->timestamp.lock, flags); + spin_unlock_irqrestore(&guc->busy.v1.lock, flags); } -static void __update_guc_busyness_stats(struct intel_guc *guc) +static void __busy_v1_update_guc_busyness_stats(struct intel_guc *guc) { struct intel_gt *gt = guc_to_gt(guc); struct intel_engine_cs *engine; @@ -1391,25 +1393,34 @@ static void __update_guc_busyness_stats(struct intel_guc *guc) unsigned long flags; ktime_t unused; - guc->timestamp.last_stat_jiffies = jiffies; + guc->busy.v1.last_stat_jiffies = jiffies; - spin_lock_irqsave(&guc->timestamp.lock, flags); + spin_lock_irqsave(&guc->busy.v1.lock, flags); - guc_update_pm_timestamp(guc, &unused); + busy_v1_guc_update_pm_timestamp(guc, &unused); for_each_engine(engine, gt, id) - guc_update_engine_gt_clks(engine); + busy_v1_guc_update_engine_gt_clks(engine); - spin_unlock_irqrestore(&guc->timestamp.lock, flags); + spin_unlock_irqrestore(&guc->busy.v1.lock, flags); } -static void __guc_context_update_stats(struct intel_context *ce) +static void __busy_v1_guc_context_update_stats(struct intel_context *ce) { struct intel_guc *guc = ce_to_guc(ce); unsigned long flags; - spin_lock_irqsave(&guc->timestamp.lock, flags); + spin_lock_irqsave(&guc->busy.v1.lock, flags); + lrc_update_runtime(ce); + spin_unlock_irqrestore(&guc->busy.v1.lock, flags); +} + +static void __busy_v2_guc_context_update_stats(struct intel_context *ce) +{ + /* + * Need to ping periodically? + * Won't work for long running though, because relies on a context switch?! 
+ */ lrc_update_runtime(ce); - spin_unlock_irqrestore(&guc->timestamp.lock, flags); } static void guc_context_update_stats(struct intel_context *ce) @@ -1417,14 +1428,18 @@ static void guc_context_update_stats(struct intel_context *ce) if (!intel_context_pin_if_active(ce)) return; - __guc_context_update_stats(ce); + if (GUC_SUBMIT_VER(ce_to_guc(ce)) < MAKE_GUC_VER(1, 3, 1)) + __busy_v1_guc_context_update_stats(ce); + else + __busy_v2_guc_context_update_stats(ce); + intel_context_unpin(ce); } -static void guc_timestamp_ping(struct work_struct *wrk) +static void busy_v1_guc_timestamp_ping(struct work_struct *wrk) { struct intel_guc *guc = container_of(wrk, typeof(*guc), - timestamp.work.work); + busy.v1.work.work); struct intel_uc *uc = container_of(guc, typeof(*uc), guc); struct intel_gt *gt = guc_to_gt(guc); struct intel_context *ce; @@ -1443,7 +1458,7 @@ static void guc_timestamp_ping(struct work_struct *wrk) return; with_intel_runtime_pm(>->i915->runtime_pm, wakeref) - __update_guc_busyness_stats(guc); + __busy_v1_update_guc_busyness_stats(guc); /* adjust context stats for overflow */ xa_for_each(&guc->context_lookup, index, ce) @@ -1451,14 +1466,164 @@ static void guc_timestamp_ping(struct work_struct *wrk) intel_gt_reset_unlock(gt, srcu); - guc_enable_busyness_worker(guc); + busy_v1_guc_enable_worker(guc); } -static int guc_action_enable_usage_stats(struct intel_guc *guc) +static int busy_v1_guc_action_enable_usage_stats(struct intel_guc *guc) { - u32 offset = intel_guc_engine_usage_offset(guc); + u32 offset = intel_guc_engine_usage_offset_pf(guc); u32 action[] = { - INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF, + INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF_V1, + offset, + 0, + }; + + return intel_guc_send(guc, action, ARRAY_SIZE(action)); +} + +/* + * GuC >= 70.11.1 maintains busyness counters in a shared memory buffer for each + * engine on a continuous basis. The counters are all 64bits and count in clock + * ticks. 
The values are updated on context switch events and periodically on a + timer internal to GuC. The update rate is guaranteed to be at least 2Hz (but + with the caveat that GuC is not a real-time OS so best effort only). + * + * In addition to an engine active time count, there is also a total time count. + * For native, this is only a free-running GT timestamp counter. For PF/VF, + * there is also a function active counter - how many ticks the VF or PF has had + * available for execution. + * + * Note that the counters should only be used as ratios of each other for + * calculating a percentage. No guarantees are made about frequencies for + * conversions to wall time, etc. + * + * ticks_engine: clock ticks for which engine was active + * ticks_function: clock ticks owned by this VF + * ticks_gt: total clock ticks + * + * native engine busyness: ticks_engine / ticks_gt + * VF/PF engine busyness: ticks_engine / ticks_function + * VF/PF engine ownership: ticks_function / ticks_gt + */ + +static u32 guc_engine_usage_offset_v2_device(struct intel_guc *guc) +{ + return intel_guc_ggtt_offset(guc, guc->busy.v2.device_vma); +} + +static int guc_busy_v2_alloc_device(struct intel_guc *guc) +{ + size_t size = sizeof(struct guc_engine_observation_data); + void *busy_v2_ptr; + int ret; + + /* + * When I915_WA_FORCE_SMEM_OBJECT is enabled we normally create objects + * in SMEM but guc_ads is not accessed by the host and it has + * requirement that physical pages are contiguous in memory for this + * vma. Hence always create guc_ads object in LMEM.
+ */ + ret = intel_guc_allocate_and_map_vma(guc, size, &guc->busy.v2.device_vma, &busy_v2_ptr); + if (ret) + return ret; + + if (i915_gem_object_is_lmem(guc->busy.v2.device_vma->obj)) + iosys_map_set_vaddr_iomem(&guc->busy.v2.device_map, (void __iomem *)busy_v2_ptr); + else + iosys_map_set_vaddr(&guc->busy.v2.device_map, busy_v2_ptr); + + return 0; +} + +static void guc_busy_v2_free_device(struct intel_guc *guc) +{ + i915_vma_unpin_and_release(&guc->busy.v2.device_vma, I915_VMA_RELEASE_MAP); + iosys_map_clear(&guc->busy.v2.device_map); + + guc->busy.v2.device_vma = NULL; +} + +static void __busy_v2_get_engine_usage_record(struct intel_guc *guc, + struct intel_engine_cs *engine, + u64 *_ticks_engine, u64 *_ticks_function, + u64 *_ticks_gt) +{ + struct iosys_map rec_map_engine, rec_map_global; + u64 ticks_engine, ticks_function, ticks_gt; + int i = 0, ret; + + ret = intel_guc_engine_usage_record_map_v2(guc, engine, ~0U, + &rec_map_engine, &rec_map_global); + if (ret) { + ticks_engine = 0; + ticks_function = 0; + ticks_gt = 0; + goto done; + } + +#define record_read_engine(map_, field_) \ + iosys_map_rd_field(map_, 0, struct guc_engine_data, field_) +#define record_read_global(map_, field_) \ + iosys_map_rd_field(map_, 0, struct guc_engine_observation_data, field_) + + do { + if (engine) + ticks_engine = record_read_engine(&rec_map_engine, total_execution_ticks); + ticks_function = record_read_global(&rec_map_global, total_active_ticks); + ticks_gt = record_read_global(&rec_map_global, gt_timestamp); + + if (engine && (record_read_engine(&rec_map_engine, total_execution_ticks) != + ticks_engine)) + continue; + + if (record_read_global(&rec_map_global, total_active_ticks) == ticks_function && + record_read_global(&rec_map_global, gt_timestamp) == ticks_gt) + break; + } while (++i < 6); + +#undef record_read_engine +#undef record_read_global + +done: + if (_ticks_engine) + *_ticks_engine = ticks_engine; + if (_ticks_function) + *_ticks_function = ticks_function; + if 
(_ticks_gt) + *_ticks_gt = ticks_gt; +} + +static ktime_t busy_v2_guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now) +{ + struct intel_gt *gt = engine->gt; + struct intel_guc *guc = >->uc.guc; + u64 ticks_engine; + u64 total; + + __busy_v2_get_engine_usage_record(guc, engine, &ticks_engine, NULL, NULL); + + total = intel_gt_clock_interval_to_ns(gt, ticks_engine); + + return ns_to_ktime(total); +} + +static int busy_v2_guc_action_enable_usage_stats_device(struct intel_guc *guc) +{ + u32 offset = guc_engine_usage_offset_v2_device(guc); + u32 action[] = { + INTEL_GUC_ACTION_SET_DEVICE_ENGINE_UTILIZATION_V2, + offset, + 0, + }; + + return intel_guc_send(guc, action, ARRAY_SIZE(action)); +} + +static int busy_v2_guc_action_enable_usage_stats_function(struct intel_guc *guc) +{ + u32 offset = intel_guc_engine_usage_offset_pf(guc); + u32 action[] = { + INTEL_GUC_ACTION_SET_FUNCTION_ENGINE_UTILIZATION_V2, offset, 0, }; @@ -1472,26 +1637,40 @@ static int guc_init_engine_stats(struct intel_guc *guc) intel_wakeref_t wakeref; int ret; - with_intel_runtime_pm(>->i915->runtime_pm, wakeref) - ret = guc_action_enable_usage_stats(guc); + if (GUC_SUBMIT_VER(guc) < MAKE_GUC_VER(1, 3, 1)) { + with_intel_runtime_pm(>->i915->runtime_pm, wakeref) + ret = busy_v1_guc_action_enable_usage_stats(guc); + + if (ret == 0) + busy_v1_guc_enable_worker(guc); + } else { + with_intel_runtime_pm(>->i915->runtime_pm, wakeref) { + ret = busy_v2_guc_action_enable_usage_stats_device(guc); + + if (ret == 0) + ret = busy_v2_guc_action_enable_usage_stats_function(guc); + } + } if (ret) guc_err(guc, "Failed to enable usage stats: %pe\n", ERR_PTR(ret)); - else - guc_enable_busyness_worker(guc); return ret; } static void guc_fini_engine_stats(struct intel_guc *guc) { - guc_cancel_busyness_worker(guc); + if (GUC_SUBMIT_VER(guc) < MAKE_GUC_VER(1, 3, 1)) + busy_v1_guc_cancel_worker(guc); } void intel_guc_busyness_park(struct intel_gt *gt) { struct intel_guc *guc = >->uc.guc; + if 
(GUC_SUBMIT_VER(guc) >= MAKE_GUC_VER(1, 3, 1)) + return; + if (!guc_submission_initialized(guc)) return; @@ -1500,19 +1679,19 @@ void intel_guc_busyness_park(struct intel_gt *gt) * and causes an unclaimed register access warning. Cancel the worker * synchronously here. */ - guc_cancel_busyness_worker(guc); + busy_v1_guc_cancel_worker(guc); /* * Before parking, we should sample engine busyness stats if we need to. * We can skip it if we are less than half a ping from the last time we * sampled the busyness stats. */ - if (guc->timestamp.last_stat_jiffies && - !time_after(jiffies, guc->timestamp.last_stat_jiffies + - (guc->timestamp.ping_delay / 2))) + if (guc->busy.v1.last_stat_jiffies && + !time_after(jiffies, guc->busy.v1.last_stat_jiffies + + (guc->busy.v1.ping_delay / 2))) return; - __update_guc_busyness_stats(guc); + __busy_v1_update_guc_busyness_stats(guc); } void intel_guc_busyness_unpark(struct intel_gt *gt) @@ -1521,13 +1700,16 @@ void intel_guc_busyness_unpark(struct intel_gt *gt) unsigned long flags; ktime_t unused; + if (GUC_SUBMIT_VER(guc) >= MAKE_GUC_VER(1, 3, 1)) + return; + if (!guc_submission_initialized(guc)) return; - spin_lock_irqsave(&guc->timestamp.lock, flags); - guc_update_pm_timestamp(guc, &unused); - spin_unlock_irqrestore(&guc->timestamp.lock, flags); - guc_enable_busyness_worker(guc); + spin_lock_irqsave(&guc->busy.v1.lock, flags); + busy_v1_guc_update_pm_timestamp(guc, &unused); + spin_unlock_irqrestore(&guc->busy.v1.lock, flags); + busy_v1_guc_enable_worker(guc); } static inline bool @@ -1590,7 +1772,9 @@ void intel_guc_submission_reset_prepare(struct intel_guc *guc) intel_gt_park_heartbeats(guc_to_gt(guc)); disable_submission(guc); guc->interrupts.disable(guc); - __reset_guc_busyness_stats(guc); + + if (GUC_SUBMIT_VER(guc) < MAKE_GUC_VER(1, 3, 1)) + __busy_v1_reset_guc_busyness_stats(guc); /* Flush IRQ handler */ spin_lock_irq(guc_to_gt(guc)->irq_lock); @@ -1922,7 +2106,6 @@ static void reset_fail_worker_func(struct work_struct *w); */ 
int intel_guc_submission_init(struct intel_guc *guc) { - struct intel_gt *gt = guc_to_gt(guc); int ret; if (guc->submission_initialized) @@ -1941,12 +2124,23 @@ int intel_guc_submission_init(struct intel_guc *guc) goto destroy_pool; } - guc->timestamp.ping_delay = (POLL_TIME_CLKS / gt->clock_frequency + 1) * HZ; - guc->timestamp.shift = gpm_timestamp_shift(gt); + if (GUC_SUBMIT_VER(guc) < MAKE_GUC_VER(1, 3, 1)) { + struct intel_gt *gt = guc_to_gt(guc); + + guc->busy.v1.ping_delay = (BUSY_V1_POLL_TIME_CLKS / gt->clock_frequency + 1) * HZ; + guc->busy.v1.shift = busy_v1_gpm_timestamp_shift(gt); + } else { + ret = guc_busy_v2_alloc_device(guc); + if (ret) + goto destroy_bitmap; + } + guc->submission_initialized = true; return 0; +destroy_bitmap: + bitmap_free(guc->submission_state.guc_ids_bitmap); destroy_pool: guc_lrc_desc_pool_destroy_v69(guc); @@ -1962,6 +2156,8 @@ void intel_guc_submission_fini(struct intel_guc *guc) guc_lrc_desc_pool_destroy_v69(guc); i915_sched_engine_put(guc->sched_engine); bitmap_free(guc->submission_state.guc_ids_bitmap); + if (GUC_SUBMIT_VER(guc) >= MAKE_GUC_VER(1, 3, 1)) + guc_busy_v2_free_device(guc); guc->submission_initialized = false; } @@ -2797,7 +2993,10 @@ static void guc_context_unpin(struct intel_context *ce) { struct intel_guc *guc = ce_to_guc(ce); - __guc_context_update_stats(ce); + if (GUC_SUBMIT_VER(ce_to_guc(ce)) < MAKE_GUC_VER(1, 3, 1)) + __busy_v1_guc_context_update_stats(ce); + else + __busy_v2_guc_context_update_stats(ce); unpin_guc_id(guc, ce); lrc_unpin(ce); @@ -4257,7 +4456,10 @@ static void guc_default_vfuncs(struct intel_engine_cs *engine) engine->emit_flush = gen12_emit_flush_xcs; } engine->set_default_submission = guc_set_default_submission; - engine->busyness = guc_engine_busyness; + if (GUC_SUBMIT_VER(&engine->gt->uc.guc) < MAKE_GUC_VER(1, 3, 1)) + engine->busyness = busy_v1_guc_engine_busyness; + else + engine->busyness = busy_v2_guc_engine_busyness; engine->flags |= I915_ENGINE_SUPPORTS_STATS; engine->flags |= 
I915_ENGINE_HAS_PREEMPTION; @@ -4501,7 +4703,8 @@ int intel_guc_submission_enable(struct intel_guc *guc) /* Note: By the time we're here, GuC may have already been reset */ void intel_guc_submission_disable(struct intel_guc *guc) { - guc_cancel_busyness_worker(guc); + if (GUC_SUBMIT_VER(guc) < MAKE_GUC_VER(1, 3, 1)) + busy_v1_guc_cancel_worker(guc); /* Semaphore interrupt disable and route to host */ guc_route_semaphores(guc, false); @@ -4557,8 +4760,10 @@ void intel_guc_submission_init_early(struct intel_guc *guc) INIT_WORK(&guc->submission_state.reset_fail_worker, reset_fail_worker_func); - spin_lock_init(&guc->timestamp.lock); - INIT_DELAYED_WORK(&guc->timestamp.work, guc_timestamp_ping); + if (GUC_SUBMIT_VER(guc) < MAKE_GUC_VER(1, 3, 1)) { + spin_lock_init(&guc->busy.v1.lock); + INIT_DELAYED_WORK(&guc->busy.v1.work, busy_v1_guc_timestamp_ping); + } guc->submission_state.sched_disable_delay_ms = SCHED_DISABLE_DELAY_MS; guc->submission_state.num_guc_ids = GUC_MAX_CONTEXT_ID;

From patchwork Fri Sep 22 22:25:09 2023
X-Patchwork-Submitter: John Harrison
X-Patchwork-Id: 13396416
From: John.C.Harrison@Intel.com
To: Intel-GFX@Lists.FreeDesktop.Org
Date: Fri, 22 Sep 2023 15:25:09 -0700
Message-ID: <20230922222510.2235213-3-John.C.Harrison@Intel.com>
In-Reply-To: <20230922222510.2235213-1-John.C.Harrison@Intel.com>
References: <20230922222510.2235213-1-John.C.Harrison@Intel.com>
Subject: [Intel-gfx] [PATCH 2/3] drm/i915/mtl: Add a PMU counter for total active ticks
Cc: DRI-Devel@Lists.FreeDesktop.Org

From: Umesh Nerlige Ramappa

The current engine busyness interface exposed by the GuC has a few issues:

- The busyness of an active engine is calculated using 2 values provided
  by the GuC and is prone to a race between the CPU reading those values
  and the GuC updating them. Any sort of HW synchronization would come at
  the cost of scheduling latencies.

- The GuC provides only 32 bit values for busyness, so the KMD has to run
  a worker to extend the values to 64 bits. In addition, the KMD also
  needs to extend the GT timestamp to 64 bits so that it can be used to
  calculate the active busyness of an engine.

To address these issues, the GuC provides a new interface for calculating
engine busyness. The GuC accumulates the busyness ticks in a 64 bit value
and also internally updates the busyness of an active context using a
periodic timer. This simplifies the KMD implementation such that the KMD
only needs to relay the busyness value to the user.

In addition to fixing the interface, the GuC also periodically provides
the total active ticks that the GT has been running for. This counter is
exposed to the user so that busyness % can be calculated as follows:

  busyness % = (engine active ticks / total active ticks) * 100

Implement the new interface, starting with a new counter for total active
ticks.
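The percentage formula above can be sketched as a small userspace helper. This is an illustrative sketch only: the struct and function names are hypothetical, and real code would obtain the two tick counts by reading the corresponding PMU events. The division is done on the delta between two snapshots, so the result reflects the sampling interval rather than the whole uptime.

```c
#include <stdint.h>

/* One snapshot of the two tick counters described above (hypothetical type). */
struct busy_sample {
	uint64_t engine_active_ticks; /* per-engine busyness ticks */
	uint64_t total_active_ticks;  /* GT total active ticks */
};

/*
 * busyness % = (engine active ticks / total active ticks) * 100,
 * computed over the delta between two snapshots.
 */
static double busyness_pct(const struct busy_sample *prev,
			   const struct busy_sample *cur)
{
	uint64_t engine = cur->engine_active_ticks - prev->engine_active_ticks;
	uint64_t total = cur->total_active_ticks - prev->total_active_ticks;

	if (!total)
		return 0.0; /* GT was idle for the whole interval */

	return (double)engine * 100.0 / (double)total;
}
```

Because both counters tick in the same GT clock domain, no conversion to wall time is needed, which is exactly the accuracy problem this series avoids.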
Signed-off-by: Umesh Nerlige Ramappa Signed-off-by: John Harrison --- .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 24 +++++++++++++++++++ .../gpu/drm/i915/gt/uc/intel_guc_submission.h | 1 + drivers/gpu/drm/i915/i915_pmu.c | 6 +++++ include/uapi/drm/i915_drm.h | 2 ++ 4 files changed, 33 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 88465d701c278..0c1fee5360777 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1607,6 +1607,30 @@ static ktime_t busy_v2_guc_engine_busyness(struct intel_engine_cs *engine, ktime return ns_to_ktime(total); } +static u64 busy_v1_intel_guc_total_active_ticks(struct intel_guc *guc) +{ + return guc->busy.v1.gt_stamp; +} + +static u64 busy_v2_intel_guc_total_active_ticks(struct intel_guc *guc) +{ + u64 ticks_gt; + + __busy_v2_get_engine_usage_record(guc, NULL, NULL, NULL, &ticks_gt); + + return ticks_gt; +} + +u64 intel_guc_total_active_ticks(struct intel_gt *gt) +{ + struct intel_guc *guc = &gt->uc.guc; + + if (GUC_SUBMIT_VER(guc) < MAKE_GUC_VER(1, 3, 1)) + return busy_v1_intel_guc_total_active_ticks(guc); + else + return busy_v2_intel_guc_total_active_ticks(guc); +} + static int busy_v2_guc_action_enable_usage_stats_device(struct intel_guc *guc) { u32 offset = guc_engine_usage_offset_v2_device(guc); diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h index c57b29cdb1a64..f6d42838825f2 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h @@ -30,6 +30,7 @@ void intel_guc_dump_active_requests(struct intel_engine_cs *engine, struct drm_printer *m); void intel_guc_busyness_park(struct intel_gt *gt); void intel_guc_busyness_unpark(struct intel_gt *gt); +u64 intel_guc_total_active_ticks(struct intel_gt *gt); bool intel_guc_virtual_engine_has_heartbeat(const
struct intel_engine_cs *ve); diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index d35973b411863..4f52636eb4a80 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -563,6 +563,8 @@ config_status(struct drm_i915_private *i915, u64 config) break; case I915_PMU_SOFTWARE_GT_AWAKE_TIME: break; + case I915_PMU_TOTAL_ACTIVE_TICKS: + break; default: return -ENOENT; } @@ -678,6 +680,9 @@ static u64 __i915_pmu_event_read(struct perf_event *event) case I915_PMU_SOFTWARE_GT_AWAKE_TIME: val = ktime_to_ns(intel_gt_get_awake_time(to_gt(i915))); break; + case I915_PMU_TOTAL_ACTIVE_TICKS: + val = intel_guc_total_active_ticks(i915->gt[gt_id]); + break; } } @@ -986,6 +991,7 @@ create_event_attributes(struct i915_pmu *pmu) __global_event(2, "interrupts", NULL), __event(3, "rc6-residency", "ns"), __event(4, "software-gt-awake-time", "ns"), + __event(5, "total-active-ticks", NULL), }; static const struct { enum drm_i915_pmu_engine_sample sample; diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 7000e5910a1d7..e26dd27ff4a5f 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -296,6 +296,7 @@ enum drm_i915_pmu_engine_sample { #define I915_PMU_INTERRUPTS __I915_PMU_OTHER(2) #define I915_PMU_RC6_RESIDENCY __I915_PMU_OTHER(3) #define I915_PMU_SOFTWARE_GT_AWAKE_TIME __I915_PMU_OTHER(4) +#define I915_PMU_TOTAL_ACTIVE_TICKS __I915_PMU_OTHER(5) #define I915_PMU_LAST /* Deprecated - do not use */ I915_PMU_RC6_RESIDENCY @@ -304,6 +305,7 @@ enum drm_i915_pmu_engine_sample { #define __I915_PMU_INTERRUPTS(gt) ___I915_PMU_OTHER(gt, 2) #define __I915_PMU_RC6_RESIDENCY(gt) ___I915_PMU_OTHER(gt, 3) #define __I915_PMU_SOFTWARE_GT_AWAKE_TIME(gt) ___I915_PMU_OTHER(gt, 4) +#define __I915_PMU_TOTAL_ACTIVE_TICKS(gt) ___I915_PMU_OTHER(gt, 5) /* Each region is a minimum of 16k, and there are at most 255 of them. 
*/

From patchwork Fri Sep 22 22:25:10 2023
X-Patchwork-Submitter: John Harrison
X-Patchwork-Id: 13396417
From: John.C.Harrison@Intel.com
To: Intel-GFX@Lists.FreeDesktop.Org
Date: Fri, 22 Sep 2023 15:25:10 -0700
Message-ID: <20230922222510.2235213-4-John.C.Harrison@Intel.com>
In-Reply-To: <20230922222510.2235213-1-John.C.Harrison@Intel.com>
References: <20230922222510.2235213-1-John.C.Harrison@Intel.com>
Subject: [Intel-gfx] [PATCH 3/3] drm/i915/mtl: Add counters for engine busyness ticks
Cc: DRI-Devel@Lists.FreeDesktop.Org

From: Umesh Nerlige Ramappa

In the new version of GuC engine busyness, the GuC provides the engine
busyness ticks as a 64 bit counter. Add a new counter to relay this value
to the user as-is.
Signed-off-by: Umesh Nerlige Ramappa Signed-off-by: John Harrison --- drivers/gpu/drm/i915/gt/intel_engine.h | 1 + drivers/gpu/drm/i915/gt/intel_engine_cs.c | 16 +++++ drivers/gpu/drm/i915/gt/intel_engine_types.h | 12 ++++ drivers/gpu/drm/i915/gt/intel_engine_user.c | 1 + .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 67 ++++++++++++++----- drivers/gpu/drm/i915/i915_pmu.c | 25 ++++++- drivers/gpu/drm/i915/i915_pmu.h | 2 +- include/uapi/drm/i915_drm.h | 13 +++- 8 files changed, 116 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h index b58c30ac8ef02..57af7ec8ecd82 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine.h +++ b/drivers/gpu/drm/i915/gt/intel_engine.h @@ -249,6 +249,7 @@ void intel_engine_dump_active_requests(struct list_head *requests, ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now); +u64 intel_engine_get_busy_ticks(struct intel_engine_cs *engine); void intel_engine_get_hung_entity(struct intel_engine_cs *engine, struct intel_context **ce, struct i915_request **rq); diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index 84a75c95f3f7d..1c9ffb1ae9889 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -2426,6 +2426,22 @@ ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now) return engine->busyness(engine, now); } +/** + * intel_engine_get_busy_ticks() - Return current accumulated engine busyness + * ticks + * @engine: engine to report on + * + * Returns accumulated ticks @engine was busy since engine stats were enabled. 
+ */ +u64 intel_engine_get_busy_ticks(struct intel_engine_cs *engine) +{ + if (!engine->busyness_ticks || + !(engine->flags & I915_ENGINE_SUPPORTS_TICKS_STATS)) + return 0; + + return engine->busyness_ticks(engine); +} + struct intel_context * intel_engine_create_virtual(struct intel_engine_cs **siblings, unsigned int count, unsigned long flags) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h index 40fd8f984d64b..a88d40c74d604 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h @@ -548,6 +548,11 @@ struct intel_engine_cs { ktime_t (*busyness)(struct intel_engine_cs *engine, ktime_t *now); + /* + * Get engine busyness ticks + */ + u64 (*busyness_ticks)(struct intel_engine_cs *engine); + struct intel_engine_execlists execlists; /* @@ -574,6 +579,7 @@ struct intel_engine_cs { #define I915_ENGINE_HAS_EU_PRIORITY BIT(10) #define I915_ENGINE_FIRST_RENDER_COMPUTE BIT(11) #define I915_ENGINE_USES_WA_HOLD_CCS_SWITCHOUT BIT(12) +#define I915_ENGINE_SUPPORTS_TICKS_STATS BIT(13) unsigned int flags; /* @@ -649,6 +655,12 @@ intel_engine_supports_stats(const struct intel_engine_cs *engine) return engine->flags & I915_ENGINE_SUPPORTS_STATS; } +static inline bool +intel_engine_supports_tick_stats(const struct intel_engine_cs *engine) +{ + return engine->flags & I915_ENGINE_SUPPORTS_TICKS_STATS; +} + static inline bool intel_engine_has_preemption(const struct intel_engine_cs *engine) { diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c b/drivers/gpu/drm/i915/gt/intel_engine_user.c index dcedff41a825f..69eb610b5ab0a 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_user.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c @@ -100,6 +100,7 @@ static void set_scheduler_caps(struct drm_i915_private *i915) MAP(HAS_PREEMPTION, PREEMPTION), MAP(HAS_SEMAPHORES, SEMAPHORES), MAP(SUPPORTS_STATS, ENGINE_BUSY_STATS), + MAP(SUPPORTS_TICKS_STATS, ENGINE_BUSY_TICKS_STATS), #undef 
MAP }; struct intel_engine_cs *engine; diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 0c1fee5360777..71749fb9ad35b 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1289,12 +1289,7 @@ static void busy_v1_guc_update_pm_timestamp(struct intel_guc *guc, ktime_t *now) guc->busy.v1.gt_stamp = ((u64)gt_stamp_hi << 32) | gt_stamp_lo; } -/* - * Unlike the execlist mode of submission total and active times are in terms of - * gt clocks. The *now parameter is retained to return the cpu time at which the - * busyness was sampled. - */ -static ktime_t busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now) +static u64 __busy_v1_guc_engine_busyness_ticks(struct intel_engine_cs *engine, ktime_t *now_out) { struct intel_engine_guc_stats_v1 stats_saved, *stats = &engine->stats.guc_v1; struct i915_gpu_error *gpu_error = &engine->i915->gpu_error; @@ -1304,6 +1299,7 @@ static ktime_t busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime unsigned long flags; u32 reset_count; bool in_reset; + ktime_t now; spin_lock_irqsave(&guc->busy.v1.lock, flags); @@ -1318,7 +1314,7 @@ static ktime_t busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime reset_count = i915_reset_count(gpu_error); in_reset = test_bit(I915_RESET_BACKOFF, &gt->reset.flags); - *now = ktime_get(); + now = ktime_get(); /* * The active busyness depends on start_gt_clk and gt_stamp. @@ -1334,7 +1330,7 @@ static ktime_t busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime * start_gt_clk' calculation below for active engines.
*/ busy_v1_guc_update_engine_gt_clks(engine); - busy_v1_guc_update_pm_timestamp(guc, now); + busy_v1_guc_update_pm_timestamp(guc, &now); intel_gt_pm_put_async(gt); if (i915_reset_count(gpu_error) != reset_count) { *stats = stats_saved; @@ -1342,16 +1338,37 @@ static ktime_t busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime } } - total = intel_gt_clock_interval_to_ns(gt, stats->total_gt_clks); + total = stats->total_gt_clks; if (stats->running) { u64 clk = guc->busy.v1.gt_stamp - stats->start_gt_clk; - total += intel_gt_clock_interval_to_ns(gt, clk); + total += clk; } spin_unlock_irqrestore(&guc->busy.v1.lock, flags); - return ns_to_ktime(total); + if (now_out) + *now_out = now; + + return total; +} + +/* + * Unlike the execlist mode of submission total and active times are in terms of + * gt clocks. The *now parameter is retained to return the cpu time at which the + * busyness was sampled. + */ +static ktime_t busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now) +{ + u64 ticks = __busy_v1_guc_engine_busyness_ticks(engine, now); + u64 ns = intel_gt_clock_interval_to_ns(engine->gt, ticks); + + return ns_to_ktime(ns); +} + +static u64 busy_v1_guc_engine_busyness_ticks(struct intel_engine_cs *engine) +{ + return __busy_v1_guc_engine_busyness_ticks(engine, NULL); } static void busy_v1_guc_enable_worker(struct intel_guc *guc) @@ -1607,6 +1624,16 @@ static ktime_t busy_v2_guc_engine_busyness(struct intel_engine_cs *engine, ktime return ns_to_ktime(total); } +static u64 busy_v2_guc_engine_busyness_ticks(struct intel_engine_cs *engine) +{ + struct intel_guc *guc = &engine->gt->uc.guc; + u64 ticks_engine; + + __busy_v2_get_engine_usage_record(guc, engine, &ticks_engine, NULL, NULL); + + return ticks_engine; +} + static u64 busy_v1_intel_guc_total_active_ticks(struct intel_guc *guc) { return guc->busy.v1.gt_stamp; @@ -4480,12 +4507,20 @@ static void guc_default_vfuncs(struct intel_engine_cs *engine) engine->emit_flush = 
gen12_emit_flush_xcs; } engine->set_default_submission = guc_set_default_submission; - if (GUC_SUBMIT_VER(&engine->gt->uc.guc) < MAKE_GUC_VER(1, 3, 1)) - engine->busyness = busy_v1_guc_engine_busyness; - else - engine->busyness = busy_v2_guc_engine_busyness; + if (GUC_SUBMIT_VER(&engine->gt->uc.guc) < MAKE_GUC_VER(1, 3, 1)) { + if (GRAPHICS_VER_FULL(engine->i915) < IP_VER(12, 70)) + engine->busyness = busy_v1_guc_engine_busyness; + engine->busyness_ticks = busy_v1_guc_engine_busyness_ticks; + } else { + if (GRAPHICS_VER_FULL(engine->i915) < IP_VER(12, 70)) + engine->busyness = busy_v2_guc_engine_busyness; + engine->busyness_ticks = busy_v2_guc_engine_busyness_ticks; + } + + if (engine->busyness) + engine->flags |= I915_ENGINE_SUPPORTS_STATS; - engine->flags |= I915_ENGINE_SUPPORTS_STATS; + engine->flags |= I915_ENGINE_SUPPORTS_TICKS_STATS; engine->flags |= I915_ENGINE_HAS_PREEMPTION; engine->flags |= I915_ENGINE_HAS_TIMESLICES; diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index 4f52636eb4a80..1b859556644f6 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -26,7 +26,8 @@ #define ENGINE_SAMPLE_MASK \ (BIT(I915_SAMPLE_BUSY) | \ BIT(I915_SAMPLE_WAIT) | \ - BIT(I915_SAMPLE_SEMA)) + BIT(I915_SAMPLE_SEMA) | \ + BIT(I915_SAMPLE_BUSY_TICKS)) static cpumask_t i915_pmu_cpumask; static unsigned int i915_pmu_target_cpu = -1; @@ -161,8 +162,11 @@ static bool pmu_needs_timer(struct i915_pmu *pmu) * Also there is software busyness tracking available we do not * need the timer for I915_SAMPLE_BUSY counter. */ - if (i915->caps.scheduler & I915_SCHEDULER_CAP_ENGINE_BUSY_STATS) + if ((i915->caps.scheduler & I915_SCHEDULER_CAP_ENGINE_BUSY_STATS) || + (i915->caps.scheduler & I915_SCHEDULER_CAP_ENGINE_BUSY_TICKS_STATS)) { enable &= ~BIT(I915_SAMPLE_BUSY); + enable &= ~BIT(I915_SAMPLE_BUSY_TICKS); + } /* * If some bits remain it means we need the sampling timer running. 
@@ -362,7 +366,8 @@ static void engine_sample(struct intel_engine_cs *engine, unsigned int period_ns add_sample(&pmu->sample[I915_SAMPLE_SEMA], period_ns); /* No need to sample when busy stats are supported. */ - if (intel_engine_supports_stats(engine)) + if (intel_engine_supports_stats(engine) || + intel_engine_supports_tick_stats(engine)) return; /* @@ -519,6 +524,13 @@ engine_event_status(struct intel_engine_cs *engine, { switch (sample) { case I915_SAMPLE_BUSY: + if (!intel_engine_supports_stats(engine)) + return -ENODEV; + break; + case I915_SAMPLE_BUSY_TICKS: + if (!intel_engine_supports_tick_stats(engine)) + return -ENODEV; + break; case I915_SAMPLE_WAIT: break; case I915_SAMPLE_SEMA: @@ -651,6 +663,9 @@ static u64 __i915_pmu_event_read(struct perf_event *event) val = ktime_to_ns(intel_engine_get_busy_time(engine, &unused)); + } else if (sample == I915_SAMPLE_BUSY_TICKS && + intel_engine_supports_tick_stats(engine)) { + val = intel_engine_get_busy_ticks(engine); } else { val = engine->pmu.sample[sample].cur; } @@ -1000,6 +1015,7 @@ create_event_attributes(struct i915_pmu *pmu) __engine_event(I915_SAMPLE_BUSY, "busy"), __engine_event(I915_SAMPLE_SEMA, "sema"), __engine_event(I915_SAMPLE_WAIT, "wait"), + __engine_event(I915_SAMPLE_BUSY_TICKS, "busy-ticks"), }; unsigned int count = 0; struct perf_pmu_events_attr *pmu_attr = NULL, *pmu_iter; @@ -1103,6 +1119,9 @@ create_event_attributes(struct i915_pmu *pmu) engine->uabi_instance, engine_events[i].sample)); + if (engine_events[i].sample == I915_SAMPLE_BUSY_TICKS) + continue; + str = kasprintf(GFP_KERNEL, "%s-%s.unit", engine->name, engine_events[i].name); if (!str) diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h index 41af038c37388..72a9c71944f8d 100644 --- a/drivers/gpu/drm/i915/i915_pmu.h +++ b/drivers/gpu/drm/i915/i915_pmu.h @@ -49,7 +49,7 @@ enum { (I915_ENGINE_SAMPLE_COUNT + \ I915_PMU_MAX_GT * __I915_PMU_TRACKED_EVENT_COUNT) -#define I915_ENGINE_SAMPLE_COUNT (I915_SAMPLE_SEMA 
+ 1) +#define I915_ENGINE_SAMPLE_COUNT (I915_SAMPLE_BUSY_TICKS + 1) struct i915_pmu_sample { u64 cur; diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index e26dd27ff4a5f..8ae98c1bda0ea 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -257,7 +257,8 @@ struct i915_engine_class_instance { enum drm_i915_pmu_engine_sample { I915_SAMPLE_BUSY = 0, I915_SAMPLE_WAIT = 1, - I915_SAMPLE_SEMA = 2 + I915_SAMPLE_SEMA = 2, + I915_SAMPLE_BUSY_TICKS = 3, }; #define I915_PMU_SAMPLE_BITS (4) @@ -274,6 +275,9 @@ enum drm_i915_pmu_engine_sample { #define I915_PMU_ENGINE_BUSY(class, instance) \ __I915_PMU_ENGINE(class, instance, I915_SAMPLE_BUSY) +#define I915_PMU_ENGINE_BUSY_TICKS(class, instance) \ + __I915_PMU_ENGINE(class, instance, I915_SAMPLE_BUSY_TICKS) + #define I915_PMU_ENGINE_WAIT(class, instance) \ __I915_PMU_ENGINE(class, instance, I915_SAMPLE_WAIT) @@ -651,7 +655,14 @@ typedef struct drm_i915_irq_wait { #define I915_SCHEDULER_CAP_PRIORITY (1ul << 1) #define I915_SCHEDULER_CAP_PREEMPTION (1ul << 2) #define I915_SCHEDULER_CAP_SEMAPHORES (1ul << 3) +/* + * BUSY_STATS is deprecated on platforms with GuC based submission and will not + * be available at all on newer platforms. It has accuracy issues due to the + * conversions from tick counts to wall time. + * BUSY_TICKS_STATS should be used instead. + */ #define I915_SCHEDULER_CAP_ENGINE_BUSY_STATS (1ul << 4) +#define I915_SCHEDULER_CAP_ENGINE_BUSY_TICKS_STATS (1ul << 5) /* * Indicates the 2k user priority levels are statically mapped into 3 buckets as * follows:
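To open one of the new per-engine events with perf_event_open(), userspace packs the engine class, engine instance and sample type into perf_event_attr.config. A minimal sketch of that packing, under the assumption that the existing uapi constants not shown in the hunks above keep their current values (I915_PMU_SAMPLE_INSTANCE_BITS is 8 in today's i915_drm.h):

```c
#include <stdint.h>

/* Assumed values from include/uapi/drm/i915_drm.h (not all in this patch). */
#define I915_PMU_SAMPLE_BITS          4
#define I915_PMU_SAMPLE_INSTANCE_BITS 8
#define I915_PMU_CLASS_SHIFT \
	(I915_PMU_SAMPLE_BITS + I915_PMU_SAMPLE_INSTANCE_BITS)
#define I915_SAMPLE_BUSY_TICKS        3 /* added by this series */

/*
 * Userspace equivalent of the __I915_PMU_ENGINE() macro: pack engine
 * class, engine instance and sample type into a perf config value.
 */
static uint64_t i915_pmu_engine_config(uint64_t class_, uint64_t instance,
				       uint64_t sample)
{
	return (class_ << I915_PMU_CLASS_SHIFT) |
	       (instance << I915_PMU_SAMPLE_BITS) |
	       sample;
}
```

For example, the busy-ticks event for engine class 0, instance 0 would use config value 3, i.e. I915_PMU_ENGINE_BUSY_TICKS(0, 0); the PMU type itself is discovered from /sys/bus/event_source/devices/i915/type as usual.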