diff mbox series

[v2,9/9] drm/i915/perf: Add support for OA media units

Message ID 20230217005850.2511422-10-umesh.nerlige.ramappa@intel.com (mailing list archive)
State New, archived
Headers show
Series Add OAM support for MTL | expand

Commit Message

Umesh Nerlige Ramappa Feb. 17, 2023, 12:58 a.m. UTC
MTL introduces additional OA units dedicated to media use cases. Add
support for programming these OA units by passing the media engine class
and instance parameters.

UMD specific changes for GPUvis support:
https://patchwork.freedesktop.org/patch/522827/?series=114023
https://patchwork.freedesktop.org/patch/522822/?series=114023
https://patchwork.freedesktop.org/patch/522826/?series=114023
https://patchwork.freedesktop.org/patch/522828/?series=114023
https://patchwork.freedesktop.org/patch/522816/?series=114023
https://patchwork.freedesktop.org/patch/522825/?series=114023

Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h          |   2 +
 drivers/gpu/drm/i915/i915_pci.c          |   1 +
 drivers/gpu/drm/i915/i915_perf.c         | 247 ++++++++++++++++++++---
 drivers/gpu/drm/i915/i915_perf_oa_regs.h |  78 +++++++
 drivers/gpu/drm/i915/i915_perf_types.h   |  40 ++++
 drivers/gpu/drm/i915/intel_device_info.h |   1 +
 include/uapi/drm/i915_drm.h              |   4 +
 7 files changed, 347 insertions(+), 26 deletions(-)

Comments

Umesh Nerlige Ramappa Feb. 17, 2023, 11:37 p.m. UTC | #1
On Thu, Feb 16, 2023 at 04:58:50PM -0800, Umesh Nerlige Ramappa wrote:
>MTL introduces additional OA units dedicated to media use cases. Add
>support for programming these OA units by passing the media engine class
>and instance parameters.
>
>UMD specific changes for GPUvis support:
>https://patchwork.freedesktop.org/patch/522827/?series=114023
>https://patchwork.freedesktop.org/patch/522822/?series=114023
>https://patchwork.freedesktop.org/patch/522826/?series=114023
>https://patchwork.freedesktop.org/patch/522828/?series=114023
>https://patchwork.freedesktop.org/patch/522816/?series=114023
>https://patchwork.freedesktop.org/patch/522825/?series=114023

GPUvis PR is here - https://github.com/mikesart/gpuvis/pull/81

Regards,
Umesh

>
>Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>---
> drivers/gpu/drm/i915/i915_drv.h          |   2 +
> drivers/gpu/drm/i915/i915_pci.c          |   1 +
> drivers/gpu/drm/i915/i915_perf.c         | 247 ++++++++++++++++++++---
> drivers/gpu/drm/i915/i915_perf_oa_regs.h |  78 +++++++
> drivers/gpu/drm/i915/i915_perf_types.h   |  40 ++++
> drivers/gpu/drm/i915/intel_device_info.h |   1 +
> include/uapi/drm/i915_drm.h              |   4 +
> 7 files changed, 347 insertions(+), 26 deletions(-)
>
>diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>index 0393273faa09..f3cacbf41c86 100644
>--- a/drivers/gpu/drm/i915/i915_drv.h
>+++ b/drivers/gpu/drm/i915/i915_drv.h
>@@ -856,6 +856,8 @@ IS_SUBPLATFORM(const struct drm_i915_private *i915,
> 	(INTEL_INFO(dev_priv)->has_oa_bpc_reporting)
> #define HAS_OA_SLICE_CONTRIB_LIMITS(dev_priv) \
> 	(INTEL_INFO(dev_priv)->has_oa_slice_contrib_limits)
>+#define HAS_OAM(dev_priv) \
>+	(INTEL_INFO(dev_priv)->has_oam)
>
> /*
>  * Set this flag, when platform requires 64K GTT page sizes or larger for
>diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
>index a8d942b16223..621730b6551c 100644
>--- a/drivers/gpu/drm/i915/i915_pci.c
>+++ b/drivers/gpu/drm/i915/i915_pci.c
>@@ -1028,6 +1028,7 @@ static const struct intel_device_info adl_p_info = {
> 	.has_mslice_steering = 1, \
> 	.has_oa_bpc_reporting = 1, \
> 	.has_oa_slice_contrib_limits = 1, \
>+	.has_oam = 1, \
> 	.has_rc6 = 1, \
> 	.has_reset_engine = 1, \
> 	.has_rps = 1, \
>diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
>index f028df812067..a57690f4c531 100644
>--- a/drivers/gpu/drm/i915/i915_perf.c
>+++ b/drivers/gpu/drm/i915/i915_perf.c
>@@ -192,6 +192,7 @@
>  */
>
> #include <linux/anon_inodes.h>
>+#include <linux/nospec.h>
> #include <linux/sizes.h>
> #include <linux/uuid.h>
>
>@@ -326,6 +327,13 @@ static const struct i915_oa_format oa_formats[I915_OA_FORMAT_MAX] = {
> 	[I915_OA_FORMAT_A32u40_A4u32_B8_C8] = { 5, 256 },
> 	[I915_OAR_FORMAT_A32u40_A4u32_B8_C8]    = { 5, 256 },
> 	[I915_OA_FORMAT_A24u40_A14u32_B8_C8]    = { 5, 256 },
>+	[I915_OAM_FORMAT_MPEC8u64_B8_C8]	= { 1, 192, TYPE_OAM, HDR_64_BIT },
>+	[I915_OAM_FORMAT_MPEC8u32_B8_C8]	= { 2, 128, TYPE_OAM, HDR_64_BIT },
>+};
>+
>+/* PERF_GROUP_OAG is unused for oa_base, drop it for mtl */
>+static const u32 mtl_oa_base[] = {
>+	[PERF_GROUP_OAM_SAMEDIA_0] = 0x393000,
> };
>
> #define SAMPLE_OA_REPORT      (1<<0)
>@@ -418,11 +426,17 @@ static void free_oa_config_bo(struct i915_oa_config_bo *oa_bo)
> 	kfree(oa_bo);
> }
>
>+static inline const
>+struct i915_perf_regs *__oa_regs(struct i915_perf_stream *stream)
>+{
>+	return &stream->oa_buffer.group->regs;
>+}
>+
> static u32 gen12_oa_hw_tail_read(struct i915_perf_stream *stream)
> {
> 	struct intel_uncore *uncore = stream->uncore;
>
>-	return intel_uncore_read(uncore, GEN12_OAG_OATAILPTR) &
>+	return intel_uncore_read(uncore, __oa_regs(stream)->oa_tail_ptr) &
> 	       GEN12_OAG_OATAILPTR_MASK;
> }
>
>@@ -886,7 +900,8 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
> 		i915_reg_t oaheadptr;
>
> 		oaheadptr = GRAPHICS_VER(stream->perf->i915) == 12 ?
>-			    GEN12_OAG_OAHEADPTR : GEN8_OAHEADPTR;
>+			    __oa_regs(stream)->oa_head_ptr :
>+			    GEN8_OAHEADPTR;
>
> 		spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
>
>@@ -939,7 +954,8 @@ static int gen8_oa_read(struct i915_perf_stream *stream,
> 		return -EIO;
>
> 	oastatus_reg = GRAPHICS_VER(stream->perf->i915) == 12 ?
>-		       GEN12_OAG_OASTATUS : GEN8_OASTATUS;
>+		       __oa_regs(stream)->oa_status :
>+		       GEN8_OASTATUS;
>
> 	oastatus = intel_uncore_read(uncore, oastatus_reg);
>
>@@ -1643,16 +1659,46 @@ free_noa_wait(struct i915_perf_stream *stream)
> 	i915_vma_unpin_and_release(&stream->noa_wait, 0);
> }
>
>+/*
>+ * intel_engine_lookup_user ensures that most of engine specific checks are
>+ * taken care of, however, we can run into a case where the OA unit catering to
>+ * the engine passed by the user is disabled for some reason. In such cases,
>+ * ensure oa unit corresponding to an engine is functional. If there are no
>+ * engines in the group, the unit is disabled.
>+ */
>+static bool oa_unit_functional(const struct intel_engine_cs *engine)
>+{
>+	return engine->oa_group && engine->oa_group->num_engines;
>+}
>+
> static bool engine_supports_oa(const struct intel_engine_cs *engine)
> {
> 	enum intel_platform platform = INTEL_INFO(engine->i915)->platform;
>
> 	switch (platform) {
>+	case INTEL_METEORLAKE:
>+		return engine->class == RENDER_CLASS ||
>+		       ((engine->class == VIDEO_DECODE_CLASS ||
>+			 engine->class == VIDEO_ENHANCEMENT_CLASS) &&
>+			engine->gt->type == GT_MEDIA);
> 	default:
> 		return engine->class == RENDER_CLASS;
> 	}
> }
>
>+static bool engine_class_supports_oa_format(struct intel_engine_cs *engine, int type)
>+{
>+	switch (engine->class) {
>+	case RENDER_CLASS:
>+		return type == TYPE_OAG;
>+	case VIDEO_DECODE_CLASS:
>+	case VIDEO_ENHANCEMENT_CLASS:
>+		return type == TYPE_OAM;
>+	default:
>+		return false;
>+	}
>+}
>+
> static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
> {
> 	struct i915_perf *perf = stream->perf;
>@@ -1680,7 +1726,7 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
> 		drm_WARN_ON(&gt->i915->drm,
> 			    intel_guc_slpc_unset_gucrc_mode(&gt->uc.guc.slpc));
>
>-	intel_uncore_forcewake_put(stream->uncore, FORCEWAKE_ALL);
>+	intel_uncore_forcewake_put(stream->uncore, g->fw_domains);
> 	intel_engine_pm_put(stream->engine);
>
> 	if (stream->ctx)
>@@ -1804,8 +1850,8 @@ static void gen12_init_oa_buffer(struct i915_perf_stream *stream)
>
> 	spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
>
>-	intel_uncore_write(uncore, GEN12_OAG_OASTATUS, 0);
>-	intel_uncore_write(uncore, GEN12_OAG_OAHEADPTR,
>+	intel_uncore_write(uncore, __oa_regs(stream)->oa_status, 0);
>+	intel_uncore_write(uncore, __oa_regs(stream)->oa_head_ptr,
> 			   gtt_offset & GEN12_OAG_OAHEADPTR_MASK);
> 	stream->oa_buffer.head = gtt_offset;
>
>@@ -1817,9 +1863,9 @@ static void gen12_init_oa_buffer(struct i915_perf_stream *stream)
> 	 *  to enable proper functionality of the overflow
> 	 *  bit."
> 	 */
>-	intel_uncore_write(uncore, GEN12_OAG_OABUFFER, gtt_offset |
>+	intel_uncore_write(uncore, __oa_regs(stream)->oa_buffer, gtt_offset |
> 			   OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT);
>-	intel_uncore_write(uncore, GEN12_OAG_OATAILPTR,
>+	intel_uncore_write(uncore, __oa_regs(stream)->oa_tail_ptr,
> 			   gtt_offset & GEN12_OAG_OATAILPTR_MASK);
>
> 	/* Mark that we need updated tail pointers to read from... */
>@@ -2579,7 +2625,8 @@ gen8_modify_self(struct intel_context *ce,
> 	return err;
> }
>
>-static int gen8_configure_context(struct i915_gem_context *ctx,
>+static int gen8_configure_context(struct i915_perf_stream *stream,
>+				  struct i915_gem_context *ctx,
> 				  struct flex *flex, unsigned int count)
> {
> 	struct i915_gem_engines_iter it;
>@@ -2589,7 +2636,8 @@ static int gen8_configure_context(struct i915_gem_context *ctx,
> 	for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it) {
> 		GEM_BUG_ON(ce == ce->engine->kernel_context);
>
>-		if (!engine_supports_oa(ce->engine))
>+		if (!engine_supports_oa(ce->engine) ||
>+		    ce->engine->class != stream->engine->class)
> 			continue;
>
> 		/* Otherwise OA settings will be set upon first use */
>@@ -2720,7 +2768,7 @@ oa_configure_all_contexts(struct i915_perf_stream *stream,
>
> 		spin_unlock(&i915->gem.contexts.lock);
>
>-		err = gen8_configure_context(ctx, regs, num_regs);
>+		err = gen8_configure_context(stream, ctx, regs, num_regs);
> 		if (err) {
> 			i915_gem_context_put(ctx);
> 			return err;
>@@ -2740,7 +2788,8 @@ oa_configure_all_contexts(struct i915_perf_stream *stream,
> 	for_each_uabi_engine(engine, i915) {
> 		struct intel_context *ce = engine->kernel_context;
>
>-		if (!engine_supports_oa(ce->engine))
>+		if (!engine_supports_oa(ce->engine) ||
>+		    ce->engine->class != stream->engine->class)
> 			continue;
>
> 		regs[0].value = intel_sseu_make_rpcs(engine->gt, &ce->sseu);
>@@ -2765,6 +2814,9 @@ gen12_configure_all_contexts(struct i915_perf_stream *stream,
> 		},
> 	};
>
>+	if (stream->engine->class != RENDER_CLASS)
>+		return 0;
>+
> 	return oa_configure_all_contexts(stream,
> 					 regs, ARRAY_SIZE(regs),
> 					 active);
>@@ -2894,7 +2946,7 @@ gen12_enable_metric_set(struct i915_perf_stream *stream,
> 				   _MASKED_BIT_ENABLE(GEN12_DISABLE_DOP_GATING));
> 	}
>
>-	intel_uncore_write(uncore, GEN12_OAG_OA_DEBUG,
>+	intel_uncore_write(uncore, __oa_regs(stream)->oa_debug,
> 			   /* Disable clk ratio reports, like previous Gens. */
> 			   _MASKED_BIT_ENABLE(GEN12_OAG_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS |
> 					      GEN12_OAG_OA_DEBUG_INCLUDE_CLK_RATIO) |
>@@ -2904,7 +2956,7 @@ gen12_enable_metric_set(struct i915_perf_stream *stream,
> 			    */
> 			   oag_report_ctx_switches(stream));
>
>-	intel_uncore_write(uncore, GEN12_OAG_OAGLBCTXCTRL, periodic ?
>+	intel_uncore_write(uncore, __oa_regs(stream)->oa_ctx_ctrl, periodic ?
> 			   (GEN12_OAG_OAGLBCTXCTRL_COUNTER_RESUME |
> 			    GEN12_OAG_OAGLBCTXCTRL_TIMER_ENABLE |
> 			    (period_exponent << GEN12_OAG_OAGLBCTXCTRL_TIMER_PERIOD_SHIFT))
>@@ -3058,8 +3110,8 @@ static void gen8_oa_enable(struct i915_perf_stream *stream)
>
> static void gen12_oa_enable(struct i915_perf_stream *stream)
> {
>-	struct intel_uncore *uncore = stream->uncore;
>-	u32 report_format = stream->oa_buffer.format->format;
>+	const struct i915_perf_regs *regs;
>+	u32 val;
>
> 	/*
> 	 * If we don't want OA reports from the OA buffer, then we don't even
>@@ -3070,9 +3122,11 @@ static void gen12_oa_enable(struct i915_perf_stream *stream)
>
> 	gen12_init_oa_buffer(stream);
>
>-	intel_uncore_write(uncore, GEN12_OAG_OACONTROL,
>-			   (report_format << GEN12_OAG_OACONTROL_OA_COUNTER_FORMAT_SHIFT) |
>-			   GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE);
>+	regs = __oa_regs(stream);
>+	val = (stream->oa_buffer.format->format << regs->oa_ctrl_counter_format_shift) |
>+	      GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE;
>+
>+	intel_uncore_write(stream->uncore, regs->oa_ctrl, val);
> }
>
> /**
>@@ -3124,9 +3178,9 @@ static void gen12_oa_disable(struct i915_perf_stream *stream)
> {
> 	struct intel_uncore *uncore = stream->uncore;
>
>-	intel_uncore_write(uncore, GEN12_OAG_OACONTROL, 0);
>+	intel_uncore_write(uncore, __oa_regs(stream)->oa_ctrl, 0);
> 	if (intel_wait_for_register(uncore,
>-				    GEN12_OAG_OACONTROL,
>+				    __oa_regs(stream)->oa_ctrl,
> 				    GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE, 0,
> 				    50))
> 		drm_err(&stream->perf->i915->drm,
>@@ -3329,6 +3383,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
>
> 	stream->sample_size = sizeof(struct drm_i915_perf_record_header);
>
>+	stream->oa_buffer.group = g;
> 	stream->oa_buffer.format = &perf->oa_formats[props->oa_format];
> 	if (drm_WARN_ON(&i915->drm, stream->oa_buffer.format->size == 0))
> 		return -EINVAL;
>@@ -3379,7 +3434,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
> 	 *   references will effectively disable RC6.
> 	 */
> 	intel_engine_pm_get(stream->engine);
>-	intel_uncore_forcewake_get(stream->uncore, FORCEWAKE_ALL);
>+	intel_uncore_forcewake_get(stream->uncore, g->fw_domains);
>
> 	/*
> 	 * Wa_16011777198:dg2: GuC resets render as part of the Wa. This causes
>@@ -3440,7 +3495,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
> 		intel_guc_slpc_unset_gucrc_mode(&gt->uc.guc.slpc);
>
> err_gucrc:
>-	intel_uncore_forcewake_put(stream->uncore, FORCEWAKE_ALL);
>+	intel_uncore_forcewake_put(stream->uncore, g->fw_domains);
> 	intel_engine_pm_put(stream->engine);
>
> 	free_oa_configs(stream);
>@@ -4033,6 +4088,7 @@ static int read_properties_unlocked(struct i915_perf *perf,
> 				    struct perf_open_properties *props)
> {
> 	struct drm_i915_gem_context_param_sseu user_sseu;
>+	const struct i915_oa_format *f;
> 	u64 __user *uprop = uprops;
> 	bool config_sseu = false;
> 	u8 class, instance;
>@@ -4203,6 +4259,17 @@ static int read_properties_unlocked(struct i915_perf *perf,
> 	if (!engine_supports_oa(props->engine))
> 		return -EINVAL;
>
>+	if (!oa_unit_functional(props->engine))
>+		return -ENODEV;
>+
>+	i = array_index_nospec(props->oa_format, I915_OA_FORMAT_MAX);
>+	f = &perf->oa_formats[i];
>+	if (!engine_class_supports_oa_format(props->engine, f->type)) {
>+		DRM_DEBUG("Invalid OA format %d for class %d\n",
>+			  f->type, props->engine->class);
>+		return -EINVAL;
>+	}
>+
> 	if (config_sseu) {
> 		ret = get_sseu_config(&props->sseu, props->engine, &user_sseu);
> 		if (ret) {
>@@ -4383,6 +4450,14 @@ static const struct i915_range gen12_oa_b_counters[] = {
> 	{}
> };
>
>+static const struct i915_range mtl_oam_b_counters[] = {
>+	{ .start = 0x393000, .end = 0x39301c },	/* GEN12_OAM_STARTTRIG1[1-8] */
>+	{ .start = 0x393020, .end = 0x39303c },	/* GEN12_OAM_REPORTTRIG1[1-8] */
>+	{ .start = 0x393040, .end = 0x39307c },	/* GEN12_OAM_CEC[0-7][0-1] */
>+	{ .start = 0x393200, .end = 0x39323C },	/* MPES[0-7] */
>+	{}
>+};
>+
> static const struct i915_range xehp_oa_b_counters[] = {
> 	{ .start = 0xdc48, .end = 0xdc48 },	/* OAA_ENABLE_REG */
> 	{ .start = 0xdd00, .end = 0xdd48 },	/* OAG_LCE0_0 - OAA_LENABLE_REG */
>@@ -4429,13 +4504,16 @@ static const struct i915_range gen12_oa_mux_regs[] = {
>
> /*
>  * Ref: 14010536224:
>- * 0x20cc is repurposed on MTL, so use a separate array for MTL.
>+ * 0x20cc is repurposed on MTL, so use a separate array for MTL. Also add the
>+ * MPES/MPEC registers.
>  */
> static const struct i915_range mtl_oa_mux_regs[] = {
> 	{ .start = 0x0d00, .end = 0x0d04 },	/* RPM_CONFIG[0-1] */
> 	{ .start = 0x0d0c, .end = 0x0d2c },	/* NOA_CONFIG[0-8] */
> 	{ .start = 0x9840, .end = 0x9840 },	/* GDT_CHICKEN_BITS */
> 	{ .start = 0x9884, .end = 0x9888 },	/* NOA_WRITE */
>+	{ .start = 0x38d100, .end = 0x38d114},	/* VISACTL */
>+	{}
> };
>
> static bool gen7_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
>@@ -4473,10 +4551,26 @@ static bool gen12_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
> 	return reg_in_range_table(addr, gen12_oa_b_counters);
> }
>
>+static bool xehp_is_valid_oam_b_counter_addr(struct i915_perf *perf, u32 addr)
>+{
>+	enum intel_platform platform = INTEL_INFO(perf->i915)->platform;
>+
>+	if (!HAS_OAM(perf->i915))
>+		return false;
>+
>+	switch (platform) {
>+	case INTEL_METEORLAKE:
>+		return reg_in_range_table(addr, mtl_oam_b_counters);
>+	default:
>+		return false;
>+	}
>+}
>+
> static bool xehp_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
> {
> 	return reg_in_range_table(addr, xehp_oa_b_counters) ||
>-		reg_in_range_table(addr, gen12_oa_b_counters);
>+		reg_in_range_table(addr, gen12_oa_b_counters) ||
>+		xehp_is_valid_oam_b_counter_addr(perf, addr);
> }
>
> static bool gen12_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
>@@ -4846,11 +4940,39 @@ static u32 __num_perf_groups_per_gt(struct intel_gt *gt)
> 	enum intel_platform platform = INTEL_INFO(gt->i915)->platform;
>
> 	switch (platform) {
>+	case INTEL_METEORLAKE:
>+		return 1;
> 	default:
> 		return 1;
> 	}
> }
>
>+static u32 __oam_engine_group(struct intel_engine_cs *engine)
>+{
>+	enum intel_platform platform = INTEL_INFO(engine->i915)->platform;
>+	struct intel_gt *gt = engine->gt;
>+	u32 group = PERF_GROUP_INVALID;
>+
>+	switch (platform) {
>+	case INTEL_METEORLAKE:
>+		/*
>+		 * There's 1 SAMEDIA gt and 1 OAM per SAMEDIA gt. All media slices
>+		 * within the gt use the same OAM. All MTL SKUs list 1 SA MEDIA.
>+		 */
>+		drm_WARN_ON(&engine->i915->drm,
>+			    engine->gt->type != GT_MEDIA);
>+
>+		group = PERF_GROUP_OAM_SAMEDIA_0;
>+		break;
>+	default:
>+		break;
>+	}
>+
>+	drm_WARN_ON(&gt->i915->drm, group >= __num_perf_groups_per_gt(gt));
>+
>+	return group;
>+}
>+
> static u32 __oa_engine_group(struct intel_engine_cs *engine)
> {
> 	if (!engine_supports_oa(engine))
>@@ -4860,11 +4982,58 @@ static u32 __oa_engine_group(struct intel_engine_cs *engine)
> 	case RENDER_CLASS:
> 		return PERF_GROUP_OAG;
>
>+	case VIDEO_DECODE_CLASS:
>+	case VIDEO_ENHANCEMENT_CLASS:
>+		return __oam_engine_group(engine);
>+
> 	default:
> 		return PERF_GROUP_INVALID;
> 	}
> }
>
>+static struct i915_perf_regs __oam_regs(u32 base)
>+{
>+	return (struct i915_perf_regs) {
>+		base,
>+		GEN12_OAM_HEAD_POINTER(base),
>+		GEN12_OAM_TAIL_POINTER(base),
>+		GEN12_OAM_BUFFER(base),
>+		GEN12_OAM_CONTEXT_CONTROL(base),
>+		GEN12_OAM_CONTROL(base),
>+		GEN12_OAM_DEBUG(base),
>+		GEN12_OAM_STATUS(base),
>+		GEN12_OAM_CONTROL_COUNTER_FORMAT_SHIFT,
>+	};
>+}
>+
>+static struct i915_perf_regs __oag_regs(void)
>+{
>+	return (struct i915_perf_regs) {
>+		0,
>+		GEN12_OAG_OAHEADPTR,
>+		GEN12_OAG_OATAILPTR,
>+		GEN12_OAG_OABUFFER,
>+		GEN12_OAG_OAGLBCTXCTRL,
>+		GEN12_OAG_OACONTROL,
>+		GEN12_OAG_OA_DEBUG,
>+		GEN12_OAG_OASTATUS,
>+		GEN12_OAG_OACONTROL_OA_COUNTER_FORMAT_SHIFT,
>+	};
>+}
>+
>+static void oa_init_regs(struct intel_gt *gt, u32 id)
>+{
>+	struct i915_perf_group *group = &gt->perf.group[id];
>+	struct i915_perf_regs *regs = &group->regs;
>+
>+	if (id == PERF_GROUP_OAG && gt->type != GT_MEDIA)
>+		*regs = __oag_regs();
>+	else if (IS_METEORLAKE(gt->i915))
>+		*regs = __oam_regs(mtl_oa_base[id]);
>+	else
>+		drm_WARN(&gt->i915->drm, 1, "Unsupported platform for OA\n");
>+}
>+
> static void oa_init_groups(struct intel_gt *gt)
> {
> 	int i, num_groups = gt->perf.num_perf_groups;
>@@ -4881,6 +5050,24 @@ static void oa_init_groups(struct intel_gt *gt)
> 		g->oa_unit_id = perf->oa_unit_ids++;
>
> 		g->gt = gt;
>+		oa_init_regs(gt, i);
>+		g->fw_domains = FORCEWAKE_ALL;
>+		if (i == PERF_GROUP_OAG) {
>+			g->type = TYPE_OAG;
>+
>+			/*
>+			 * Enabling all fw domains for OAG caps the max GT
>+			 * frequency to media FF max. This could be less than
>+			 * what the user sets through the sysfs and perf
>+			 * measurements could be skewed. Since some platforms
>+			 * have separate OAM units to measure media perf, do not
>+			 * enable media fw domains for OAG.
>+			 */
>+			if (HAS_OAM(gt->i915))
>+				g->fw_domains = FORCEWAKE_GT | FORCEWAKE_RENDER;
>+		} else {
>+			g->type = TYPE_OAM;
>+		}
> 	}
> }
>
>@@ -4970,9 +5157,15 @@ static void oa_init_supported_formats(struct i915_perf *perf)
> 		break;
>
> 	case INTEL_DG2:
>+		oa_format_add(perf, I915_OAR_FORMAT_A32u40_A4u32_B8_C8);
>+		oa_format_add(perf, I915_OA_FORMAT_A24u40_A14u32_B8_C8);
>+		break;
>+
> 	case INTEL_METEORLAKE:
> 		oa_format_add(perf, I915_OAR_FORMAT_A32u40_A4u32_B8_C8);
> 		oa_format_add(perf, I915_OA_FORMAT_A24u40_A14u32_B8_C8);
>+		oa_format_add(perf, I915_OAM_FORMAT_MPEC8u64_B8_C8);
>+		oa_format_add(perf, I915_OAM_FORMAT_MPEC8u32_B8_C8);
> 		break;
>
> 	default:
>@@ -5217,8 +5410,10 @@ int i915_perf_ioctl_version(void)
> 	 *
> 	 * 6: Add DRM_I915_PERF_PROP_OA_ENGINE_CLASS and
> 	 *    DRM_I915_PERF_PROP_OA_ENGINE_INSTANCE
>+	 *
>+	 * 7: Add support for video decode and enhancement classes.
> 	 */
>-	return 6;
>+	return 7;
> }
>
> #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
>diff --git a/drivers/gpu/drm/i915/i915_perf_oa_regs.h b/drivers/gpu/drm/i915/i915_perf_oa_regs.h
>index 381d94101610..ba103875e19f 100644
>--- a/drivers/gpu/drm/i915/i915_perf_oa_regs.h
>+++ b/drivers/gpu/drm/i915/i915_perf_oa_regs.h
>@@ -138,4 +138,82 @@
> #define   GEN12_SQCNT1_PMON_ENABLE		REG_BIT(30)
> #define   GEN12_SQCNT1_OABPC			REG_BIT(29)
>
>+/* Gen12 OAM unit */
>+#define GEN12_OAM_HEAD_POINTER_OFFSET   (0x1a0)
>+#define  GEN12_OAM_HEAD_POINTER_MASK    0xffffffc0
>+
>+#define GEN12_OAM_TAIL_POINTER_OFFSET   (0x1a4)
>+#define  GEN12_OAM_TAIL_POINTER_MASK    0xffffffc0
>+
>+#define GEN12_OAM_BUFFER_OFFSET         (0x1a8)
>+#define  GEN12_OAM_BUFFER_SIZE_MASK     (0x7)
>+#define  GEN12_OAM_BUFFER_SIZE_SHIFT    (3)
>+#define  GEN12_OAM_BUFFER_MEMORY_SELECT REG_BIT(0) /* 0: PPGTT, 1: GGTT */
>+
>+#define GEN12_OAM_CONTEXT_CONTROL_OFFSET              (0x1bc)
>+#define  GEN12_OAM_CONTEXT_CONTROL_TIMER_PERIOD_SHIFT 2
>+#define  GEN12_OAM_CONTEXT_CONTROL_TIMER_ENABLE       REG_BIT(1)
>+#define  GEN12_OAM_CONTEXT_CONTROL_COUNTER_RESUME     REG_BIT(0)
>+
>+#define GEN12_OAM_CONTROL_OFFSET                (0x194)
>+#define  GEN12_OAM_CONTROL_COUNTER_FORMAT_SHIFT 1
>+#define  GEN12_OAM_CONTROL_COUNTER_ENABLE       REG_BIT(0)
>+
>+#define GEN12_OAM_DEBUG_OFFSET                      (0x198)
>+#define  GEN12_OAM_DEBUG_BUFFER_SIZE_SELECT         REG_BIT(12)
>+#define  GEN12_OAM_DEBUG_INCLUDE_CLK_RATIO          REG_BIT(6)
>+#define  GEN12_OAM_DEBUG_DISABLE_CLK_RATIO_REPORTS  REG_BIT(5)
>+#define  GEN12_OAM_DEBUG_DISABLE_GO_1_0_REPORTS     REG_BIT(2)
>+#define  GEN12_OAM_DEBUG_DISABLE_CTX_SWITCH_REPORTS REG_BIT(1)
>+
>+#define GEN12_OAM_STATUS_OFFSET            (0x19c)
>+#define  GEN12_OAM_STATUS_COUNTER_OVERFLOW REG_BIT(2)
>+#define  GEN12_OAM_STATUS_BUFFER_OVERFLOW  REG_BIT(1)
>+#define  GEN12_OAM_STATUS_REPORT_LOST      REG_BIT(0)
>+
>+#define GEN12_OAM_MMIO_TRG_OFFSET	(0x1d0)
>+
>+#define GEN12_OAM_MMIO_TRG(base) \
>+	_MMIO((base) + GEN12_OAM_MMIO_TRG_OFFSET)
>+
>+#define GEN12_OAM_HEAD_POINTER(base) \
>+	_MMIO((base) + GEN12_OAM_HEAD_POINTER_OFFSET)
>+#define GEN12_OAM_TAIL_POINTER(base) \
>+	_MMIO((base) + GEN12_OAM_TAIL_POINTER_OFFSET)
>+#define GEN12_OAM_BUFFER(base) \
>+	_MMIO((base) + GEN12_OAM_BUFFER_OFFSET)
>+#define GEN12_OAM_CONTEXT_CONTROL(base) \
>+	_MMIO((base) + GEN12_OAM_CONTEXT_CONTROL_OFFSET)
>+#define GEN12_OAM_CONTROL(base) \
>+	_MMIO((base) + GEN12_OAM_CONTROL_OFFSET)
>+#define GEN12_OAM_DEBUG(base) \
>+	_MMIO((base) + GEN12_OAM_DEBUG_OFFSET)
>+#define GEN12_OAM_STATUS(base) \
>+	_MMIO((base) + GEN12_OAM_STATUS_OFFSET)
>+
>+#define GEN12_OAM_CEC0_0_OFFSET		(0x40)
>+#define GEN12_OAM_CEC7_1_OFFSET		(0x7c)
>+#define GEN12_OAM_CEC0_0(base) \
>+	_MMIO((base) + GEN12_OAM_CEC0_0_OFFSET)
>+#define GEN12_OAM_CEC7_1(base) \
>+	_MMIO((base) + GEN12_OAM_CEC7_1_OFFSET)
>+
>+#define GEN12_OAM_STARTTRIG1_OFFSET	(0x00)
>+#define GEN12_OAM_STARTTRIG8_OFFSET	(0x1c)
>+#define GEN12_OAM_STARTTRIG1(base) \
>+	_MMIO((base) + GEN12_OAM_STARTTRIG1_OFFSET)
>+#define GEN12_OAM_STARTTRIG8(base) \
>+	_MMIO((base) + GEN12_OAM_STARTTRIG8_OFFSET)
>+
>+#define GEN12_OAM_REPORTTRIG1_OFFSET	(0x20)
>+#define GEN12_OAM_REPORTTRIG8_OFFSET	(0x3c)
>+#define GEN12_OAM_REPORTTRIG1(base) \
>+	_MMIO((base) + GEN12_OAM_REPORTTRIG1_OFFSET)
>+#define GEN12_OAM_REPORTTRIG8(base) \
>+	_MMIO((base) + GEN12_OAM_REPORTTRIG8_OFFSET)
>+
>+#define GEN12_OAM_PERF_COUNTER_B0_OFFSET	(0x84)
>+#define GEN12_OAM_PERF_COUNTER_B(base, idx) \
>+	_MMIO((base) + GEN12_OAM_PERF_COUNTER_B0_OFFSET + 4 * (idx))
>+
> #endif /* __INTEL_PERF_OA_REGS__ */
>diff --git a/drivers/gpu/drm/i915/i915_perf_types.h b/drivers/gpu/drm/i915/i915_perf_types.h
>index 8ccb0b89d019..5b2c3bab60f8 100644
>--- a/drivers/gpu/drm/i915/i915_perf_types.h
>+++ b/drivers/gpu/drm/i915/i915_perf_types.h
>@@ -20,6 +20,7 @@
> #include "gt/intel_engine_types.h"
> #include "gt/intel_sseu.h"
> #include "i915_reg_defs.h"
>+#include "intel_uncore.h"
> #include "intel_wakeref.h"
>
> struct drm_i915_private;
>@@ -33,6 +34,7 @@ struct intel_engine_cs;
>
> enum {
> 	PERF_GROUP_OAG = 0,
>+	PERF_GROUP_OAM_SAMEDIA_0 = 0,
>
> 	PERF_GROUP_MAX,
> 	PERF_GROUP_INVALID = U32_MAX,
>@@ -43,9 +45,27 @@ enum report_header {
> 	HDR_64_BIT,
> };
>
>+struct i915_perf_regs {
>+	u32 base;
>+	i915_reg_t oa_head_ptr;
>+	i915_reg_t oa_tail_ptr;
>+	i915_reg_t oa_buffer;
>+	i915_reg_t oa_ctx_ctrl;
>+	i915_reg_t oa_ctrl;
>+	i915_reg_t oa_debug;
>+	i915_reg_t oa_status;
>+	u32 oa_ctrl_counter_format_shift;
>+};
>+
>+enum {
>+	TYPE_OAG,
>+	TYPE_OAM,
>+};
>+
> struct i915_oa_format {
> 	u32 format;
> 	int size;
>+	int type;
> 	enum report_header header;
> };
>
>@@ -317,6 +337,11 @@ struct i915_perf_stream {
> 		 * @tail: The last verified tail that can be read by userspace.
> 		 */
> 		u32 tail;
>+
>+		/**
>+		 * @group: The group object for this OA buffer.
>+		 */
>+		struct i915_perf_group *group;
> 	} oa_buffer;
>
> 	/**
>@@ -431,6 +456,21 @@ struct i915_perf_group {
> 	 * @engine_mask: A mask of engines using a single OA buffer.
> 	 */
> 	intel_engine_mask_t engine_mask;
>+
>+	/*
>+	 * @regs: OA buffer register group for programming the OA unit.
>+	 */
>+	struct i915_perf_regs regs;
>+
>+	/*
>+	 * @type: Type of OA buffer, OAM, OAG etc.
>+	 */
>+	int type;
>+
>+	/*
>+	 * @fw_domains: forcewake domains required for this group.
>+	 */
>+	enum forcewake_domains fw_domains;
> };
>
> struct i915_perf_gt {
>diff --git a/drivers/gpu/drm/i915/intel_device_info.h b/drivers/gpu/drm/i915/intel_device_info.h
>index 80bda653d61b..45e218327f44 100644
>--- a/drivers/gpu/drm/i915/intel_device_info.h
>+++ b/drivers/gpu/drm/i915/intel_device_info.h
>@@ -166,6 +166,7 @@ enum intel_ppgtt_type {
> 	func(has_mslice_steering); \
> 	func(has_oa_bpc_reporting); \
> 	func(has_oa_slice_contrib_limits); \
>+	func(has_oam); \
> 	func(has_one_eu_per_fuse_bit); \
> 	func(has_pxp); \
> 	func(has_rc6); \
>diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
>index b6922b52d85c..70bfa6530dbc 100644
>--- a/include/uapi/drm/i915_drm.h
>+++ b/include/uapi/drm/i915_drm.h
>@@ -2676,6 +2676,10 @@ enum drm_i915_oa_format {
> 	I915_OAR_FORMAT_A32u40_A4u32_B8_C8,
> 	I915_OA_FORMAT_A24u40_A14u32_B8_C8,
>
>+	/* MTL OAM */
>+	I915_OAM_FORMAT_MPEC8u64_B8_C8,
>+	I915_OAM_FORMAT_MPEC8u32_B8_C8,
>+
> 	I915_OA_FORMAT_MAX	    /* non-ABI */
> };
>
>-- 
>2.36.1
>
Ashutosh Dixit Feb. 23, 2023, 8:05 p.m. UTC | #2
On Thu, 16 Feb 2023 16:58:50 -0800, Umesh Nerlige Ramappa wrote:
>

Hi Umesh,

> MTL introduces additional OA units dedicated to media use cases. Add
> support for programming these OA units by passing the media engine class
> and instance parameters.
>
> UMD specific changes for GPUvis support:
> https://patchwork.freedesktop.org/patch/522827/?series=114023
> https://patchwork.freedesktop.org/patch/522822/?series=114023
> https://patchwork.freedesktop.org/patch/522826/?series=114023
> https://patchwork.freedesktop.org/patch/522828/?series=114023
> https://patchwork.freedesktop.org/patch/522816/?series=114023
> https://patchwork.freedesktop.org/patch/522825/?series=114023

General comment about the patch in case I miss something out, as I've
mentioned previously in general let's try to replace INTEL_METEORLAKE and
IS_METEORLAKE checks in the patch with:

	if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 70))

So that we don't have to enumerate each platform individually later.

> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h          |   2 +
>  drivers/gpu/drm/i915/i915_pci.c          |   1 +
>  drivers/gpu/drm/i915/i915_perf.c         | 247 ++++++++++++++++++++---
>  drivers/gpu/drm/i915/i915_perf_oa_regs.h |  78 +++++++
>  drivers/gpu/drm/i915/i915_perf_types.h   |  40 ++++
>  drivers/gpu/drm/i915/intel_device_info.h |   1 +
>  include/uapi/drm/i915_drm.h              |   4 +
>  7 files changed, 347 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 0393273faa09..f3cacbf41c86 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -856,6 +856,8 @@ IS_SUBPLATFORM(const struct drm_i915_private *i915,
>	(INTEL_INFO(dev_priv)->has_oa_bpc_reporting)
>  #define HAS_OA_SLICE_CONTRIB_LIMITS(dev_priv) \
>	(INTEL_INFO(dev_priv)->has_oa_slice_contrib_limits)
> +#define HAS_OAM(dev_priv) \
> +	(INTEL_INFO(dev_priv)->has_oam)
>
>  /*
>   * Set this flag, when platform requires 64K GTT page sizes or larger for
> diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
> index a8d942b16223..621730b6551c 100644
> --- a/drivers/gpu/drm/i915/i915_pci.c
> +++ b/drivers/gpu/drm/i915/i915_pci.c
> @@ -1028,6 +1028,7 @@ static const struct intel_device_info adl_p_info = {
>	.has_mslice_steering = 1, \
>	.has_oa_bpc_reporting = 1, \
>	.has_oa_slice_contrib_limits = 1, \
> +	.has_oam = 1, \
>	.has_rc6 = 1, \
>	.has_reset_engine = 1, \
>	.has_rps = 1, \
> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> index f028df812067..a57690f4c531 100644
> --- a/drivers/gpu/drm/i915/i915_perf.c
> +++ b/drivers/gpu/drm/i915/i915_perf.c
> @@ -192,6 +192,7 @@
>   */
>
>  #include <linux/anon_inodes.h>
> +#include <linux/nospec.h>
>  #include <linux/sizes.h>
>  #include <linux/uuid.h>
>
> @@ -326,6 +327,13 @@ static const struct i915_oa_format oa_formats[I915_OA_FORMAT_MAX] = {
>	[I915_OA_FORMAT_A32u40_A4u32_B8_C8] = { 5, 256 },
>	[I915_OAR_FORMAT_A32u40_A4u32_B8_C8]    = { 5, 256 },
>	[I915_OA_FORMAT_A24u40_A14u32_B8_C8]    = { 5, 256 },
> +	[I915_OAM_FORMAT_MPEC8u64_B8_C8]	= { 1, 192, TYPE_OAM, HDR_64_BIT },
> +	[I915_OAM_FORMAT_MPEC8u32_B8_C8]	= { 2, 128, TYPE_OAM, HDR_64_BIT },
> +};
> +
> +/* PERF_GROUP_OAG is unused for oa_base, drop it for mtl */

What does this comment mean?

> +static const u32 mtl_oa_base[] = {
> +	[PERF_GROUP_OAM_SAMEDIA_0] = 0x393000,
>  };
>
>  #define SAMPLE_OA_REPORT      (1<<0)
> @@ -418,11 +426,17 @@ static void free_oa_config_bo(struct i915_oa_config_bo *oa_bo)
>	kfree(oa_bo);
>  }
>
> +static inline const
> +struct i915_perf_regs *__oa_regs(struct i915_perf_stream *stream)
> +{
> +	return &stream->oa_buffer.group->regs;

Should just use stream->engine->oa_group->regs, see near the bottom.

> +}
> +
>  static u32 gen12_oa_hw_tail_read(struct i915_perf_stream *stream)
>  {
>	struct intel_uncore *uncore = stream->uncore;
>
> -	return intel_uncore_read(uncore, GEN12_OAG_OATAILPTR) &
> +	return intel_uncore_read(uncore, __oa_regs(stream)->oa_tail_ptr) &
>	       GEN12_OAG_OATAILPTR_MASK;
>  }
>
> @@ -886,7 +900,8 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
>		i915_reg_t oaheadptr;
>
>		oaheadptr = GRAPHICS_VER(stream->perf->i915) == 12 ?

>= 12 ?

> -			    GEN12_OAG_OAHEADPTR : GEN8_OAHEADPTR;
> +			    __oa_regs(stream)->oa_head_ptr :
> +			    GEN8_OAHEADPTR;
>
>		spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
>
> @@ -939,7 +954,8 @@ static int gen8_oa_read(struct i915_perf_stream *stream,
>		return -EIO;
>
>	oastatus_reg = GRAPHICS_VER(stream->perf->i915) == 12 ?

>= 12 ?

> -		       GEN12_OAG_OASTATUS : GEN8_OASTATUS;
> +		       __oa_regs(stream)->oa_status :
> +		       GEN8_OASTATUS;
>
>	oastatus = intel_uncore_read(uncore, oastatus_reg);
>
> @@ -1643,16 +1659,46 @@ free_noa_wait(struct i915_perf_stream *stream)
>	i915_vma_unpin_and_release(&stream->noa_wait, 0);
>  }
>
> +/*
> + * intel_engine_lookup_user ensures that most of engine specific checks are
> + * taken care of, however, we can run into a case where the OA unit catering to
> + * the engine passed by the user is disabled for some reason. In such cases,
> + * ensure oa unit corresponding to an engine is functional. If there are no
> + * engines in the group, the unit is disabled.
> + */
> +static bool oa_unit_functional(const struct intel_engine_cs *engine)
> +{
> +	return engine->oa_group && engine->oa_group->num_engines;
> +}
> +
>  static bool engine_supports_oa(const struct intel_engine_cs *engine)
>  {
>	enum intel_platform platform = INTEL_INFO(engine->i915)->platform;
>
>	switch (platform) {
> +	case INTEL_METEORLAKE:
> +		return engine->class == RENDER_CLASS ||
> +		       ((engine->class == VIDEO_DECODE_CLASS ||
> +			 engine->class == VIDEO_ENHANCEMENT_CLASS) &&
> +			engine->gt->type == GT_MEDIA);
>	default:
>		return engine->class == RENDER_CLASS;
>	}

As mentioned in a previous patch, this could just be:

	return engine->oa_group;

Because all these checks have already been done when the perf groups were
initialized so let's use that, as is done for oa_unit_functional.

Though, caution, to return engine->oa_group we'd have to remove
engine_supports_oa from __oa_engine_group, since engine->oa_group is not
yet assigned there. But I think the engine_supports_oa check in
__oa_engine_group is a duplication and should be removed.

>  }
>
> +static bool engine_class_supports_oa_format(struct intel_engine_cs *engine, int type)
> +{
> +	switch (engine->class) {
> +	case RENDER_CLASS:
> +		return type == TYPE_OAG;
> +	case VIDEO_DECODE_CLASS:
> +	case VIDEO_ENHANCEMENT_CLASS:
> +		return type == TYPE_OAM;
> +	default:
> +		return false;
> +	}
> +}
> +

Again, how about:

	return engine->oa_group && engine->oa_group->type == type;

Otherwise as mentioned below oa_group->type is unused and also incorrectly
assigned at present. The format type and group types are the same
(TYPE_OAG/TYPE_OAM). Can name the function engine_supports_oa_format.

>  static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
>  {
>	struct i915_perf *perf = stream->perf;
> @@ -1680,7 +1726,7 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
>		drm_WARN_ON(&gt->i915->drm,
>			    intel_guc_slpc_unset_gucrc_mode(&gt->uc.guc.slpc));
>
> -	intel_uncore_forcewake_put(stream->uncore, FORCEWAKE_ALL);
> +	intel_uncore_forcewake_put(stream->uncore, g->fw_domains);
>	intel_engine_pm_put(stream->engine);
>
>	if (stream->ctx)
> @@ -1804,8 +1850,8 @@ static void gen12_init_oa_buffer(struct i915_perf_stream *stream)
>
>	spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
>
> -	intel_uncore_write(uncore, GEN12_OAG_OASTATUS, 0);
> -	intel_uncore_write(uncore, GEN12_OAG_OAHEADPTR,
> +	intel_uncore_write(uncore, __oa_regs(stream)->oa_status, 0);
> +	intel_uncore_write(uncore, __oa_regs(stream)->oa_head_ptr,
>			   gtt_offset & GEN12_OAG_OAHEADPTR_MASK);
>	stream->oa_buffer.head = gtt_offset;
>
> @@ -1817,9 +1863,9 @@ static void gen12_init_oa_buffer(struct i915_perf_stream *stream)
>	 *  to enable proper functionality of the overflow
>	 *  bit."
>	 */
> -	intel_uncore_write(uncore, GEN12_OAG_OABUFFER, gtt_offset |
> +	intel_uncore_write(uncore, __oa_regs(stream)->oa_buffer, gtt_offset |
>			   OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT);
> -	intel_uncore_write(uncore, GEN12_OAG_OATAILPTR,
> +	intel_uncore_write(uncore, __oa_regs(stream)->oa_tail_ptr,
>			   gtt_offset & GEN12_OAG_OATAILPTR_MASK);
>
>	/* Mark that we need updated tail pointers to read from... */
> @@ -2579,7 +2625,8 @@ gen8_modify_self(struct intel_context *ce,
>	return err;
>  }
>
> -static int gen8_configure_context(struct i915_gem_context *ctx,
> +static int gen8_configure_context(struct i915_perf_stream *stream,
> +				  struct i915_gem_context *ctx,
>				  struct flex *flex, unsigned int count)
>  {
>	struct i915_gem_engines_iter it;
> @@ -2589,7 +2636,8 @@ static int gen8_configure_context(struct i915_gem_context *ctx,
>	for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it) {
>		GEM_BUG_ON(ce == ce->engine->kernel_context);
>
> -		if (!engine_supports_oa(ce->engine))
> +		if (!engine_supports_oa(ce->engine) ||
> +		    ce->engine->class != stream->engine->class)
>			continue;
>
>		/* Otherwise OA settings will be set upon first use */
> @@ -2720,7 +2768,7 @@ oa_configure_all_contexts(struct i915_perf_stream *stream,
>
>		spin_unlock(&i915->gem.contexts.lock);
>
> -		err = gen8_configure_context(ctx, regs, num_regs);
> +		err = gen8_configure_context(stream, ctx, regs, num_regs);
>		if (err) {
>			i915_gem_context_put(ctx);
>			return err;
> @@ -2740,7 +2788,8 @@ oa_configure_all_contexts(struct i915_perf_stream *stream,
>	for_each_uabi_engine(engine, i915) {
>		struct intel_context *ce = engine->kernel_context;
>
> -		if (!engine_supports_oa(ce->engine))
> +		if (!engine_supports_oa(ce->engine) ||
> +		    ce->engine->class != stream->engine->class)
>			continue;
>
>		regs[0].value = intel_sseu_make_rpcs(engine->gt, &ce->sseu);
> @@ -2765,6 +2814,9 @@ gen12_configure_all_contexts(struct i915_perf_stream *stream,
>		},
>	};
>
> +	if (stream->engine->class != RENDER_CLASS)
> +		return 0;

OK, this is for render, nothing equivalent needed for media?

> +
>	return oa_configure_all_contexts(stream,
>					 regs, ARRAY_SIZE(regs),
>					 active);
> @@ -2894,7 +2946,7 @@ gen12_enable_metric_set(struct i915_perf_stream *stream,
>				   _MASKED_BIT_ENABLE(GEN12_DISABLE_DOP_GATING));
>	}
>
> -	intel_uncore_write(uncore, GEN12_OAG_OA_DEBUG,
> +	intel_uncore_write(uncore, __oa_regs(stream)->oa_debug,
>			   /* Disable clk ratio reports, like previous Gens. */
>			   _MASKED_BIT_ENABLE(GEN12_OAG_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS |
>					      GEN12_OAG_OA_DEBUG_INCLUDE_CLK_RATIO) |
> @@ -2904,7 +2956,7 @@ gen12_enable_metric_set(struct i915_perf_stream *stream,
>			    */
>			   oag_report_ctx_switches(stream));
>
> -	intel_uncore_write(uncore, GEN12_OAG_OAGLBCTXCTRL, periodic ?
> +	intel_uncore_write(uncore, __oa_regs(stream)->oa_ctx_ctrl, periodic ?
>			   (GEN12_OAG_OAGLBCTXCTRL_COUNTER_RESUME |
>			    GEN12_OAG_OAGLBCTXCTRL_TIMER_ENABLE |
>			    (period_exponent << GEN12_OAG_OAGLBCTXCTRL_TIMER_PERIOD_SHIFT))
> @@ -3058,8 +3110,8 @@ static void gen8_oa_enable(struct i915_perf_stream *stream)
>
>  static void gen12_oa_enable(struct i915_perf_stream *stream)
>  {
> -	struct intel_uncore *uncore = stream->uncore;
> -	u32 report_format = stream->oa_buffer.format->format;
> +	const struct i915_perf_regs *regs;
> +	u32 val;
>
>	/*
>	 * If we don't want OA reports from the OA buffer, then we don't even
> @@ -3070,9 +3122,11 @@ static void gen12_oa_enable(struct i915_perf_stream *stream)
>
>	gen12_init_oa_buffer(stream);
>
> -	intel_uncore_write(uncore, GEN12_OAG_OACONTROL,
> -			   (report_format << GEN12_OAG_OACONTROL_OA_COUNTER_FORMAT_SHIFT) |
> -			   GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE);
> +	regs = __oa_regs(stream);
> +	val = (stream->oa_buffer.format->format << regs->oa_ctrl_counter_format_shift) |
> +	      GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE;
> +
> +	intel_uncore_write(stream->uncore, regs->oa_ctrl, val);
>  }
>
>  /**
> @@ -3124,9 +3178,9 @@ static void gen12_oa_disable(struct i915_perf_stream *stream)
>  {
>	struct intel_uncore *uncore = stream->uncore;
>
> -	intel_uncore_write(uncore, GEN12_OAG_OACONTROL, 0);
> +	intel_uncore_write(uncore, __oa_regs(stream)->oa_ctrl, 0);
>	if (intel_wait_for_register(uncore,
> -				    GEN12_OAG_OACONTROL,
> +				    __oa_regs(stream)->oa_ctrl,
>				    GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE, 0,
>				    50))
>		drm_err(&stream->perf->i915->drm,
> @@ -3329,6 +3383,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
>
>	stream->sample_size = sizeof(struct drm_i915_perf_record_header);
>
> +	stream->oa_buffer.group = g;

Should just use stream->engine->oa_group, see near the bottom.

>	stream->oa_buffer.format = &perf->oa_formats[props->oa_format];
>	if (drm_WARN_ON(&i915->drm, stream->oa_buffer.format->size == 0))
>		return -EINVAL;
> @@ -3379,7 +3434,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
>	 *   references will effectively disable RC6.
>	 */
>	intel_engine_pm_get(stream->engine);
> -	intel_uncore_forcewake_get(stream->uncore, FORCEWAKE_ALL);
> +	intel_uncore_forcewake_get(stream->uncore, g->fw_domains);
>
>	/*
>	 * Wa_16011777198:dg2: GuC resets render as part of the Wa. This causes
> @@ -3440,7 +3495,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
>		intel_guc_slpc_unset_gucrc_mode(&gt->uc.guc.slpc);
>
>  err_gucrc:
> -	intel_uncore_forcewake_put(stream->uncore, FORCEWAKE_ALL);
> +	intel_uncore_forcewake_put(stream->uncore, g->fw_domains);
>	intel_engine_pm_put(stream->engine);
>
>	free_oa_configs(stream);
> @@ -4033,6 +4088,7 @@ static int read_properties_unlocked(struct i915_perf *perf,
>				    struct perf_open_properties *props)
>  {
>	struct drm_i915_gem_context_param_sseu user_sseu;
> +	const struct i915_oa_format *f;
>	u64 __user *uprop = uprops;
>	bool config_sseu = false;
>	u8 class, instance;
> @@ -4203,6 +4259,17 @@ static int read_properties_unlocked(struct i915_perf *perf,
>	if (!engine_supports_oa(props->engine))
>		return -EINVAL;
>
> +	if (!oa_unit_functional(props->engine))
> +		return -ENODEV;
> +
> +	i = array_index_nospec(props->oa_format, I915_OA_FORMAT_MAX);

Why do we need this (something to do with speculation)? Can just do
'&perf->oa_formats[props->oa_format]' below? The format passed in has
already been checked in the switch statement above.

> +	f = &perf->oa_formats[i];
> +	if (!engine_class_supports_oa_format(props->engine, f->type)) {
> +		DRM_DEBUG("Invalid OA format %d for class %d\n",
> +			  f->type, props->engine->class);
> +		return -EINVAL;
> +	}
> +
>	if (config_sseu) {
>		ret = get_sseu_config(&props->sseu, props->engine, &user_sseu);
>		if (ret) {
> @@ -4383,6 +4450,14 @@ static const struct i915_range gen12_oa_b_counters[] = {
>	{}
>  };
>
> +static const struct i915_range mtl_oam_b_counters[] = {
> +	{ .start = 0x393000, .end = 0x39301c },	/* GEN12_OAM_STARTTRIG1[1-8] */
> +	{ .start = 0x393020, .end = 0x39303c },	/* GEN12_OAM_REPORTTRIG1[1-8] */
> +	{ .start = 0x393040, .end = 0x39307c },	/* GEN12_OAM_CEC[0-7][0-1] */
> +	{ .start = 0x393200, .end = 0x39323C },	/* MPES[0-7] */
> +	{}
> +};
> +
>  static const struct i915_range xehp_oa_b_counters[] = {
>	{ .start = 0xdc48, .end = 0xdc48 },	/* OAA_ENABLE_REG */
>	{ .start = 0xdd00, .end = 0xdd48 },	/* OAG_LCE0_0 - OAA_LENABLE_REG */
> @@ -4429,13 +4504,16 @@ static const struct i915_range gen12_oa_mux_regs[] = {
>
>  /*
>   * Ref: 14010536224:
> - * 0x20cc is repurposed on MTL, so use a separate array for MTL.
> + * 0x20cc is repurposed on MTL, so use a separate array for MTL. Also add the
> + * MPES/MPEC registers.

MPES/MPEC registers are added above now, not here so maybe get rid of the
comment change above?

>   */
>  static const struct i915_range mtl_oa_mux_regs[] = {
>	{ .start = 0x0d00, .end = 0x0d04 },	/* RPM_CONFIG[0-1] */
>	{ .start = 0x0d0c, .end = 0x0d2c },	/* NOA_CONFIG[0-8] */
>	{ .start = 0x9840, .end = 0x9840 },	/* GDT_CHICKEN_BITS */
>	{ .start = 0x9884, .end = 0x9888 },	/* NOA_WRITE */
> +	{ .start = 0x38d100, .end = 0x38d114},	/* VISACTL */
> +	{}
>  };
>
>  static bool gen7_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
> @@ -4473,10 +4551,26 @@ static bool gen12_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
>	return reg_in_range_table(addr, gen12_oa_b_counters);
>  }
>
> +static bool xehp_is_valid_oam_b_counter_addr(struct i915_perf *perf, u32 addr)
> +{
> +	enum intel_platform platform = INTEL_INFO(perf->i915)->platform;
> +
> +	if (!HAS_OAM(perf->i915))
> +		return false;
> +
> +	switch (platform) {
> +	case INTEL_METEORLAKE:
> +		return reg_in_range_table(addr, mtl_oam_b_counters);
> +	default:
> +		return false;
> +	}

Replace with 'if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 70))', registers
are identical in later platforms too.

Should the function prefix be xehp or mtl? Don't see xehp in bspec,
probably xehp is discontinued.

> +}
> +
>  static bool xehp_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
>  {
>	return reg_in_range_table(addr, xehp_oa_b_counters) ||
> -		reg_in_range_table(addr, gen12_oa_b_counters);
> +		reg_in_range_table(addr, gen12_oa_b_counters) ||
> +		xehp_is_valid_oam_b_counter_addr(perf, addr);
>  }
>
>  static bool gen12_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
> @@ -4846,11 +4940,39 @@ static u32 __num_perf_groups_per_gt(struct intel_gt *gt)
>	enum intel_platform platform = INTEL_INFO(gt->i915)->platform;
>
>	switch (platform) {
> +	case INTEL_METEORLAKE:
> +		return 1;

I don't think we need this, as proposed previously maybe the function
should just unconditionally return 1.

>	default:
>		return 1;
>	}
>  }
>
> +static u32 __oam_engine_group(struct intel_engine_cs *engine)
> +{
> +	enum intel_platform platform = INTEL_INFO(engine->i915)->platform;
> +	struct intel_gt *gt = engine->gt;
> +	u32 group = PERF_GROUP_INVALID;
> +
> +	switch (platform) {
> +	case INTEL_METEORLAKE:

Replace here with 'if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 70))'.

> +		/*
> +		 * There's 1 SAMEDIA gt and 1 OAM per SAMEDIA gt. All media slices
> +		 * within the gt use the same OAM. All MTL SKUs list 1 SA MEDIA.
> +		 */
> +		drm_WARN_ON(&engine->i915->drm,
> +			    engine->gt->type != GT_MEDIA);
> +
> +		group = PERF_GROUP_OAM_SAMEDIA_0;
> +		break;
> +	default:
> +		break;
> +	}
> +
> +	drm_WARN_ON(&gt->i915->drm, group >= __num_perf_groups_per_gt(gt));
> +
> +	return group;
> +}
> +
>  static u32 __oa_engine_group(struct intel_engine_cs *engine)
>  {
>	if (!engine_supports_oa(engine))

As mentioned above for engine_supports_oa, this looks like a duplication of
the checks below and should probably be removed.

> @@ -4860,11 +4982,58 @@ static u32 __oa_engine_group(struct intel_engine_cs *engine)
>	case RENDER_CLASS:
>		return PERF_GROUP_OAG;
>
> +	case VIDEO_DECODE_CLASS:
> +	case VIDEO_ENHANCEMENT_CLASS:
> +		return __oam_engine_group(engine);
> +
>	default:
>		return PERF_GROUP_INVALID;
>	}
>  }
>
> +static struct i915_perf_regs __oam_regs(u32 base)
> +{
> +	return (struct i915_perf_regs) {
> +		base,
> +		GEN12_OAM_HEAD_POINTER(base),
> +		GEN12_OAM_TAIL_POINTER(base),
> +		GEN12_OAM_BUFFER(base),
> +		GEN12_OAM_CONTEXT_CONTROL(base),
> +		GEN12_OAM_CONTROL(base),
> +		GEN12_OAM_DEBUG(base),
> +		GEN12_OAM_STATUS(base),
> +		GEN12_OAM_CONTROL_COUNTER_FORMAT_SHIFT,
> +	};
> +}
> +
> +static struct i915_perf_regs __oag_regs(void)
> +{
> +	return (struct i915_perf_regs) {
> +		0,
> +		GEN12_OAG_OAHEADPTR,
> +		GEN12_OAG_OATAILPTR,
> +		GEN12_OAG_OABUFFER,
> +		GEN12_OAG_OAGLBCTXCTRL,
> +		GEN12_OAG_OACONTROL,
> +		GEN12_OAG_OA_DEBUG,
> +		GEN12_OAG_OASTATUS,
> +		GEN12_OAG_OACONTROL_OA_COUNTER_FORMAT_SHIFT,
> +	};
> +}
> +
> +static void oa_init_regs(struct intel_gt *gt, u32 id)
> +{
> +	struct i915_perf_group *group = &gt->perf.group[id];
> +	struct i915_perf_regs *regs = &group->regs;
> +
> +	if (id == PERF_GROUP_OAG && gt->type != GT_MEDIA)
> +		*regs = __oag_regs();
> +	else if (IS_METEORLAKE(gt->i915))

Replace with 'if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 70))', OAM
registers are identical in later platforms too. Maybe get rid of drm_WARN
below?

> +		*regs = __oam_regs(mtl_oa_base[id]);
> +	else
> +		drm_WARN(&gt->i915->drm, 1, "Unsupported platform for OA\n");
> +}
> +
>  static void oa_init_groups(struct intel_gt *gt)
>  {
>	int i, num_groups = gt->perf.num_perf_groups;
> @@ -4881,6 +5050,24 @@ static void oa_init_groups(struct intel_gt *gt)
>		g->oa_unit_id = perf->oa_unit_ids++;
>
>		g->gt = gt;
> +		oa_init_regs(gt, i);
> +		g->fw_domains = FORCEWAKE_ALL;
> +		if (i == PERF_GROUP_OAG) {
> +			g->type = TYPE_OAG;
> +
> +			/*
> +			 * Enabling all fw domains for OAG caps the max GT
> +			 * frequency to media FF max. This could be less than
> +			 * what the user sets through the sysfs and perf
> +			 * measurements could be skewed. Since some platforms
> +			 * have separate OAM units to measure media perf, do not
> +			 * enable media fw domains for OAG.
> +			 */
> +			if (HAS_OAM(gt->i915))
> +				g->fw_domains = FORCEWAKE_GT | FORCEWAKE_RENDER;

Is this needed even when media and render are separate tiles, which is the
only case we have in this code right now? For separate tiles setting
FORCEWAKE_ALL should not cap the freq, correct?

If not needed we can get rid of g->fw_domains.

> +		} else {
> +			g->type = TYPE_OAM;

This is wrong, because num_perf_groups is 1. So the type should be assigned
not based on i (which is always 0) but maybe similar to what is done in
oa_init_regs above.

We are escaping because g->type is unused as mentioned below.

> +		}
>	}
>  }
>
> @@ -4970,9 +5157,15 @@ static void oa_init_supported_formats(struct i915_perf *perf)
>		break;
>
>	case INTEL_DG2:
> +		oa_format_add(perf, I915_OAR_FORMAT_A32u40_A4u32_B8_C8);
> +		oa_format_add(perf, I915_OA_FORMAT_A24u40_A14u32_B8_C8);
> +		break;
> +
>	case INTEL_METEORLAKE:
>		oa_format_add(perf, I915_OAR_FORMAT_A32u40_A4u32_B8_C8);
>		oa_format_add(perf, I915_OA_FORMAT_A24u40_A14u32_B8_C8);
> +		oa_format_add(perf, I915_OAM_FORMAT_MPEC8u64_B8_C8);
> +		oa_format_add(perf, I915_OAM_FORMAT_MPEC8u32_B8_C8);
>		break;
>
>	default:
> @@ -5217,8 +5410,10 @@ int i915_perf_ioctl_version(void)
>	 *
>	 * 6: Add DRM_I915_PERF_PROP_OA_ENGINE_CLASS and
>	 *    DRM_I915_PERF_PROP_OA_ENGINE_INSTANCE
> +	 *
> +	 * 7: Add support for video decode and enhancement classes.
>	 */
> -	return 6;
> +	return 7;

Let's figure out if we want to club all this into 6.

>  }
>
>  #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> diff --git a/drivers/gpu/drm/i915/i915_perf_oa_regs.h b/drivers/gpu/drm/i915/i915_perf_oa_regs.h
> index 381d94101610..ba103875e19f 100644
> --- a/drivers/gpu/drm/i915/i915_perf_oa_regs.h
> +++ b/drivers/gpu/drm/i915/i915_perf_oa_regs.h
> @@ -138,4 +138,82 @@
>  #define   GEN12_SQCNT1_PMON_ENABLE		REG_BIT(30)
>  #define   GEN12_SQCNT1_OABPC			REG_BIT(29)
>
> +/* Gen12 OAM unit */
> +#define GEN12_OAM_HEAD_POINTER_OFFSET   (0x1a0)
> +#define  GEN12_OAM_HEAD_POINTER_MASK    0xffffffc0
> +
> +#define GEN12_OAM_TAIL_POINTER_OFFSET   (0x1a4)
> +#define  GEN12_OAM_TAIL_POINTER_MASK    0xffffffc0
> +
> +#define GEN12_OAM_BUFFER_OFFSET         (0x1a8)
> +#define  GEN12_OAM_BUFFER_SIZE_MASK     (0x7)
> +#define  GEN12_OAM_BUFFER_SIZE_SHIFT    (3)
> +#define  GEN12_OAM_BUFFER_MEMORY_SELECT REG_BIT(0) /* 0: PPGTT, 1: GGTT */
> +
> +#define GEN12_OAM_CONTEXT_CONTROL_OFFSET              (0x1bc)
> +#define  GEN12_OAM_CONTEXT_CONTROL_TIMER_PERIOD_SHIFT 2
> +#define  GEN12_OAM_CONTEXT_CONTROL_TIMER_ENABLE       REG_BIT(1)
> +#define  GEN12_OAM_CONTEXT_CONTROL_COUNTER_RESUME     REG_BIT(0)
> +
> +#define GEN12_OAM_CONTROL_OFFSET                (0x194)
> +#define  GEN12_OAM_CONTROL_COUNTER_FORMAT_SHIFT 1
> +#define  GEN12_OAM_CONTROL_COUNTER_ENABLE       REG_BIT(0)
> +
> +#define GEN12_OAM_DEBUG_OFFSET                      (0x198)
> +#define  GEN12_OAM_DEBUG_BUFFER_SIZE_SELECT         REG_BIT(12)
> +#define  GEN12_OAM_DEBUG_INCLUDE_CLK_RATIO          REG_BIT(6)
> +#define  GEN12_OAM_DEBUG_DISABLE_CLK_RATIO_REPORTS  REG_BIT(5)
> +#define  GEN12_OAM_DEBUG_DISABLE_GO_1_0_REPORTS     REG_BIT(2)
> +#define  GEN12_OAM_DEBUG_DISABLE_CTX_SWITCH_REPORTS REG_BIT(1)
> +
> +#define GEN12_OAM_STATUS_OFFSET            (0x19c)
> +#define  GEN12_OAM_STATUS_COUNTER_OVERFLOW REG_BIT(2)
> +#define  GEN12_OAM_STATUS_BUFFER_OVERFLOW  REG_BIT(1)
> +#define  GEN12_OAM_STATUS_REPORT_LOST      REG_BIT(0)
> +
> +#define GEN12_OAM_MMIO_TRG_OFFSET	(0x1d0)
> +
> +#define GEN12_OAM_MMIO_TRG(base) \
> +	_MMIO((base) + GEN12_OAM_MMIO_TRG_OFFSET)
> +
> +#define GEN12_OAM_HEAD_POINTER(base) \
> +	_MMIO((base) + GEN12_OAM_HEAD_POINTER_OFFSET)
> +#define GEN12_OAM_TAIL_POINTER(base) \
> +	_MMIO((base) + GEN12_OAM_TAIL_POINTER_OFFSET)
> +#define GEN12_OAM_BUFFER(base) \
> +	_MMIO((base) + GEN12_OAM_BUFFER_OFFSET)
> +#define GEN12_OAM_CONTEXT_CONTROL(base) \
> +	_MMIO((base) + GEN12_OAM_CONTEXT_CONTROL_OFFSET)
> +#define GEN12_OAM_CONTROL(base) \
> +	_MMIO((base) + GEN12_OAM_CONTROL_OFFSET)
> +#define GEN12_OAM_DEBUG(base) \
> +	_MMIO((base) + GEN12_OAM_DEBUG_OFFSET)
> +#define GEN12_OAM_STATUS(base) \
> +	_MMIO((base) + GEN12_OAM_STATUS_OFFSET)
> +
> +#define GEN12_OAM_CEC0_0_OFFSET		(0x40)
> +#define GEN12_OAM_CEC7_1_OFFSET		(0x7c)
> +#define GEN12_OAM_CEC0_0(base) \
> +	_MMIO((base) + GEN12_OAM_CEC0_0_OFFSET)
> +#define GEN12_OAM_CEC7_1(base) \
> +	_MMIO((base) + GEN12_OAM_CEC7_1_OFFSET)
> +
> +#define GEN12_OAM_STARTTRIG1_OFFSET	(0x00)
> +#define GEN12_OAM_STARTTRIG8_OFFSET	(0x1c)
> +#define GEN12_OAM_STARTTRIG1(base) \
> +	_MMIO((base) + GEN12_OAM_STARTTRIG1_OFFSET)
> +#define GEN12_OAM_STARTTRIG8(base) \
> +	_MMIO((base) + GEN12_OAM_STARTTRIG8_OFFSET)
> +
> +#define GEN12_OAM_REPORTTRIG1_OFFSET	(0x20)
> +#define GEN12_OAM_REPORTTRIG8_OFFSET	(0x3c)
> +#define GEN12_OAM_REPORTTRIG1(base) \
> +	_MMIO((base) + GEN12_OAM_REPORTTRIG1_OFFSET)
> +#define GEN12_OAM_REPORTTRIG8(base) \
> +	_MMIO((base) + GEN12_OAM_REPORTTRIG8_OFFSET)
> +
> +#define GEN12_OAM_PERF_COUNTER_B0_OFFSET	(0x84)
> +#define GEN12_OAM_PERF_COUNTER_B(base, idx) \
> +	_MMIO((base) + GEN12_OAM_PERF_COUNTER_B0_OFFSET + 4 * (idx))
> +
>  #endif /* __INTEL_PERF_OA_REGS__ */
> diff --git a/drivers/gpu/drm/i915/i915_perf_types.h b/drivers/gpu/drm/i915/i915_perf_types.h
> index 8ccb0b89d019..5b2c3bab60f8 100644
> --- a/drivers/gpu/drm/i915/i915_perf_types.h
> +++ b/drivers/gpu/drm/i915/i915_perf_types.h
> @@ -20,6 +20,7 @@
>  #include "gt/intel_engine_types.h"
>  #include "gt/intel_sseu.h"
>  #include "i915_reg_defs.h"
> +#include "intel_uncore.h"
>  #include "intel_wakeref.h"
>
>  struct drm_i915_private;
> @@ -33,6 +34,7 @@ struct intel_engine_cs;
>
>  enum {
>	PERF_GROUP_OAG = 0,
> +	PERF_GROUP_OAM_SAMEDIA_0 = 0,
>
>	PERF_GROUP_MAX,
>	PERF_GROUP_INVALID = U32_MAX,
> @@ -43,9 +45,27 @@ enum report_header {
>	HDR_64_BIT,
>  };
>
> +struct i915_perf_regs {
> +	u32 base;
> +	i915_reg_t oa_head_ptr;
> +	i915_reg_t oa_tail_ptr;
> +	i915_reg_t oa_buffer;
> +	i915_reg_t oa_ctx_ctrl;
> +	i915_reg_t oa_ctrl;
> +	i915_reg_t oa_debug;
> +	i915_reg_t oa_status;
> +	u32 oa_ctrl_counter_format_shift;
> +};
> +
> +enum {

enum oa_type?

> +	TYPE_OAG,
> +	TYPE_OAM,
> +};
> +
>  struct i915_oa_format {
>	u32 format;
>	int size;
> +	int type;
>	enum report_header header;
>  };
>
> @@ -317,6 +337,11 @@ struct i915_perf_stream {
>		 * @tail: The last verified tail that can be read by userspace.
>		 */
>		u32 tail;
> +
> +		/**
> +		 * @group: The group object for this OA buffer.
> +		 */
> +		struct i915_perf_group *group;

Isn't this just stream->engine->oa_group, so let's use that instead rather
than duplicating?

>	} oa_buffer;
>
>	/**
> @@ -431,6 +456,21 @@ struct i915_perf_group {
>	 * @engine_mask: A mask of engines using a single OA buffer.
>	 */
>	intel_engine_mask_t engine_mask;
> +
> +	/*
> +	 * @regs: OA buffer register group for programming the OA unit.
> +	 */
> +	struct i915_perf_regs regs;
> +
> +	/*
> +	 * @type: Type of OA buffer, OAM, OAG etc.

s/OA buffer/OA unit/

> +	 */
> +	int type;

Also (incorrectly) assigned but currently unused unless we make the change
to engine_class_supports_oa_format mentioned above. But I think we should
retain and use this.

> +
> +	/*
> +	 * @fw_domains: forcewake domains required for this group.
> +	 */
> +	enum forcewake_domains fw_domains;

Let's see if we need this.

>  };
>
>  struct i915_perf_gt {
> diff --git a/drivers/gpu/drm/i915/intel_device_info.h b/drivers/gpu/drm/i915/intel_device_info.h
> index 80bda653d61b..45e218327f44 100644
> --- a/drivers/gpu/drm/i915/intel_device_info.h
> +++ b/drivers/gpu/drm/i915/intel_device_info.h
> @@ -166,6 +166,7 @@ enum intel_ppgtt_type {
>	func(has_mslice_steering); \
>	func(has_oa_bpc_reporting); \
>	func(has_oa_slice_contrib_limits); \
> +	func(has_oam); \
>	func(has_one_eu_per_fuse_bit); \
>	func(has_pxp); \
>	func(has_rc6); \
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index b6922b52d85c..70bfa6530dbc 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -2676,6 +2676,10 @@ enum drm_i915_oa_format {
>	I915_OAR_FORMAT_A32u40_A4u32_B8_C8,
>	I915_OA_FORMAT_A24u40_A14u32_B8_C8,
>
> +	/* MTL OAM */
> +	I915_OAM_FORMAT_MPEC8u64_B8_C8,
> +	I915_OAM_FORMAT_MPEC8u32_B8_C8,
> +
>	I915_OA_FORMAT_MAX	    /* non-ABI */
>  };

Thanks.
--
Ashutosh
Umesh Nerlige Ramappa Feb. 25, 2023, 12:58 a.m. UTC | #3
On Thu, Feb 23, 2023 at 12:05:02PM -0800, Dixit, Ashutosh wrote:
>On Thu, 16 Feb 2023 16:58:50 -0800, Umesh Nerlige Ramappa wrote:
>>
>
>Hi Umesh,
>
>> MTL introduces additional OA units dedicated to media use cases. Add
>> support for programming these OA units by passing the media engine class
>> and instance parameters.
>>
>> UMD specific changes for GPUvis support:
>> https://patchwork.freedesktop.org/patch/522827/?series=114023
>> https://patchwork.freedesktop.org/patch/522822/?series=114023
>> https://patchwork.freedesktop.org/patch/522826/?series=114023
>> https://patchwork.freedesktop.org/patch/522828/?series=114023
>> https://patchwork.freedesktop.org/patch/522816/?series=114023
>> https://patchwork.freedesktop.org/patch/522825/?series=114023
>
>General comment about the patch in case I miss something out, as I've
>mentioned previously in general let's try to replace INTEL_METEORLAKE and
>IS_METEORLAKE checks in the patch with:
>
>	if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 70))
>
>So that we don't have to enumerate each platform individually later.

Hmm, I recall that you had already commented about this at some point, 
sorry I missed that. Do you suggest I add this change in places outside 
this patch as well?

>
>> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>> ---
>>  drivers/gpu/drm/i915/i915_drv.h          |   2 +
>>  drivers/gpu/drm/i915/i915_pci.c          |   1 +
>>  drivers/gpu/drm/i915/i915_perf.c         | 247 ++++++++++++++++++++---
>>  drivers/gpu/drm/i915/i915_perf_oa_regs.h |  78 +++++++
>>  drivers/gpu/drm/i915/i915_perf_types.h   |  40 ++++
>>  drivers/gpu/drm/i915/intel_device_info.h |   1 +
>>  include/uapi/drm/i915_drm.h              |   4 +
>>  7 files changed, 347 insertions(+), 26 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> index 0393273faa09..f3cacbf41c86 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -856,6 +856,8 @@ IS_SUBPLATFORM(const struct drm_i915_private *i915,
>>	(INTEL_INFO(dev_priv)->has_oa_bpc_reporting)
>>  #define HAS_OA_SLICE_CONTRIB_LIMITS(dev_priv) \
>>	(INTEL_INFO(dev_priv)->has_oa_slice_contrib_limits)
>> +#define HAS_OAM(dev_priv) \
>> +	(INTEL_INFO(dev_priv)->has_oam)
>>
>>  /*
>>   * Set this flag, when platform requires 64K GTT page sizes or larger for
>> diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
>> index a8d942b16223..621730b6551c 100644
>> --- a/drivers/gpu/drm/i915/i915_pci.c
>> +++ b/drivers/gpu/drm/i915/i915_pci.c
>> @@ -1028,6 +1028,7 @@ static const struct intel_device_info adl_p_info = {
>>	.has_mslice_steering = 1, \
>>	.has_oa_bpc_reporting = 1, \
>>	.has_oa_slice_contrib_limits = 1, \
>> +	.has_oam = 1, \
>>	.has_rc6 = 1, \
>>	.has_reset_engine = 1, \
>>	.has_rps = 1, \
>> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
>> index f028df812067..a57690f4c531 100644
>> --- a/drivers/gpu/drm/i915/i915_perf.c
>> +++ b/drivers/gpu/drm/i915/i915_perf.c
>> @@ -192,6 +192,7 @@
>>   */
>>
>>  #include <linux/anon_inodes.h>
>> +#include <linux/nospec.h>
>>  #include <linux/sizes.h>
>>  #include <linux/uuid.h>
>>
>> @@ -326,6 +327,13 @@ static const struct i915_oa_format oa_formats[I915_OA_FORMAT_MAX] = {
>>	[I915_OA_FORMAT_A32u40_A4u32_B8_C8] = { 5, 256 },
>>	[I915_OAR_FORMAT_A32u40_A4u32_B8_C8]    = { 5, 256 },
>>	[I915_OA_FORMAT_A24u40_A14u32_B8_C8]    = { 5, 256 },
>> +	[I915_OAM_FORMAT_MPEC8u64_B8_C8]	= { 1, 192, TYPE_OAM, HDR_64_BIT },
>> +	[I915_OAM_FORMAT_MPEC8u32_B8_C8]	= { 2, 128, TYPE_OAM, HDR_64_BIT },
>> +};
>> +
>> +/* PERF_GROUP_OAG is unused for oa_base, drop it for mtl */
>
>What does this comment mean?

There are multiple OAM units and the base for each is used to calculate 
the OA regs mmio address. OAG is just one unit with the same addresses 
for the regs, so we don't use this array that initializes the bases for 
OA units. Maybe I will add this in the comment here.

>
>> +static const u32 mtl_oa_base[] = {
>> +	[PERF_GROUP_OAM_SAMEDIA_0] = 0x393000,
>>  };
>>
>>  #define SAMPLE_OA_REPORT      (1<<0)
>> @@ -418,11 +426,17 @@ static void free_oa_config_bo(struct i915_oa_config_bo *oa_bo)
>>	kfree(oa_bo);
>>  }
>>
>> +static inline const
>> +struct i915_perf_regs *__oa_regs(struct i915_perf_stream *stream)
>> +{
>> +	return &stream->oa_buffer.group->regs;
>
>Should just use stream->engine->oa_group->regs, see near the bottom.
>
>> +}
>> +
>>  static u32 gen12_oa_hw_tail_read(struct i915_perf_stream *stream)
>>  {
>>	struct intel_uncore *uncore = stream->uncore;
>>
>> -	return intel_uncore_read(uncore, GEN12_OAG_OATAILPTR) &
>> +	return intel_uncore_read(uncore, __oa_regs(stream)->oa_tail_ptr) &
>>	       GEN12_OAG_OATAILPTR_MASK;
>>  }
>>
>> @@ -886,7 +900,8 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
>>		i915_reg_t oaheadptr;
>>
>>		oaheadptr = GRAPHICS_VER(stream->perf->i915) == 12 ?
>
>>= 12 ?
>
>> -			    GEN12_OAG_OAHEADPTR : GEN8_OAHEADPTR;
>> +			    __oa_regs(stream)->oa_head_ptr :
>> +			    GEN8_OAHEADPTR;
>>
>>		spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
>>
>> @@ -939,7 +954,8 @@ static int gen8_oa_read(struct i915_perf_stream *stream,
>>		return -EIO;
>>
>>	oastatus_reg = GRAPHICS_VER(stream->perf->i915) == 12 ?
>
>>= 12 ?
>
>> -		       GEN12_OAG_OASTATUS : GEN8_OASTATUS;
>> +		       __oa_regs(stream)->oa_status :
>> +		       GEN8_OASTATUS;
>>
>>	oastatus = intel_uncore_read(uncore, oastatus_reg);
>>
>> @@ -1643,16 +1659,46 @@ free_noa_wait(struct i915_perf_stream *stream)
>>	i915_vma_unpin_and_release(&stream->noa_wait, 0);
>>  }
>>
>> +/*
>> + * intel_engine_lookup_user ensures that most of engine specific checks are
>> + * taken care of, however, we can run into a case where the OA unit catering to
>> + * the engine passed by the user is disabled for some reason. In such cases,
>> + * ensure oa unit corresponding to an engine is functional. If there are no
>> + * engines in the group, the unit is disabled.
>> + */
>> +static bool oa_unit_functional(const struct intel_engine_cs *engine)
>> +{
>> +	return engine->oa_group && engine->oa_group->num_engines;
>> +}
>> +
>>  static bool engine_supports_oa(const struct intel_engine_cs *engine)
>>  {
>>	enum intel_platform platform = INTEL_INFO(engine->i915)->platform;
>>
>>	switch (platform) {
>> +	case INTEL_METEORLAKE:
>> +		return engine->class == RENDER_CLASS ||
>> +		       ((engine->class == VIDEO_DECODE_CLASS ||
>> +			 engine->class == VIDEO_ENHANCEMENT_CLASS) &&
>> +			engine->gt->type == GT_MEDIA);
>>	default:
>>		return engine->class == RENDER_CLASS;
>>	}
>
>As mentioned in a previous patch, this could just be:
>
>	return engine->oa_group;
>
>Because all these checks have already been done when the perf groups were
>initialized so let's use that, as is done for oa_unit_functional.
>
>Though, caution, to return engine->oa_group we'd have to remove
>engine_supports_oa from __oa_engine_group, since engine->oa_group is not
>yet assigned there. But I think the engine_supports_oa check in
>__oa_engine_group is a duplication and should be removed.
>
>>  }
>>
>> +static bool engine_class_supports_oa_format(struct intel_engine_cs *engine, int type)
>> +{
>> +	switch (engine->class) {
>> +	case RENDER_CLASS:
>> +		return type == TYPE_OAG;
>> +	case VIDEO_DECODE_CLASS:
>> +	case VIDEO_ENHANCEMENT_CLASS:
>> +		return type == TYPE_OAM;
>> +	default:
>> +		return false;
>> +	}
>> +}
>> +
>
>Again, how about:
>
>	return engine->oa_group && engine->oa_group->type == type;
>
>Otherwise as mentioned below oa_group->type is unused and also incorrectly
>assigned at present. The format type and group types are the same
>(TYPE_OAG/TYPE_OAM). Can name the function engine_supports_oa_format.
>
>>  static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
>>  {
>>	struct i915_perf *perf = stream->perf;
>> @@ -1680,7 +1726,7 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
>>		drm_WARN_ON(&gt->i915->drm,
>>			    intel_guc_slpc_unset_gucrc_mode(&gt->uc.guc.slpc));
>>
>> -	intel_uncore_forcewake_put(stream->uncore, FORCEWAKE_ALL);
>> +	intel_uncore_forcewake_put(stream->uncore, g->fw_domains);
>>	intel_engine_pm_put(stream->engine);
>>
>>	if (stream->ctx)
>> @@ -1804,8 +1850,8 @@ static void gen12_init_oa_buffer(struct i915_perf_stream *stream)
>>
>>	spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
>>
>> -	intel_uncore_write(uncore, GEN12_OAG_OASTATUS, 0);
>> -	intel_uncore_write(uncore, GEN12_OAG_OAHEADPTR,
>> +	intel_uncore_write(uncore, __oa_regs(stream)->oa_status, 0);
>> +	intel_uncore_write(uncore, __oa_regs(stream)->oa_head_ptr,
>>			   gtt_offset & GEN12_OAG_OAHEADPTR_MASK);
>>	stream->oa_buffer.head = gtt_offset;
>>
>> @@ -1817,9 +1863,9 @@ static void gen12_init_oa_buffer(struct i915_perf_stream *stream)
>>	 *  to enable proper functionality of the overflow
>>	 *  bit."
>>	 */
>> -	intel_uncore_write(uncore, GEN12_OAG_OABUFFER, gtt_offset |
>> +	intel_uncore_write(uncore, __oa_regs(stream)->oa_buffer, gtt_offset |
>>			   OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT);
>> -	intel_uncore_write(uncore, GEN12_OAG_OATAILPTR,
>> +	intel_uncore_write(uncore, __oa_regs(stream)->oa_tail_ptr,
>>			   gtt_offset & GEN12_OAG_OATAILPTR_MASK);
>>
>>	/* Mark that we need updated tail pointers to read from... */
>> @@ -2579,7 +2625,8 @@ gen8_modify_self(struct intel_context *ce,
>>	return err;
>>  }
>>
>> -static int gen8_configure_context(struct i915_gem_context *ctx,
>> +static int gen8_configure_context(struct i915_perf_stream *stream,
>> +				  struct i915_gem_context *ctx,
>>				  struct flex *flex, unsigned int count)
>>  {
>>	struct i915_gem_engines_iter it;
>> @@ -2589,7 +2636,8 @@ static int gen8_configure_context(struct i915_gem_context *ctx,
>>	for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it) {
>>		GEM_BUG_ON(ce == ce->engine->kernel_context);
>>
>> -		if (!engine_supports_oa(ce->engine))
>> +		if (!engine_supports_oa(ce->engine) ||
>> +		    ce->engine->class != stream->engine->class)
>>			continue;
>>
>>		/* Otherwise OA settings will be set upon first use */
>> @@ -2720,7 +2768,7 @@ oa_configure_all_contexts(struct i915_perf_stream *stream,
>>
>>		spin_unlock(&i915->gem.contexts.lock);
>>
>> -		err = gen8_configure_context(ctx, regs, num_regs);
>> +		err = gen8_configure_context(stream, ctx, regs, num_regs);
>>		if (err) {
>>			i915_gem_context_put(ctx);
>>			return err;
>> @@ -2740,7 +2788,8 @@ oa_configure_all_contexts(struct i915_perf_stream *stream,
>>	for_each_uabi_engine(engine, i915) {
>>		struct intel_context *ce = engine->kernel_context;
>>
>> -		if (!engine_supports_oa(ce->engine))
>> +		if (!engine_supports_oa(ce->engine) ||
>> +		    ce->engine->class != stream->engine->class)
>>			continue;
>>
>>		regs[0].value = intel_sseu_make_rpcs(engine->gt, &ce->sseu);
>> @@ -2765,6 +2814,9 @@ gen12_configure_all_contexts(struct i915_perf_stream *stream,
>>		},
>>	};
>>
>> +	if (stream->engine->class != RENDER_CLASS)
>> +		return 0;
>
>OK, this is for render, nothing equivalent needed for media?

Media engines decided not to have anything configured in the CS 
contexts, rather everything is saved/restored in power context 
transitions, so nothing to be done here.

>
>> +
>>	return oa_configure_all_contexts(stream,
>>					 regs, ARRAY_SIZE(regs),
>>					 active);
>> @@ -2894,7 +2946,7 @@ gen12_enable_metric_set(struct i915_perf_stream *stream,
>>				   _MASKED_BIT_ENABLE(GEN12_DISABLE_DOP_GATING));
>>	}
>>
>> -	intel_uncore_write(uncore, GEN12_OAG_OA_DEBUG,
>> +	intel_uncore_write(uncore, __oa_regs(stream)->oa_debug,
>>			   /* Disable clk ratio reports, like previous Gens. */
>>			   _MASKED_BIT_ENABLE(GEN12_OAG_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS |
>>					      GEN12_OAG_OA_DEBUG_INCLUDE_CLK_RATIO) |
>> @@ -2904,7 +2956,7 @@ gen12_enable_metric_set(struct i915_perf_stream *stream,
>>			    */
>>			   oag_report_ctx_switches(stream));
>>
>> -	intel_uncore_write(uncore, GEN12_OAG_OAGLBCTXCTRL, periodic ?
>> +	intel_uncore_write(uncore, __oa_regs(stream)->oa_ctx_ctrl, periodic ?
>>			   (GEN12_OAG_OAGLBCTXCTRL_COUNTER_RESUME |
>>			    GEN12_OAG_OAGLBCTXCTRL_TIMER_ENABLE |
>>			    (period_exponent << GEN12_OAG_OAGLBCTXCTRL_TIMER_PERIOD_SHIFT))
>> @@ -3058,8 +3110,8 @@ static void gen8_oa_enable(struct i915_perf_stream *stream)
>>
>>  static void gen12_oa_enable(struct i915_perf_stream *stream)
>>  {
>> -	struct intel_uncore *uncore = stream->uncore;
>> -	u32 report_format = stream->oa_buffer.format->format;
>> +	const struct i915_perf_regs *regs;
>> +	u32 val;
>>
>>	/*
>>	 * If we don't want OA reports from the OA buffer, then we don't even
>> @@ -3070,9 +3122,11 @@ static void gen12_oa_enable(struct i915_perf_stream *stream)
>>
>>	gen12_init_oa_buffer(stream);
>>
>> -	intel_uncore_write(uncore, GEN12_OAG_OACONTROL,
>> -			   (report_format << GEN12_OAG_OACONTROL_OA_COUNTER_FORMAT_SHIFT) |
>> -			   GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE);
>> +	regs = __oa_regs(stream);
>> +	val = (stream->oa_buffer.format->format << regs->oa_ctrl_counter_format_shift) |
>> +	      GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE;
>> +
>> +	intel_uncore_write(stream->uncore, regs->oa_ctrl, val);
>>  }
>>
>>  /**
>> @@ -3124,9 +3178,9 @@ static void gen12_oa_disable(struct i915_perf_stream *stream)
>>  {
>>	struct intel_uncore *uncore = stream->uncore;
>>
>> -	intel_uncore_write(uncore, GEN12_OAG_OACONTROL, 0);
>> +	intel_uncore_write(uncore, __oa_regs(stream)->oa_ctrl, 0);
>>	if (intel_wait_for_register(uncore,
>> -				    GEN12_OAG_OACONTROL,
>> +				    __oa_regs(stream)->oa_ctrl,
>>				    GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE, 0,
>>				    50))
>>		drm_err(&stream->perf->i915->drm,
>> @@ -3329,6 +3383,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
>>
>>	stream->sample_size = sizeof(struct drm_i915_perf_record_header);
>>
>> +	stream->oa_buffer.group = g;
>
>Should just use stream->engine->oa_group, see near the bottom.
>
>>	stream->oa_buffer.format = &perf->oa_formats[props->oa_format];
>>	if (drm_WARN_ON(&i915->drm, stream->oa_buffer.format->size == 0))
>>		return -EINVAL;
>> @@ -3379,7 +3434,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
>>	 *   references will effectively disable RC6.
>>	 */
>>	intel_engine_pm_get(stream->engine);
>> -	intel_uncore_forcewake_get(stream->uncore, FORCEWAKE_ALL);
>> +	intel_uncore_forcewake_get(stream->uncore, g->fw_domains);
>>
>>	/*
>>	 * Wa_16011777198:dg2: GuC resets render as part of the Wa. This causes
>> @@ -3440,7 +3495,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
>>		intel_guc_slpc_unset_gucrc_mode(&gt->uc.guc.slpc);
>>
>>  err_gucrc:
>> -	intel_uncore_forcewake_put(stream->uncore, FORCEWAKE_ALL);
>> +	intel_uncore_forcewake_put(stream->uncore, g->fw_domains);
>>	intel_engine_pm_put(stream->engine);
>>
>>	free_oa_configs(stream);
>> @@ -4033,6 +4088,7 @@ static int read_properties_unlocked(struct i915_perf *perf,
>>				    struct perf_open_properties *props)
>>  {
>>	struct drm_i915_gem_context_param_sseu user_sseu;
>> +	const struct i915_oa_format *f;
>>	u64 __user *uprop = uprops;
>>	bool config_sseu = false;
>>	u8 class, instance;
>> @@ -4203,6 +4259,17 @@ static int read_properties_unlocked(struct i915_perf *perf,
>>	if (!engine_supports_oa(props->engine))
>>		return -EINVAL;
>>
>> +	if (!oa_unit_functional(props->engine))
>> +		return -ENODEV;
>> +
>> +	i = array_index_nospec(props->oa_format, I915_OA_FORMAT_MAX);
>
>Why do we need this (something to do with speculation)? Can just do
>'&perf->oa_formats[props->oa_format]' below? The format passed in has
>already been checked in the switch statement above.

Traced it back to "smatch cleanups" commit in rebase history. Something 
to do with static analysis. If not a major concern, I would leave it as 
is.

>
>> +	f = &perf->oa_formats[i];
>> +	if (!engine_class_supports_oa_format(props->engine, f->type)) {
>> +		DRM_DEBUG("Invalid OA format %d for class %d\n",
>> +			  f->type, props->engine->class);
>> +		return -EINVAL;
>> +	}
>> +
>>	if (config_sseu) {
>>		ret = get_sseu_config(&props->sseu, props->engine, &user_sseu);
>>		if (ret) {
>> @@ -4383,6 +4450,14 @@ static const struct i915_range gen12_oa_b_counters[] = {
>>	{}
>>  };
>>
>> +static const struct i915_range mtl_oam_b_counters[] = {
>> +	{ .start = 0x393000, .end = 0x39301c },	/* GEN12_OAM_STARTTRIG1[1-8] */
>> +	{ .start = 0x393020, .end = 0x39303c },	/* GEN12_OAM_REPORTTRIG1[1-8] */
>> +	{ .start = 0x393040, .end = 0x39307c },	/* GEN12_OAM_CEC[0-7][0-1] */
>> +	{ .start = 0x393200, .end = 0x39323C },	/* MPES[0-7] */
>> +	{}
>> +};
>> +
>>  static const struct i915_range xehp_oa_b_counters[] = {
>>	{ .start = 0xdc48, .end = 0xdc48 },	/* OAA_ENABLE_REG */
>>	{ .start = 0xdd00, .end = 0xdd48 },	/* OAG_LCE0_0 - OAA_LENABLE_REG */
>> @@ -4429,13 +4504,16 @@ static const struct i915_range gen12_oa_mux_regs[] = {
>>
>>  /*
>>   * Ref: 14010536224:
>> - * 0x20cc is repurposed on MTL, so use a separate array for MTL.
>> + * 0x20cc is repurposed on MTL, so use a separate array for MTL. Also add the
>> + * MPES/MPEC registers.
>
>MPES/MPEC registers are added above now, not here so maybe get rid of the
>comment change above?
>
>>   */
>>  static const struct i915_range mtl_oa_mux_regs[] = {
>>	{ .start = 0x0d00, .end = 0x0d04 },	/* RPM_CONFIG[0-1] */
>>	{ .start = 0x0d0c, .end = 0x0d2c },	/* NOA_CONFIG[0-8] */
>>	{ .start = 0x9840, .end = 0x9840 },	/* GDT_CHICKEN_BITS */
>>	{ .start = 0x9884, .end = 0x9888 },	/* NOA_WRITE */
>> +	{ .start = 0x38d100, .end = 0x38d114},	/* VISACTL */
>> +	{}
>>  };
>>
>>  static bool gen7_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
>> @@ -4473,10 +4551,26 @@ static bool gen12_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
>>	return reg_in_range_table(addr, gen12_oa_b_counters);
>>  }
>>
>> +static bool xehp_is_valid_oam_b_counter_addr(struct i915_perf *perf, u32 addr)
>> +{
>> +	enum intel_platform platform = INTEL_INFO(perf->i915)->platform;
>> +
>> +	if (!HAS_OAM(perf->i915))
>> +		return false;
>> +
>> +	switch (platform) {
>> +	case INTEL_METEORLAKE:
>> +		return reg_in_range_table(addr, mtl_oam_b_counters);
>> +	default:
>> +		return false;
>> +	}
>
>Replace with 'if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 70))', registers
>are identical in later platforms too.
>
>Should the function prefix be xehp or mtl? Don't see xehp in bspec,
>probably xehp is discontinued.
>
>> +}
>> +
>>  static bool xehp_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
>>  {
>>	return reg_in_range_table(addr, xehp_oa_b_counters) ||
>> -		reg_in_range_table(addr, gen12_oa_b_counters);
>> +		reg_in_range_table(addr, gen12_oa_b_counters) ||
>> +		xehp_is_valid_oam_b_counter_addr(perf, addr);
>>  }
>>
>>  static bool gen12_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
>> @@ -4846,11 +4940,39 @@ static u32 __num_perf_groups_per_gt(struct intel_gt *gt)
>>	enum intel_platform platform = INTEL_INFO(gt->i915)->platform;
>>
>>	switch (platform) {
>> +	case INTEL_METEORLAKE:
>> +		return 1;
>
>I don't think we need this, as proposed previously maybe the function
>should just unconditionally return 1.
>
>>	default:
>>		return 1;
>>	}
>>  }
>>
>> +static u32 __oam_engine_group(struct intel_engine_cs *engine)
>> +{
>> +	enum intel_platform platform = INTEL_INFO(engine->i915)->platform;
>> +	struct intel_gt *gt = engine->gt;
>> +	u32 group = PERF_GROUP_INVALID;
>> +
>> +	switch (platform) {
>> +	case INTEL_METEORLAKE:
>
>Replace here with 'if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 70))'.
>
>> +		/*
>> +		 * There's 1 SAMEDIA gt and 1 OAM per SAMEDIA gt. All media slices
>> +		 * within the gt use the same OAM. All MTL SKUs list 1 SA MEDIA.
>> +		 */
>> +		drm_WARN_ON(&engine->i915->drm,
>> +			    engine->gt->type != GT_MEDIA);
>> +
>> +		group = PERF_GROUP_OAM_SAMEDIA_0;
>> +		break;
>> +	default:
>> +		break;
>> +	}
>> +
>> +	drm_WARN_ON(&gt->i915->drm, group >= __num_perf_groups_per_gt(gt));
>> +
>> +	return group;
>> +}
>> +
>>  static u32 __oa_engine_group(struct intel_engine_cs *engine)
>>  {
>>	if (!engine_supports_oa(engine))
>
>As mentioned above for engine_supports_oa, this looks like a duplication of
>the checks below and should probably be removed.

I recall I added this because on older platforms, we were seeing a 
warnON splat from __oam_engine_group because those platforms had a media 
engine. Ideally, I would implement all your inputs in this patch and 
drop the warnON from __oam_engine_group. Is that okay?

>
>> @@ -4860,11 +4982,58 @@ static u32 __oa_engine_group(struct intel_engine_cs *engine)
>>	case RENDER_CLASS:
>>		return PERF_GROUP_OAG;
>>
>> +	case VIDEO_DECODE_CLASS:
>> +	case VIDEO_ENHANCEMENT_CLASS:
>> +		return __oam_engine_group(engine);
>> +
>>	default:
>>		return PERF_GROUP_INVALID;
>>	}
>>  }
>>
>> +static struct i915_perf_regs __oam_regs(u32 base)
>> +{
>> +	return (struct i915_perf_regs) {
>> +		base,
>> +		GEN12_OAM_HEAD_POINTER(base),
>> +		GEN12_OAM_TAIL_POINTER(base),
>> +		GEN12_OAM_BUFFER(base),
>> +		GEN12_OAM_CONTEXT_CONTROL(base),
>> +		GEN12_OAM_CONTROL(base),
>> +		GEN12_OAM_DEBUG(base),
>> +		GEN12_OAM_STATUS(base),
>> +		GEN12_OAM_CONTROL_COUNTER_FORMAT_SHIFT,
>> +	};
>> +}
>> +
>> +static struct i915_perf_regs __oag_regs(void)
>> +{
>> +	return (struct i915_perf_regs) {
>> +		0,
>> +		GEN12_OAG_OAHEADPTR,
>> +		GEN12_OAG_OATAILPTR,
>> +		GEN12_OAG_OABUFFER,
>> +		GEN12_OAG_OAGLBCTXCTRL,
>> +		GEN12_OAG_OACONTROL,
>> +		GEN12_OAG_OA_DEBUG,
>> +		GEN12_OAG_OASTATUS,
>> +		GEN12_OAG_OACONTROL_OA_COUNTER_FORMAT_SHIFT,
>> +	};
>> +}
>> +
>> +static void oa_init_regs(struct intel_gt *gt, u32 id)
>> +{
>> +	struct i915_perf_group *group = &gt->perf.group[id];
>> +	struct i915_perf_regs *regs = &group->regs;
>> +
>> +	if (id == PERF_GROUP_OAG && gt->type != GT_MEDIA)
>> +		*regs = __oag_regs();
>> +	else if (IS_METEORLAKE(gt->i915))
>
>Replace with 'if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 70))', OAM
>registers are identical in later platforms too. Maybe get rid of drm_WARN
>below?
>
>> +		*regs = __oam_regs(mtl_oa_base[id]);
>> +	else
>> +		drm_WARN(&gt->i915->drm, 1, "Unsupported platform for OA\n");
>> +}
>> +
>>  static void oa_init_groups(struct intel_gt *gt)
>>  {
>>	int i, num_groups = gt->perf.num_perf_groups;
>> @@ -4881,6 +5050,24 @@ static void oa_init_groups(struct intel_gt *gt)
>>		g->oa_unit_id = perf->oa_unit_ids++;
>>
>>		g->gt = gt;
>> +		oa_init_regs(gt, i);
>> +		g->fw_domains = FORCEWAKE_ALL;
>> +		if (i == PERF_GROUP_OAG) {
>> +			g->type = TYPE_OAG;
>> +
>> +			/*
>> +			 * Enabling all fw domains for OAG caps the max GT
>> +			 * frequency to media FF max. This could be less than
>> +			 * what the user sets through the sysfs and perf
>> +			 * measurements could be skewed. Since some platforms
>> +			 * have separate OAM units to measure media perf, do not
>> +			 * enable media fw domains for OAG.
>> +			 */
>> +			if (HAS_OAM(gt->i915))
>> +				g->fw_domains = FORCEWAKE_GT | FORCEWAKE_RENDER;
>
>Is this needed even when media and render are separate tiles, which is the
>only case we have in this code right now? For separate tiles setting
>FORCEWAKE_ALL should not cap the freq, correct?
>
>If not needed we can get rid of g->fw_domains.
>
>> +		} else {
>> +			g->type = TYPE_OAM;
>
>This is wrong, because num_perf_groups is 1. So the type should be assigned
>not based on i (which is always 0) but maybe similar to what is done in
>oa_init_regs above.

That's a bug. Will fix. It was working as expected for some other 
platforms that had OAM, but with PERF_GROUP_OAM_SAMEDIA_0, it broke.

Thanks,
Umesh
Ashutosh Dixit Feb. 25, 2023, 3:58 a.m. UTC | #4
On Fri, 24 Feb 2023 16:58:39 -0800, Umesh Nerlige Ramappa wrote:
>

Hi Umesh,

> On Thu, Feb 23, 2023 at 12:05:02PM -0800, Dixit, Ashutosh wrote:
> > On Thu, 16 Feb 2023 16:58:50 -0800, Umesh Nerlige Ramappa wrote:
> >>
> >
> > Hi Umesh,
> >
> >> MTL introduces additional OA units dedicated to media use cases. Add
> >> support for programming these OA units by passing the media engine class
> >> and instance parameters.
> >>
> >> UMD specific changes for GPUvis support:
> >> https://patchwork.freedesktop.org/patch/522827/?series=114023
> >> https://patchwork.freedesktop.org/patch/522822/?series=114023
> >> https://patchwork.freedesktop.org/patch/522826/?series=114023
> >> https://patchwork.freedesktop.org/patch/522828/?series=114023
> >> https://patchwork.freedesktop.org/patch/522816/?series=114023
> >> https://patchwork.freedesktop.org/patch/522825/?series=114023
> >
> > General comment about the patch in case I miss something out, as I've
> > mentioned previously in general let's try to replace INTEL_METEORLAKE and
> > IS_METEORLAKE checks in the patch with:
> >
> >	if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 70))
> >
> > So that we don't have to enumerate each platform individually later.
>
> Hmm, I recall that you had already commented about this at some point,
> sorry I missed that. Do you suggest I add this change in places outside
> this patch as well?

Not needed unless you want to add another patch. At least in this patch
let's not start doing this in places where it did not exist previously.

> >> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> >> ---
> >>  drivers/gpu/drm/i915/i915_drv.h          |   2 +
> >>  drivers/gpu/drm/i915/i915_pci.c          |   1 +
> >>  drivers/gpu/drm/i915/i915_perf.c         | 247 ++++++++++++++++++++---
> >>  drivers/gpu/drm/i915/i915_perf_oa_regs.h |  78 +++++++
> >>  drivers/gpu/drm/i915/i915_perf_types.h   |  40 ++++
> >>  drivers/gpu/drm/i915/intel_device_info.h |   1 +
> >>  include/uapi/drm/i915_drm.h              |   4 +
> >>  7 files changed, 347 insertions(+), 26 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> >> index 0393273faa09..f3cacbf41c86 100644
> >> --- a/drivers/gpu/drm/i915/i915_drv.h
> >> +++ b/drivers/gpu/drm/i915/i915_drv.h
> >> @@ -856,6 +856,8 @@ IS_SUBPLATFORM(const struct drm_i915_private *i915,
> >>	(INTEL_INFO(dev_priv)->has_oa_bpc_reporting)
> >>  #define HAS_OA_SLICE_CONTRIB_LIMITS(dev_priv) \
> >>	(INTEL_INFO(dev_priv)->has_oa_slice_contrib_limits)
> >> +#define HAS_OAM(dev_priv) \
> >> +	(INTEL_INFO(dev_priv)->has_oam)
> >>
> >>  /*
> >>   * Set this flag, when platform requires 64K GTT page sizes or larger for
> >> diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
> >> index a8d942b16223..621730b6551c 100644
> >> --- a/drivers/gpu/drm/i915/i915_pci.c
> >> +++ b/drivers/gpu/drm/i915/i915_pci.c
> >> @@ -1028,6 +1028,7 @@ static const struct intel_device_info adl_p_info = {
> >>	.has_mslice_steering = 1, \
> >>	.has_oa_bpc_reporting = 1, \
> >>	.has_oa_slice_contrib_limits = 1, \
> >> +	.has_oam = 1, \
> >>	.has_rc6 = 1, \
> >>	.has_reset_engine = 1, \
> >>	.has_rps = 1, \
> >> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> >> index f028df812067..a57690f4c531 100644
> >> --- a/drivers/gpu/drm/i915/i915_perf.c
> >> +++ b/drivers/gpu/drm/i915/i915_perf.c
> >> @@ -192,6 +192,7 @@
> >>   */
> >>
> >>  #include <linux/anon_inodes.h>
> >> +#include <linux/nospec.h>
> >>  #include <linux/sizes.h>
> >>  #include <linux/uuid.h>
> >>
> >> @@ -326,6 +327,13 @@ static const struct i915_oa_format oa_formats[I915_OA_FORMAT_MAX] = {
> >>	[I915_OA_FORMAT_A32u40_A4u32_B8_C8] = { 5, 256 },
> >>	[I915_OAR_FORMAT_A32u40_A4u32_B8_C8]    = { 5, 256 },
> >>	[I915_OA_FORMAT_A24u40_A14u32_B8_C8]    = { 5, 256 },
> >> +	[I915_OAM_FORMAT_MPEC8u64_B8_C8]	= { 1, 192, TYPE_OAM, HDR_64_BIT },
> >> +	[I915_OAM_FORMAT_MPEC8u32_B8_C8]	= { 2, 128, TYPE_OAM, HDR_64_BIT },
> >> +};
> >> +
> >> +/* PERF_GROUP_OAG is unused for oa_base, drop it for mtl */
> >
> > What does this comment mean?
>
> There are multiple OAM units and the base for each is used to calculate the
> OA regs mmio address. OAG is just one unit with the same addresses for the
> regs, so we don't use this array that initializes the bases for OA
> units. Maybe I will add this in the comment here.

You mean base is 0 in __oag_regs() correct? And index 0 corresponds to
PERF_GROUP_OAM_SAMEDIA_0 not to PERF_GROUP_OAG? Either drop the comment,
the code is clear enough I think, or make it clearer.

>
> >
> >> +static const u32 mtl_oa_base[] = {
> >> +	[PERF_GROUP_OAM_SAMEDIA_0] = 0x393000,
> >>  };
> >>
> >>  #define SAMPLE_OA_REPORT      (1<<0)
> >> @@ -418,11 +426,17 @@ static void free_oa_config_bo(struct i915_oa_config_bo *oa_bo)
> >>	kfree(oa_bo);
> >>  }
> >>
> >> +static inline const
> >> +struct i915_perf_regs *__oa_regs(struct i915_perf_stream *stream)
> >> +{
> >> +	return &stream->oa_buffer.group->regs;
> >
> > Should just use stream->engine->oa_group->regs, see near the bottom.
> >
> >> +}
> >> +
> >>  static u32 gen12_oa_hw_tail_read(struct i915_perf_stream *stream)
> >>  {
> >>	struct intel_uncore *uncore = stream->uncore;
> >>
> >> -	return intel_uncore_read(uncore, GEN12_OAG_OATAILPTR) &
> >> +	return intel_uncore_read(uncore, __oa_regs(stream)->oa_tail_ptr) &
> >>	       GEN12_OAG_OATAILPTR_MASK;
> >>  }
> >>
> >> @@ -886,7 +900,8 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
> >>		i915_reg_t oaheadptr;
> >>
> >>		oaheadptr = GRAPHICS_VER(stream->perf->i915) == 12 ?
> >
> >> = 12 ?
> >
> >> -			    GEN12_OAG_OAHEADPTR : GEN8_OAHEADPTR;
> >> +			    __oa_regs(stream)->oa_head_ptr :
> >> +			    GEN8_OAHEADPTR;
> >>
> >>		spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
> >>
> >> @@ -939,7 +954,8 @@ static int gen8_oa_read(struct i915_perf_stream *stream,
> >>		return -EIO;
> >>
> >>	oastatus_reg = GRAPHICS_VER(stream->perf->i915) == 12 ?
> >
> >> = 12 ?
> >
> >> -		       GEN12_OAG_OASTATUS : GEN8_OASTATUS;
> >> +		       __oa_regs(stream)->oa_status :
> >> +		       GEN8_OASTATUS;
> >>
> >>	oastatus = intel_uncore_read(uncore, oastatus_reg);
> >>
> >> @@ -1643,16 +1659,46 @@ free_noa_wait(struct i915_perf_stream *stream)
> >>	i915_vma_unpin_and_release(&stream->noa_wait, 0);
> >>  }
> >>
> >> +/*
> >> + * intel_engine_lookup_user ensures that most of engine specific checks are
> >> + * taken care of, however, we can run into a case where the OA unit catering to
> >> + * the engine passed by the user is disabled for some reason. In such cases,
> >> + * ensure oa unit corresponding to an engine is functional. If there are no
> >> + * engines in the group, the unit is disabled.
> >> + */
> >> +static bool oa_unit_functional(const struct intel_engine_cs *engine)
> >> +{
> >> +	return engine->oa_group && engine->oa_group->num_engines;
> >> +}
> >> +
> >>  static bool engine_supports_oa(const struct intel_engine_cs *engine)
> >>  {
> >>	enum intel_platform platform = INTEL_INFO(engine->i915)->platform;
> >>
> >>	switch (platform) {
> >> +	case INTEL_METEORLAKE:
> >> +		return engine->class == RENDER_CLASS ||
> >> +		       ((engine->class == VIDEO_DECODE_CLASS ||
> >> +			 engine->class == VIDEO_ENHANCEMENT_CLASS) &&
> >> +			engine->gt->type == GT_MEDIA);
> >>	default:
> >>		return engine->class == RENDER_CLASS;
> >>	}
> >
> > As mentioned in a previous patch, this could just be:
> >
> >	return engine->oa_group;
> >
> > Because all these checks have already been done when the perf groups were
> > initialized so let's use that, as is done for oa_unit_functional.
> >
> > Though, caution, to return engine->oa_group we'd have to remove
> > engine_supports_oa from __oa_engine_group, since engine->oa_group is not
> > yet assigned there. But I think the engine_supports_oa check in
> > __oa_engine_group is a duplication and should be removed.
> >
> >>  }
> >>
> >> +static bool engine_class_supports_oa_format(struct intel_engine_cs *engine, int type)
> >> +{
> >> +	switch (engine->class) {
> >> +	case RENDER_CLASS:
> >> +		return type == TYPE_OAG;
> >> +	case VIDEO_DECODE_CLASS:
> >> +	case VIDEO_ENHANCEMENT_CLASS:
> >> +		return type == TYPE_OAM;
> >> +	default:
> >> +		return false;
> >> +	}
> >> +}
> >> +
> >
> > Again, how about:
> >
> >	return engine->oa_group && engine->oa_group->type == type;
> >
> > Otherwise as mentioned below oa_group->type is unused and also incorrectly
> > assigned at present. The format type and group types are the same
> > (TYPE_OAG/TYPE_OAM). Can name the function engine_supports_oa_format.
> >
> >>  static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
> >>  {
> >>	struct i915_perf *perf = stream->perf;
> >> @@ -1680,7 +1726,7 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
> >>		drm_WARN_ON(&gt->i915->drm,
> >>			    intel_guc_slpc_unset_gucrc_mode(&gt->uc.guc.slpc));
> >>
> >> -	intel_uncore_forcewake_put(stream->uncore, FORCEWAKE_ALL);
> >> +	intel_uncore_forcewake_put(stream->uncore, g->fw_domains);
> >>	intel_engine_pm_put(stream->engine);
> >>
> >>	if (stream->ctx)
> >> @@ -1804,8 +1850,8 @@ static void gen12_init_oa_buffer(struct i915_perf_stream *stream)
> >>
> >>	spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
> >>
> >> -	intel_uncore_write(uncore, GEN12_OAG_OASTATUS, 0);
> >> -	intel_uncore_write(uncore, GEN12_OAG_OAHEADPTR,
> >> +	intel_uncore_write(uncore, __oa_regs(stream)->oa_status, 0);
> >> +	intel_uncore_write(uncore, __oa_regs(stream)->oa_head_ptr,
> >>			   gtt_offset & GEN12_OAG_OAHEADPTR_MASK);
> >>	stream->oa_buffer.head = gtt_offset;
> >>
> >> @@ -1817,9 +1863,9 @@ static void gen12_init_oa_buffer(struct i915_perf_stream *stream)
> >>	 *  to enable proper functionality of the overflow
> >>	 *  bit."
> >>	 */
> >> -	intel_uncore_write(uncore, GEN12_OAG_OABUFFER, gtt_offset |
> >> +	intel_uncore_write(uncore, __oa_regs(stream)->oa_buffer, gtt_offset |
> >>			   OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT);
> >> -	intel_uncore_write(uncore, GEN12_OAG_OATAILPTR,
> >> +	intel_uncore_write(uncore, __oa_regs(stream)->oa_tail_ptr,
> >>			   gtt_offset & GEN12_OAG_OATAILPTR_MASK);
> >>
> >>	/* Mark that we need updated tail pointers to read from... */
> >> @@ -2579,7 +2625,8 @@ gen8_modify_self(struct intel_context *ce,
> >>	return err;
> >>  }
> >>
> >> -static int gen8_configure_context(struct i915_gem_context *ctx,
> >> +static int gen8_configure_context(struct i915_perf_stream *stream,
> >> +				  struct i915_gem_context *ctx,
> >>				  struct flex *flex, unsigned int count)
> >>  {
> >>	struct i915_gem_engines_iter it;
> >> @@ -2589,7 +2636,8 @@ static int gen8_configure_context(struct i915_gem_context *ctx,
> >>	for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it) {
> >>		GEM_BUG_ON(ce == ce->engine->kernel_context);
> >>
> >> -		if (!engine_supports_oa(ce->engine))
> >> +		if (!engine_supports_oa(ce->engine) ||
> >> +		    ce->engine->class != stream->engine->class)
> >>			continue;
> >>
> >>		/* Otherwise OA settings will be set upon first use */
> >> @@ -2720,7 +2768,7 @@ oa_configure_all_contexts(struct i915_perf_stream *stream,
> >>
> >>		spin_unlock(&i915->gem.contexts.lock);
> >>
> >> -		err = gen8_configure_context(ctx, regs, num_regs);
> >> +		err = gen8_configure_context(stream, ctx, regs, num_regs);
> >>		if (err) {
> >>			i915_gem_context_put(ctx);
> >>			return err;
> >> @@ -2740,7 +2788,8 @@ oa_configure_all_contexts(struct i915_perf_stream *stream,
> >>	for_each_uabi_engine(engine, i915) {
> >>		struct intel_context *ce = engine->kernel_context;
> >>
> >> -		if (!engine_supports_oa(ce->engine))
> >> +		if (!engine_supports_oa(ce->engine) ||
> >> +		    ce->engine->class != stream->engine->class)
> >>			continue;
> >>
> >>		regs[0].value = intel_sseu_make_rpcs(engine->gt, &ce->sseu);
> >> @@ -2765,6 +2814,9 @@ gen12_configure_all_contexts(struct i915_perf_stream *stream,
> >>		},
> >>	};
> >>
> >> +	if (stream->engine->class != RENDER_CLASS)
> >> +		return 0;
> >
> > OK, this is for render, nothing equivalent needed for media?
>
> Media engines decided not to have anything configured in the CS contexts,
> rather everything is saved/restored in power context transitions, so
> nothing to be done here.

Great!

>
> >
> >> +
> >>	return oa_configure_all_contexts(stream,
> >>					 regs, ARRAY_SIZE(regs),
> >>					 active);
> >> @@ -2894,7 +2946,7 @@ gen12_enable_metric_set(struct i915_perf_stream *stream,
> >>				   _MASKED_BIT_ENABLE(GEN12_DISABLE_DOP_GATING));
> >>	}
> >>
> >> -	intel_uncore_write(uncore, GEN12_OAG_OA_DEBUG,
> >> +	intel_uncore_write(uncore, __oa_regs(stream)->oa_debug,
> >>			   /* Disable clk ratio reports, like previous Gens. */
> >>			   _MASKED_BIT_ENABLE(GEN12_OAG_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS |
> >>					      GEN12_OAG_OA_DEBUG_INCLUDE_CLK_RATIO) |
> >> @@ -2904,7 +2956,7 @@ gen12_enable_metric_set(struct i915_perf_stream *stream,
> >>			    */
> >>			   oag_report_ctx_switches(stream));
> >>
> >> -	intel_uncore_write(uncore, GEN12_OAG_OAGLBCTXCTRL, periodic ?
> >> +	intel_uncore_write(uncore, __oa_regs(stream)->oa_ctx_ctrl, periodic ?
> >>			   (GEN12_OAG_OAGLBCTXCTRL_COUNTER_RESUME |
> >>			    GEN12_OAG_OAGLBCTXCTRL_TIMER_ENABLE |
> >>			    (period_exponent << GEN12_OAG_OAGLBCTXCTRL_TIMER_PERIOD_SHIFT))
> >> @@ -3058,8 +3110,8 @@ static void gen8_oa_enable(struct i915_perf_stream *stream)
> >>
> >>  static void gen12_oa_enable(struct i915_perf_stream *stream)
> >>  {
> >> -	struct intel_uncore *uncore = stream->uncore;
> >> -	u32 report_format = stream->oa_buffer.format->format;
> >> +	const struct i915_perf_regs *regs;
> >> +	u32 val;
> >>
> >>	/*
> >>	 * If we don't want OA reports from the OA buffer, then we don't even
> >> @@ -3070,9 +3122,11 @@ static void gen12_oa_enable(struct i915_perf_stream *stream)
> >>
> >>	gen12_init_oa_buffer(stream);
> >>
> >> -	intel_uncore_write(uncore, GEN12_OAG_OACONTROL,
> >> -			   (report_format << GEN12_OAG_OACONTROL_OA_COUNTER_FORMAT_SHIFT) |
> >> -			   GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE);
> >> +	regs = __oa_regs(stream);
> >> +	val = (stream->oa_buffer.format->format << regs->oa_ctrl_counter_format_shift) |
> >> +	      GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE;
> >> +
> >> +	intel_uncore_write(stream->uncore, regs->oa_ctrl, val);
> >>  }
> >>
> >>  /**
> >> @@ -3124,9 +3178,9 @@ static void gen12_oa_disable(struct i915_perf_stream *stream)
> >>  {
> >>	struct intel_uncore *uncore = stream->uncore;
> >>
> >> -	intel_uncore_write(uncore, GEN12_OAG_OACONTROL, 0);
> >> +	intel_uncore_write(uncore, __oa_regs(stream)->oa_ctrl, 0);
> >>	if (intel_wait_for_register(uncore,
> >> -				    GEN12_OAG_OACONTROL,
> >> +				    __oa_regs(stream)->oa_ctrl,
> >>				    GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE, 0,
> >>				    50))
> >>		drm_err(&stream->perf->i915->drm,
> >> @@ -3329,6 +3383,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
> >>
> >>	stream->sample_size = sizeof(struct drm_i915_perf_record_header);
> >>
> >> +	stream->oa_buffer.group = g;
> >
> > Should just use stream->engine->oa_group, see near the bottom.
> >
> >>	stream->oa_buffer.format = &perf->oa_formats[props->oa_format];
> >>	if (drm_WARN_ON(&i915->drm, stream->oa_buffer.format->size == 0))
> >>		return -EINVAL;
> >> @@ -3379,7 +3434,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
> >>	 *   references will effectively disable RC6.
> >>	 */
> >>	intel_engine_pm_get(stream->engine);
> >> -	intel_uncore_forcewake_get(stream->uncore, FORCEWAKE_ALL);
> >> +	intel_uncore_forcewake_get(stream->uncore, g->fw_domains);
> >>
> >>	/*
> >>	 * Wa_16011777198:dg2: GuC resets render as part of the Wa. This causes
> >> @@ -3440,7 +3495,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
> >>		intel_guc_slpc_unset_gucrc_mode(&gt->uc.guc.slpc);
> >>
> >>  err_gucrc:
> >> -	intel_uncore_forcewake_put(stream->uncore, FORCEWAKE_ALL);
> >> +	intel_uncore_forcewake_put(stream->uncore, g->fw_domains);
> >>	intel_engine_pm_put(stream->engine);
> >>
> >>	free_oa_configs(stream);
> >> @@ -4033,6 +4088,7 @@ static int read_properties_unlocked(struct i915_perf *perf,
> >>				    struct perf_open_properties *props)
> >>  {
> >>	struct drm_i915_gem_context_param_sseu user_sseu;
> >> +	const struct i915_oa_format *f;
> >>	u64 __user *uprop = uprops;
> >>	bool config_sseu = false;
> >>	u8 class, instance;
> >> @@ -4203,6 +4259,17 @@ static int read_properties_unlocked(struct i915_perf *perf,
> >>	if (!engine_supports_oa(props->engine))
> >>		return -EINVAL;
> >>
> >> +	if (!oa_unit_functional(props->engine))
> >> +		return -ENODEV;
> >> +
> >> +	i = array_index_nospec(props->oa_format, I915_OA_FORMAT_MAX);
> >
> > Why do we need this (something to do with speculation)? Can just do
> > '&perf->oa_formats[props->oa_format]' below? The format passed in has
> > already been checked in the switch statement above.
>
> Traced it back to "smatch cleanups" commit in rebase history. Something to
> do with static analysis. If not a major concern, I would leave it as is.

OK, though that internal commit doesn't show the warning for i915_perf.c :/

Though just found this online: https://lwn.net/Articles/752408/

>
> >
> >> +	f = &perf->oa_formats[i];
> >> +	if (!engine_class_supports_oa_format(props->engine, f->type)) {
> >> +		DRM_DEBUG("Invalid OA format %d for class %d\n",
> >> +			  f->type, props->engine->class);
> >> +		return -EINVAL;
> >> +	}
> >> +
> >>	if (config_sseu) {
> >>		ret = get_sseu_config(&props->sseu, props->engine, &user_sseu);
> >>		if (ret) {
> >> @@ -4383,6 +4450,14 @@ static const struct i915_range gen12_oa_b_counters[] = {
> >>	{}
> >>  };
> >>
> >> +static const struct i915_range mtl_oam_b_counters[] = {
> >> +	{ .start = 0x393000, .end = 0x39301c },	/* GEN12_OAM_STARTTRIG1[1-8] */
> >> +	{ .start = 0x393020, .end = 0x39303c },	/* GEN12_OAM_REPORTTRIG1[1-8] */
> >> +	{ .start = 0x393040, .end = 0x39307c },	/* GEN12_OAM_CEC[0-7][0-1] */
> >> +	{ .start = 0x393200, .end = 0x39323C },	/* MPES[0-7] */
> >> +	{}
> >> +};
> >> +
> >>  static const struct i915_range xehp_oa_b_counters[] = {
> >>	{ .start = 0xdc48, .end = 0xdc48 },	/* OAA_ENABLE_REG */
> >>	{ .start = 0xdd00, .end = 0xdd48 },	/* OAG_LCE0_0 - OAA_LENABLE_REG */
> >> @@ -4429,13 +4504,16 @@ static const struct i915_range gen12_oa_mux_regs[] = {
> >>
> >>  /*
> >>   * Ref: 14010536224:
> >> - * 0x20cc is repurposed on MTL, so use a separate array for MTL.
> >> + * 0x20cc is repurposed on MTL, so use a separate array for MTL. Also add the
> >> + * MPES/MPEC registers.
> >
> > MPES/MPEC registers are added above now, not here so maybe get rid of the
> > comment change above?
> >
> >>   */
> >>  static const struct i915_range mtl_oa_mux_regs[] = {
> >>	{ .start = 0x0d00, .end = 0x0d04 },	/* RPM_CONFIG[0-1] */
> >>	{ .start = 0x0d0c, .end = 0x0d2c },	/* NOA_CONFIG[0-8] */
> >>	{ .start = 0x9840, .end = 0x9840 },	/* GDT_CHICKEN_BITS */
> >>	{ .start = 0x9884, .end = 0x9888 },	/* NOA_WRITE */
> >> +	{ .start = 0x38d100, .end = 0x38d114},	/* VISACTL */
> >> +	{}
> >>  };
> >>
> >>  static bool gen7_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
> >> @@ -4473,10 +4551,26 @@ static bool gen12_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
> >>	return reg_in_range_table(addr, gen12_oa_b_counters);
> >>  }
> >>
> >> +static bool xehp_is_valid_oam_b_counter_addr(struct i915_perf *perf, u32 addr)
> >> +{
> >> +	enum intel_platform platform = INTEL_INFO(perf->i915)->platform;
> >> +
> >> +	if (!HAS_OAM(perf->i915))
> >> +		return false;
> >> +
> >> +	switch (platform) {
> >> +	case INTEL_METEORLAKE:
> >> +		return reg_in_range_table(addr, mtl_oam_b_counters);
> >> +	default:
> >> +		return false;
> >> +	}
> >
> > Replace with 'if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 70))', registers
> > are identical in later platforms too.
> >
> > Should the function prefix be xehp or mtl? Don't see xehp in bspec,
> > probably xehp is discontinued.
> >
> >> +}
> >> +
> >>  static bool xehp_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
> >>  {
> >>	return reg_in_range_table(addr, xehp_oa_b_counters) ||
> >> -		reg_in_range_table(addr, gen12_oa_b_counters);
> >> +		reg_in_range_table(addr, gen12_oa_b_counters) ||
> >> +		xehp_is_valid_oam_b_counter_addr(perf, addr);
> >>  }
> >>
> >>  static bool gen12_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
> >> @@ -4846,11 +4940,39 @@ static u32 __num_perf_groups_per_gt(struct intel_gt *gt)
> >>	enum intel_platform platform = INTEL_INFO(gt->i915)->platform;
> >>
> >>	switch (platform) {
> >> +	case INTEL_METEORLAKE:
> >> +		return 1;
> >
> > I don't think we need this, as proposed previously maybe the function
> > should just unconditionally return 1.
> >
> >>	default:
> >>		return 1;
> >>	}
> >>  }
> >>
> >> +static u32 __oam_engine_group(struct intel_engine_cs *engine)
> >> +{
> >> +	enum intel_platform platform = INTEL_INFO(engine->i915)->platform;
> >> +	struct intel_gt *gt = engine->gt;
> >> +	u32 group = PERF_GROUP_INVALID;
> >> +
> >> +	switch (platform) {
> >> +	case INTEL_METEORLAKE:
> >
> > Replace here with 'if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 70))'.
> >
> >> +		/*
> >> +		 * There's 1 SAMEDIA gt and 1 OAM per SAMEDIA gt. All media slices
> >> +		 * within the gt use the same OAM. All MTL SKUs list 1 SA MEDIA.
> >> +		 */
> >> +		drm_WARN_ON(&engine->i915->drm,
> >> +			    engine->gt->type != GT_MEDIA);
> >> +
> >> +		group = PERF_GROUP_OAM_SAMEDIA_0;
> >> +		break;
> >> +	default:
> >> +		break;
> >> +	}
> >> +
> >> +	drm_WARN_ON(&gt->i915->drm, group >= __num_perf_groups_per_gt(gt));
> >> +
> >> +	return group;
> >> +}
> >> +
> >>  static u32 __oa_engine_group(struct intel_engine_cs *engine)
> >>  {
> >>	if (!engine_supports_oa(engine))
> >
> > As mentioned above for engine_supports_oa, this looks like a duplication of
> > the checks below and should probably be removed.
>
> I recall I added this because on older platforms, we were seeing a warnON
> splat from __oam_engine_group because those platforms had a media
> engine. Ideally, I would implement all your inputs in this patch and drop
> the warnON from __oam_engine_group. Is that okay?

OK you are referring to this warn_on:

	drm_WARN_ON(&gt->i915->drm, group >= __num_perf_groups_per_gt(gt));

Since we are supporting OAM for >= 12.70 we can put the warn_on under the
>= 12.70 check, but if it looks weird because we have just set the group to
PERF_GROUP_OAM_SAMEDIA_0, we could just get rid of the warn_on.

>
> >
> >> @@ -4860,11 +4982,58 @@ static u32 __oa_engine_group(struct intel_engine_cs *engine)
> >>	case RENDER_CLASS:
> >>		return PERF_GROUP_OAG;
> >>
> >> +	case VIDEO_DECODE_CLASS:
> >> +	case VIDEO_ENHANCEMENT_CLASS:
> >> +		return __oam_engine_group(engine);
> >> +
> >>	default:
> >>		return PERF_GROUP_INVALID;
> >>	}
> >>  }
> >>
> >> +static struct i915_perf_regs __oam_regs(u32 base)
> >> +{
> >> +	return (struct i915_perf_regs) {
> >> +		base,
> >> +		GEN12_OAM_HEAD_POINTER(base),
> >> +		GEN12_OAM_TAIL_POINTER(base),
> >> +		GEN12_OAM_BUFFER(base),
> >> +		GEN12_OAM_CONTEXT_CONTROL(base),
> >> +		GEN12_OAM_CONTROL(base),
> >> +		GEN12_OAM_DEBUG(base),
> >> +		GEN12_OAM_STATUS(base),
> >> +		GEN12_OAM_CONTROL_COUNTER_FORMAT_SHIFT,
> >> +	};
> >> +}
> >> +
> >> +static struct i915_perf_regs __oag_regs(void)
> >> +{
> >> +	return (struct i915_perf_regs) {
> >> +		0,
> >> +		GEN12_OAG_OAHEADPTR,
> >> +		GEN12_OAG_OATAILPTR,
> >> +		GEN12_OAG_OABUFFER,
> >> +		GEN12_OAG_OAGLBCTXCTRL,
> >> +		GEN12_OAG_OACONTROL,
> >> +		GEN12_OAG_OA_DEBUG,
> >> +		GEN12_OAG_OASTATUS,
> >> +		GEN12_OAG_OACONTROL_OA_COUNTER_FORMAT_SHIFT,
> >> +	};
> >> +}
> >> +
> >> +static void oa_init_regs(struct intel_gt *gt, u32 id)
> >> +{
> >> +	struct i915_perf_group *group = &gt->perf.group[id];
> >> +	struct i915_perf_regs *regs = &group->regs;
> >> +
> >> +	if (id == PERF_GROUP_OAG && gt->type != GT_MEDIA)
> >> +		*regs = __oag_regs();
> >> +	else if (IS_METEORLAKE(gt->i915))
> >
> > Replace with 'if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 70))', OAM
> > registers are identical in later platforms too. Maybe get rid of drm_WARN
> > below?
> >
> >> +		*regs = __oam_regs(mtl_oa_base[id]);
> >> +	else
> >> +		drm_WARN(&gt->i915->drm, 1, "Unsupported platform for OA\n");
> >> +}
> >> +
> >>  static void oa_init_groups(struct intel_gt *gt)
> >>  {
> >>	int i, num_groups = gt->perf.num_perf_groups;
> >> @@ -4881,6 +5050,24 @@ static void oa_init_groups(struct intel_gt *gt)
> >>		g->oa_unit_id = perf->oa_unit_ids++;
> >>
> >>		g->gt = gt;
> >> +		oa_init_regs(gt, i);
> >> +		g->fw_domains = FORCEWAKE_ALL;
> >> +		if (i == PERF_GROUP_OAG) {
> >> +			g->type = TYPE_OAG;
> >> +
> >> +			/*
> >> +			 * Enabling all fw domains for OAG caps the max GT
> >> +			 * frequency to media FF max. This could be less than
> >> +			 * what the user sets through the sysfs and perf
> >> +			 * measurements could be skewed. Since some platforms
> >> +			 * have separate OAM units to measure media perf, do not
> >> +			 * enable media fw domains for OAG.
> >> +			 */
> >> +			if (HAS_OAM(gt->i915))
> >> +				g->fw_domains = FORCEWAKE_GT | FORCEWAKE_RENDER;
> >
> > Is this needed even when media and render are separate tiles, which is the
> > only case we have in this code right now? For separate tiles setting
> > FORCEWAKE_ALL should not cap the freq, correct?
> >
> > If not needed we can get rid of g->fw_domains.
> >
> >> +		} else {
> >> +			g->type = TYPE_OAM;
> >
> > This is wrong, because num_perf_groups is 1. So the type should be assigned
> > not based on i (which is always 0) but maybe similar to what is done in
> > oa_init_regs above.
>
> That's a bug. Will fix. It was working as expected for some other platforms
> that had OAM, but with PERF_GROUP_OAM_SAMEDIA_0, it broke.

Thanks.
--
Ashutosh
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 0393273faa09..f3cacbf41c86 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -856,6 +856,8 @@  IS_SUBPLATFORM(const struct drm_i915_private *i915,
 	(INTEL_INFO(dev_priv)->has_oa_bpc_reporting)
 #define HAS_OA_SLICE_CONTRIB_LIMITS(dev_priv) \
 	(INTEL_INFO(dev_priv)->has_oa_slice_contrib_limits)
+#define HAS_OAM(dev_priv) \
+	(INTEL_INFO(dev_priv)->has_oam)
 
 /*
  * Set this flag, when platform requires 64K GTT page sizes or larger for
diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
index a8d942b16223..621730b6551c 100644
--- a/drivers/gpu/drm/i915/i915_pci.c
+++ b/drivers/gpu/drm/i915/i915_pci.c
@@ -1028,6 +1028,7 @@  static const struct intel_device_info adl_p_info = {
 	.has_mslice_steering = 1, \
 	.has_oa_bpc_reporting = 1, \
 	.has_oa_slice_contrib_limits = 1, \
+	.has_oam = 1, \
 	.has_rc6 = 1, \
 	.has_reset_engine = 1, \
 	.has_rps = 1, \
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index f028df812067..a57690f4c531 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -192,6 +192,7 @@ 
  */
 
 #include <linux/anon_inodes.h>
+#include <linux/nospec.h>
 #include <linux/sizes.h>
 #include <linux/uuid.h>
 
@@ -326,6 +327,13 @@  static const struct i915_oa_format oa_formats[I915_OA_FORMAT_MAX] = {
 	[I915_OA_FORMAT_A32u40_A4u32_B8_C8] = { 5, 256 },
 	[I915_OAR_FORMAT_A32u40_A4u32_B8_C8]    = { 5, 256 },
 	[I915_OA_FORMAT_A24u40_A14u32_B8_C8]    = { 5, 256 },
+	[I915_OAM_FORMAT_MPEC8u64_B8_C8]	= { 1, 192, TYPE_OAM, HDR_64_BIT },
+	[I915_OAM_FORMAT_MPEC8u32_B8_C8]	= { 2, 128, TYPE_OAM, HDR_64_BIT },
+};
+
+/* PERF_GROUP_OAG is unused for oa_base, drop it for mtl */
+static const u32 mtl_oa_base[] = {
+	[PERF_GROUP_OAM_SAMEDIA_0] = 0x393000,
 };
 
 #define SAMPLE_OA_REPORT      (1<<0)
@@ -418,11 +426,17 @@  static void free_oa_config_bo(struct i915_oa_config_bo *oa_bo)
 	kfree(oa_bo);
 }
 
+static inline const
+struct i915_perf_regs *__oa_regs(struct i915_perf_stream *stream)
+{
+	return &stream->oa_buffer.group->regs;
+}
+
 static u32 gen12_oa_hw_tail_read(struct i915_perf_stream *stream)
 {
 	struct intel_uncore *uncore = stream->uncore;
 
-	return intel_uncore_read(uncore, GEN12_OAG_OATAILPTR) &
+	return intel_uncore_read(uncore, __oa_regs(stream)->oa_tail_ptr) &
 	       GEN12_OAG_OATAILPTR_MASK;
 }
 
@@ -886,7 +900,8 @@  static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 		i915_reg_t oaheadptr;
 
 		oaheadptr = GRAPHICS_VER(stream->perf->i915) == 12 ?
-			    GEN12_OAG_OAHEADPTR : GEN8_OAHEADPTR;
+			    __oa_regs(stream)->oa_head_ptr :
+			    GEN8_OAHEADPTR;
 
 		spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
 
@@ -939,7 +954,8 @@  static int gen8_oa_read(struct i915_perf_stream *stream,
 		return -EIO;
 
 	oastatus_reg = GRAPHICS_VER(stream->perf->i915) == 12 ?
-		       GEN12_OAG_OASTATUS : GEN8_OASTATUS;
+		       __oa_regs(stream)->oa_status :
+		       GEN8_OASTATUS;
 
 	oastatus = intel_uncore_read(uncore, oastatus_reg);
 
@@ -1643,16 +1659,46 @@  free_noa_wait(struct i915_perf_stream *stream)
 	i915_vma_unpin_and_release(&stream->noa_wait, 0);
 }
 
+/*
+ * intel_engine_lookup_user ensures that most of engine specific checks are
+ * taken care of, however, we can run into a case where the OA unit catering to
+ * the engine passed by the user is disabled for some reason. In such cases,
+ * ensure oa unit corresponding to an engine is functional. If there are no
+ * engines in the group, the unit is disabled.
+ */
+static bool oa_unit_functional(const struct intel_engine_cs *engine)
+{
+	return engine->oa_group && engine->oa_group->num_engines;
+}
+
 static bool engine_supports_oa(const struct intel_engine_cs *engine)
 {
 	enum intel_platform platform = INTEL_INFO(engine->i915)->platform;
 
 	switch (platform) {
+	case INTEL_METEORLAKE:
+		return engine->class == RENDER_CLASS ||
+		       ((engine->class == VIDEO_DECODE_CLASS ||
+			 engine->class == VIDEO_ENHANCEMENT_CLASS) &&
+			engine->gt->type == GT_MEDIA);
 	default:
 		return engine->class == RENDER_CLASS;
 	}
 }
 
+static bool engine_class_supports_oa_format(struct intel_engine_cs *engine, int type)
+{
+	switch (engine->class) {
+	case RENDER_CLASS:
+		return type == TYPE_OAG;
+	case VIDEO_DECODE_CLASS:
+	case VIDEO_ENHANCEMENT_CLASS:
+		return type == TYPE_OAM;
+	default:
+		return false;
+	}
+}
+
 static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
 {
 	struct i915_perf *perf = stream->perf;
@@ -1680,7 +1726,7 @@  static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
 		drm_WARN_ON(&gt->i915->drm,
 			    intel_guc_slpc_unset_gucrc_mode(&gt->uc.guc.slpc));
 
-	intel_uncore_forcewake_put(stream->uncore, FORCEWAKE_ALL);
+	intel_uncore_forcewake_put(stream->uncore, g->fw_domains);
 	intel_engine_pm_put(stream->engine);
 
 	if (stream->ctx)
@@ -1804,8 +1850,8 @@  static void gen12_init_oa_buffer(struct i915_perf_stream *stream)
 
 	spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
 
-	intel_uncore_write(uncore, GEN12_OAG_OASTATUS, 0);
-	intel_uncore_write(uncore, GEN12_OAG_OAHEADPTR,
+	intel_uncore_write(uncore, __oa_regs(stream)->oa_status, 0);
+	intel_uncore_write(uncore, __oa_regs(stream)->oa_head_ptr,
 			   gtt_offset & GEN12_OAG_OAHEADPTR_MASK);
 	stream->oa_buffer.head = gtt_offset;
 
@@ -1817,9 +1863,9 @@  static void gen12_init_oa_buffer(struct i915_perf_stream *stream)
 	 *  to enable proper functionality of the overflow
 	 *  bit."
 	 */
-	intel_uncore_write(uncore, GEN12_OAG_OABUFFER, gtt_offset |
+	intel_uncore_write(uncore, __oa_regs(stream)->oa_buffer, gtt_offset |
 			   OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT);
-	intel_uncore_write(uncore, GEN12_OAG_OATAILPTR,
+	intel_uncore_write(uncore, __oa_regs(stream)->oa_tail_ptr,
 			   gtt_offset & GEN12_OAG_OATAILPTR_MASK);
 
 	/* Mark that we need updated tail pointers to read from... */
@@ -2579,7 +2625,8 @@  gen8_modify_self(struct intel_context *ce,
 	return err;
 }
 
-static int gen8_configure_context(struct i915_gem_context *ctx,
+static int gen8_configure_context(struct i915_perf_stream *stream,
+				  struct i915_gem_context *ctx,
 				  struct flex *flex, unsigned int count)
 {
 	struct i915_gem_engines_iter it;
@@ -2589,7 +2636,8 @@  static int gen8_configure_context(struct i915_gem_context *ctx,
 	for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it) {
 		GEM_BUG_ON(ce == ce->engine->kernel_context);
 
-		if (!engine_supports_oa(ce->engine))
+		if (!engine_supports_oa(ce->engine) ||
+		    ce->engine->class != stream->engine->class)
 			continue;
 
 		/* Otherwise OA settings will be set upon first use */
@@ -2720,7 +2768,7 @@  oa_configure_all_contexts(struct i915_perf_stream *stream,
 
 		spin_unlock(&i915->gem.contexts.lock);
 
-		err = gen8_configure_context(ctx, regs, num_regs);
+		err = gen8_configure_context(stream, ctx, regs, num_regs);
 		if (err) {
 			i915_gem_context_put(ctx);
 			return err;
@@ -2740,7 +2788,8 @@  oa_configure_all_contexts(struct i915_perf_stream *stream,
 	for_each_uabi_engine(engine, i915) {
 		struct intel_context *ce = engine->kernel_context;
 
-		if (!engine_supports_oa(ce->engine))
+		if (!engine_supports_oa(ce->engine) ||
+		    ce->engine->class != stream->engine->class)
 			continue;
 
 		regs[0].value = intel_sseu_make_rpcs(engine->gt, &ce->sseu);
@@ -2765,6 +2814,9 @@  gen12_configure_all_contexts(struct i915_perf_stream *stream,
 		},
 	};
 
+	if (stream->engine->class != RENDER_CLASS)
+		return 0;
+
 	return oa_configure_all_contexts(stream,
 					 regs, ARRAY_SIZE(regs),
 					 active);
@@ -2894,7 +2946,7 @@  gen12_enable_metric_set(struct i915_perf_stream *stream,
 				   _MASKED_BIT_ENABLE(GEN12_DISABLE_DOP_GATING));
 	}
 
-	intel_uncore_write(uncore, GEN12_OAG_OA_DEBUG,
+	intel_uncore_write(uncore, __oa_regs(stream)->oa_debug,
 			   /* Disable clk ratio reports, like previous Gens. */
 			   _MASKED_BIT_ENABLE(GEN12_OAG_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS |
 					      GEN12_OAG_OA_DEBUG_INCLUDE_CLK_RATIO) |
@@ -2904,7 +2956,7 @@  gen12_enable_metric_set(struct i915_perf_stream *stream,
 			    */
 			   oag_report_ctx_switches(stream));
 
-	intel_uncore_write(uncore, GEN12_OAG_OAGLBCTXCTRL, periodic ?
+	intel_uncore_write(uncore, __oa_regs(stream)->oa_ctx_ctrl, periodic ?
 			   (GEN12_OAG_OAGLBCTXCTRL_COUNTER_RESUME |
 			    GEN12_OAG_OAGLBCTXCTRL_TIMER_ENABLE |
 			    (period_exponent << GEN12_OAG_OAGLBCTXCTRL_TIMER_PERIOD_SHIFT))
@@ -3058,8 +3110,8 @@  static void gen8_oa_enable(struct i915_perf_stream *stream)
 
 static void gen12_oa_enable(struct i915_perf_stream *stream)
 {
-	struct intel_uncore *uncore = stream->uncore;
-	u32 report_format = stream->oa_buffer.format->format;
+	const struct i915_perf_regs *regs;
+	u32 val;
 
 	/*
 	 * If we don't want OA reports from the OA buffer, then we don't even
@@ -3070,9 +3122,11 @@  static void gen12_oa_enable(struct i915_perf_stream *stream)
 
 	gen12_init_oa_buffer(stream);
 
-	intel_uncore_write(uncore, GEN12_OAG_OACONTROL,
-			   (report_format << GEN12_OAG_OACONTROL_OA_COUNTER_FORMAT_SHIFT) |
-			   GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE);
+	regs = __oa_regs(stream);
+	val = (stream->oa_buffer.format->format << regs->oa_ctrl_counter_format_shift) |
+	      GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE;
+
+	intel_uncore_write(stream->uncore, regs->oa_ctrl, val);
 }
 
 /**
@@ -3124,9 +3178,9 @@  static void gen12_oa_disable(struct i915_perf_stream *stream)
 {
 	struct intel_uncore *uncore = stream->uncore;
 
-	intel_uncore_write(uncore, GEN12_OAG_OACONTROL, 0);
+	intel_uncore_write(uncore, __oa_regs(stream)->oa_ctrl, 0);
 	if (intel_wait_for_register(uncore,
-				    GEN12_OAG_OACONTROL,
+				    __oa_regs(stream)->oa_ctrl,
 				    GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE, 0,
 				    50))
 		drm_err(&stream->perf->i915->drm,
@@ -3329,6 +3383,7 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 
 	stream->sample_size = sizeof(struct drm_i915_perf_record_header);
 
+	stream->oa_buffer.group = g;
 	stream->oa_buffer.format = &perf->oa_formats[props->oa_format];
 	if (drm_WARN_ON(&i915->drm, stream->oa_buffer.format->size == 0))
 		return -EINVAL;
@@ -3379,7 +3434,7 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 	 *   references will effectively disable RC6.
 	 */
 	intel_engine_pm_get(stream->engine);
-	intel_uncore_forcewake_get(stream->uncore, FORCEWAKE_ALL);
+	intel_uncore_forcewake_get(stream->uncore, g->fw_domains);
 
 	/*
 	 * Wa_16011777198:dg2: GuC resets render as part of the Wa. This causes
@@ -3440,7 +3495,7 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 		intel_guc_slpc_unset_gucrc_mode(&gt->uc.guc.slpc);
 
 err_gucrc:
-	intel_uncore_forcewake_put(stream->uncore, FORCEWAKE_ALL);
+	intel_uncore_forcewake_put(stream->uncore, g->fw_domains);
 	intel_engine_pm_put(stream->engine);
 
 	free_oa_configs(stream);
@@ -4033,6 +4088,7 @@  static int read_properties_unlocked(struct i915_perf *perf,
 				    struct perf_open_properties *props)
 {
 	struct drm_i915_gem_context_param_sseu user_sseu;
+	const struct i915_oa_format *f;
 	u64 __user *uprop = uprops;
 	bool config_sseu = false;
 	u8 class, instance;
@@ -4203,6 +4259,17 @@  static int read_properties_unlocked(struct i915_perf *perf,
 	if (!engine_supports_oa(props->engine))
 		return -EINVAL;
 
+	if (!oa_unit_functional(props->engine))
+		return -ENODEV;
+
+	i = array_index_nospec(props->oa_format, I915_OA_FORMAT_MAX);
+	f = &perf->oa_formats[i];
+	if (!engine_class_supports_oa_format(props->engine, f->type)) {
+		DRM_DEBUG("Invalid OA format %d for class %d\n",
+			  f->type, props->engine->class);
+		return -EINVAL;
+	}
+
 	if (config_sseu) {
 		ret = get_sseu_config(&props->sseu, props->engine, &user_sseu);
 		if (ret) {
@@ -4383,6 +4450,14 @@  static const struct i915_range gen12_oa_b_counters[] = {
 	{}
 };
 
+static const struct i915_range mtl_oam_b_counters[] = {
+	{ .start = 0x393000, .end = 0x39301c },	/* GEN12_OAM_STARTTRIG1[1-8] */
+	{ .start = 0x393020, .end = 0x39303c },	/* GEN12_OAM_REPORTTRIG1[1-8] */
+	{ .start = 0x393040, .end = 0x39307c },	/* GEN12_OAM_CEC[0-7][0-1] */
+	{ .start = 0x393200, .end = 0x39323C },	/* MPES[0-7] */
+	{}
+};
+
 static const struct i915_range xehp_oa_b_counters[] = {
 	{ .start = 0xdc48, .end = 0xdc48 },	/* OAA_ENABLE_REG */
 	{ .start = 0xdd00, .end = 0xdd48 },	/* OAG_LCE0_0 - OAA_LENABLE_REG */
@@ -4429,13 +4504,16 @@  static const struct i915_range gen12_oa_mux_regs[] = {
 
 /*
  * Ref: 14010536224:
- * 0x20cc is repurposed on MTL, so use a separate array for MTL.
+ * 0x20cc is repurposed on MTL, so use a separate array for MTL. Also add the
+ * MPES/MPEC registers.
  */
 static const struct i915_range mtl_oa_mux_regs[] = {
 	{ .start = 0x0d00, .end = 0x0d04 },	/* RPM_CONFIG[0-1] */
 	{ .start = 0x0d0c, .end = 0x0d2c },	/* NOA_CONFIG[0-8] */
 	{ .start = 0x9840, .end = 0x9840 },	/* GDT_CHICKEN_BITS */
 	{ .start = 0x9884, .end = 0x9888 },	/* NOA_WRITE */
+	{ .start = 0x38d100, .end = 0x38d114},	/* VISACTL */
+	{}
 };
 
 static bool gen7_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
@@ -4473,10 +4551,26 @@  static bool gen12_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
 	return reg_in_range_table(addr, gen12_oa_b_counters);
 }
 
+static bool xehp_is_valid_oam_b_counter_addr(struct i915_perf *perf, u32 addr)
+{
+	enum intel_platform platform = INTEL_INFO(perf->i915)->platform;
+
+	if (!HAS_OAM(perf->i915))
+		return false;
+
+	switch (platform) {
+	case INTEL_METEORLAKE:
+		return reg_in_range_table(addr, mtl_oam_b_counters);
+	default:
+		return false;
+	}
+}
+
 static bool xehp_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
 {
 	return reg_in_range_table(addr, xehp_oa_b_counters) ||
-		reg_in_range_table(addr, gen12_oa_b_counters);
+		reg_in_range_table(addr, gen12_oa_b_counters) ||
+		xehp_is_valid_oam_b_counter_addr(perf, addr);
 }
 
 static bool gen12_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
@@ -4846,11 +4940,39 @@  static u32 __num_perf_groups_per_gt(struct intel_gt *gt)
 	enum intel_platform platform = INTEL_INFO(gt->i915)->platform;
 
 	switch (platform) {
+	case INTEL_METEORLAKE:
+		return 1;
 	default:
 		return 1;
 	}
 }
 
+static u32 __oam_engine_group(struct intel_engine_cs *engine)
+{
+	enum intel_platform platform = INTEL_INFO(engine->i915)->platform;
+	struct intel_gt *gt = engine->gt;
+	u32 group = PERF_GROUP_INVALID;
+
+	switch (platform) {
+	case INTEL_METEORLAKE:
+		/*
+		 * There's 1 SAMEDIA gt and 1 OAM per SAMEDIA gt. All media slices
+		 * within the gt use the same OAM. All MTL SKUs list 1 SA MEDIA.
+		 */
+		drm_WARN_ON(&engine->i915->drm,
+			    engine->gt->type != GT_MEDIA);
+
+		group = PERF_GROUP_OAM_SAMEDIA_0;
+		break;
+	default:
+		break;
+	}
+
+	drm_WARN_ON(&gt->i915->drm, group >= __num_perf_groups_per_gt(gt));
+
+	return group;
+}
+
 static u32 __oa_engine_group(struct intel_engine_cs *engine)
 {
 	if (!engine_supports_oa(engine))
@@ -4860,11 +4982,58 @@  static u32 __oa_engine_group(struct intel_engine_cs *engine)
 	case RENDER_CLASS:
 		return PERF_GROUP_OAG;
 
+	case VIDEO_DECODE_CLASS:
+	case VIDEO_ENHANCEMENT_CLASS:
+		return __oam_engine_group(engine);
+
 	default:
 		return PERF_GROUP_INVALID;
 	}
 }
 
+static struct i915_perf_regs __oam_regs(u32 base)
+{
+	return (struct i915_perf_regs) {
+		base,
+		GEN12_OAM_HEAD_POINTER(base),
+		GEN12_OAM_TAIL_POINTER(base),
+		GEN12_OAM_BUFFER(base),
+		GEN12_OAM_CONTEXT_CONTROL(base),
+		GEN12_OAM_CONTROL(base),
+		GEN12_OAM_DEBUG(base),
+		GEN12_OAM_STATUS(base),
+		GEN12_OAM_CONTROL_COUNTER_FORMAT_SHIFT,
+	};
+}
+
+static struct i915_perf_regs __oag_regs(void)
+{
+	return (struct i915_perf_regs) {
+		0,
+		GEN12_OAG_OAHEADPTR,
+		GEN12_OAG_OATAILPTR,
+		GEN12_OAG_OABUFFER,
+		GEN12_OAG_OAGLBCTXCTRL,
+		GEN12_OAG_OACONTROL,
+		GEN12_OAG_OA_DEBUG,
+		GEN12_OAG_OASTATUS,
+		GEN12_OAG_OACONTROL_OA_COUNTER_FORMAT_SHIFT,
+	};
+}
+
+static void oa_init_regs(struct intel_gt *gt, u32 id)
+{
+	struct i915_perf_group *group = &gt->perf.group[id];
+	struct i915_perf_regs *regs = &group->regs;
+
+	if (id == PERF_GROUP_OAG && gt->type != GT_MEDIA)
+		*regs = __oag_regs();
+	else if (IS_METEORLAKE(gt->i915))
+		*regs = __oam_regs(mtl_oa_base[id]);
+	else
+		drm_WARN(&gt->i915->drm, 1, "Unsupported platform for OA\n");
+}
+
 static void oa_init_groups(struct intel_gt *gt)
 {
 	int i, num_groups = gt->perf.num_perf_groups;
@@ -4881,6 +5050,24 @@  static void oa_init_groups(struct intel_gt *gt)
 		g->oa_unit_id = perf->oa_unit_ids++;
 
 		g->gt = gt;
+		oa_init_regs(gt, i);
+		g->fw_domains = FORCEWAKE_ALL;
+		if (i == PERF_GROUP_OAG) {
+			g->type = TYPE_OAG;
+
+			/*
+			 * Enabling all fw domains for OAG caps the max GT
+			 * frequency to media FF max. This could be less than
+			 * what the user sets through the sysfs and perf
+			 * measurements could be skewed. Since some platforms
+			 * have separate OAM units to measure media perf, do not
+			 * enable media fw domains for OAG.
+			 */
+			if (HAS_OAM(gt->i915))
+				g->fw_domains = FORCEWAKE_GT | FORCEWAKE_RENDER;
+		} else {
+			g->type = TYPE_OAM;
+		}
 	}
 }
 
@@ -4970,9 +5157,15 @@  static void oa_init_supported_formats(struct i915_perf *perf)
 		break;
 
 	case INTEL_DG2:
+		oa_format_add(perf, I915_OAR_FORMAT_A32u40_A4u32_B8_C8);
+		oa_format_add(perf, I915_OA_FORMAT_A24u40_A14u32_B8_C8);
+		break;
+
 	case INTEL_METEORLAKE:
 		oa_format_add(perf, I915_OAR_FORMAT_A32u40_A4u32_B8_C8);
 		oa_format_add(perf, I915_OA_FORMAT_A24u40_A14u32_B8_C8);
+		oa_format_add(perf, I915_OAM_FORMAT_MPEC8u64_B8_C8);
+		oa_format_add(perf, I915_OAM_FORMAT_MPEC8u32_B8_C8);
 		break;
 
 	default:
@@ -5217,8 +5410,10 @@  int i915_perf_ioctl_version(void)
 	 *
 	 * 6: Add DRM_I915_PERF_PROP_OA_ENGINE_CLASS and
 	 *    DRM_I915_PERF_PROP_OA_ENGINE_INSTANCE
+	 *
+	 * 7: Add support for video decode and enhancement classes.
 	 */
-	return 6;
+	return 7;
 }
 
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
diff --git a/drivers/gpu/drm/i915/i915_perf_oa_regs.h b/drivers/gpu/drm/i915/i915_perf_oa_regs.h
index 381d94101610..ba103875e19f 100644
--- a/drivers/gpu/drm/i915/i915_perf_oa_regs.h
+++ b/drivers/gpu/drm/i915/i915_perf_oa_regs.h
@@ -138,4 +138,82 @@ 
 #define   GEN12_SQCNT1_PMON_ENABLE		REG_BIT(30)
 #define   GEN12_SQCNT1_OABPC			REG_BIT(29)
 
+/* Gen12 OAM unit */
+#define GEN12_OAM_HEAD_POINTER_OFFSET   (0x1a0)
+#define  GEN12_OAM_HEAD_POINTER_MASK    0xffffffc0
+
+#define GEN12_OAM_TAIL_POINTER_OFFSET   (0x1a4)
+#define  GEN12_OAM_TAIL_POINTER_MASK    0xffffffc0
+
+#define GEN12_OAM_BUFFER_OFFSET         (0x1a8)
+#define  GEN12_OAM_BUFFER_SIZE_MASK     (0x7)
+#define  GEN12_OAM_BUFFER_SIZE_SHIFT    (3)
+#define  GEN12_OAM_BUFFER_MEMORY_SELECT REG_BIT(0) /* 0: PPGTT, 1: GGTT */
+
+#define GEN12_OAM_CONTEXT_CONTROL_OFFSET              (0x1bc)
+#define  GEN12_OAM_CONTEXT_CONTROL_TIMER_PERIOD_SHIFT 2
+#define  GEN12_OAM_CONTEXT_CONTROL_TIMER_ENABLE       REG_BIT(1)
+#define  GEN12_OAM_CONTEXT_CONTROL_COUNTER_RESUME     REG_BIT(0)
+
+#define GEN12_OAM_CONTROL_OFFSET                (0x194)
+#define  GEN12_OAM_CONTROL_COUNTER_FORMAT_SHIFT 1
+#define  GEN12_OAM_CONTROL_COUNTER_ENABLE       REG_BIT(0)
+
+#define GEN12_OAM_DEBUG_OFFSET                      (0x198)
+#define  GEN12_OAM_DEBUG_BUFFER_SIZE_SELECT         REG_BIT(12)
+#define  GEN12_OAM_DEBUG_INCLUDE_CLK_RATIO          REG_BIT(6)
+#define  GEN12_OAM_DEBUG_DISABLE_CLK_RATIO_REPORTS  REG_BIT(5)
+#define  GEN12_OAM_DEBUG_DISABLE_GO_1_0_REPORTS     REG_BIT(2)
+#define  GEN12_OAM_DEBUG_DISABLE_CTX_SWITCH_REPORTS REG_BIT(1)
+
+#define GEN12_OAM_STATUS_OFFSET            (0x19c)
+#define  GEN12_OAM_STATUS_COUNTER_OVERFLOW REG_BIT(2)
+#define  GEN12_OAM_STATUS_BUFFER_OVERFLOW  REG_BIT(1)
+#define  GEN12_OAM_STATUS_REPORT_LOST      REG_BIT(0)
+
+#define GEN12_OAM_MMIO_TRG_OFFSET	(0x1d0)
+
+#define GEN12_OAM_MMIO_TRG(base) \
+	_MMIO((base) + GEN12_OAM_MMIO_TRG_OFFSET)
+
+#define GEN12_OAM_HEAD_POINTER(base) \
+	_MMIO((base) + GEN12_OAM_HEAD_POINTER_OFFSET)
+#define GEN12_OAM_TAIL_POINTER(base) \
+	_MMIO((base) + GEN12_OAM_TAIL_POINTER_OFFSET)
+#define GEN12_OAM_BUFFER(base) \
+	_MMIO((base) + GEN12_OAM_BUFFER_OFFSET)
+#define GEN12_OAM_CONTEXT_CONTROL(base) \
+	_MMIO((base) + GEN12_OAM_CONTEXT_CONTROL_OFFSET)
+#define GEN12_OAM_CONTROL(base) \
+	_MMIO((base) + GEN12_OAM_CONTROL_OFFSET)
+#define GEN12_OAM_DEBUG(base) \
+	_MMIO((base) + GEN12_OAM_DEBUG_OFFSET)
+#define GEN12_OAM_STATUS(base) \
+	_MMIO((base) + GEN12_OAM_STATUS_OFFSET)
+
+#define GEN12_OAM_CEC0_0_OFFSET		(0x40)
+#define GEN12_OAM_CEC7_1_OFFSET		(0x7c)
+#define GEN12_OAM_CEC0_0(base) \
+	_MMIO((base) + GEN12_OAM_CEC0_0_OFFSET)
+#define GEN12_OAM_CEC7_1(base) \
+	_MMIO((base) + GEN12_OAM_CEC7_1_OFFSET)
+
+#define GEN12_OAM_STARTTRIG1_OFFSET	(0x00)
+#define GEN12_OAM_STARTTRIG8_OFFSET	(0x1c)
+#define GEN12_OAM_STARTTRIG1(base) \
+	_MMIO((base) + GEN12_OAM_STARTTRIG1_OFFSET)
+#define GEN12_OAM_STARTTRIG8(base) \
+	_MMIO((base) + GEN12_OAM_STARTTRIG8_OFFSET)
+
+#define GEN12_OAM_REPORTTRIG1_OFFSET	(0x20)
+#define GEN12_OAM_REPORTTRIG8_OFFSET	(0x3c)
+#define GEN12_OAM_REPORTTRIG1(base) \
+	_MMIO((base) + GEN12_OAM_REPORTTRIG1_OFFSET)
+#define GEN12_OAM_REPORTTRIG8(base) \
+	_MMIO((base) + GEN12_OAM_REPORTTRIG8_OFFSET)
+
+#define GEN12_OAM_PERF_COUNTER_B0_OFFSET	(0x84)
+#define GEN12_OAM_PERF_COUNTER_B(base, idx) \
+	_MMIO((base) + GEN12_OAM_PERF_COUNTER_B0_OFFSET + 4 * (idx))
+
 #endif /* __INTEL_PERF_OA_REGS__ */
diff --git a/drivers/gpu/drm/i915/i915_perf_types.h b/drivers/gpu/drm/i915/i915_perf_types.h
index 8ccb0b89d019..5b2c3bab60f8 100644
--- a/drivers/gpu/drm/i915/i915_perf_types.h
+++ b/drivers/gpu/drm/i915/i915_perf_types.h
@@ -20,6 +20,7 @@ 
 #include "gt/intel_engine_types.h"
 #include "gt/intel_sseu.h"
 #include "i915_reg_defs.h"
+#include "intel_uncore.h"
 #include "intel_wakeref.h"
 
 struct drm_i915_private;
@@ -33,6 +34,7 @@  struct intel_engine_cs;
 
 enum {
 	PERF_GROUP_OAG = 0,
+	PERF_GROUP_OAM_SAMEDIA_0 = 0,
 
 	PERF_GROUP_MAX,
 	PERF_GROUP_INVALID = U32_MAX,
@@ -43,9 +45,27 @@  enum report_header {
 	HDR_64_BIT,
 };
 
+struct i915_perf_regs {
+	u32 base;
+	i915_reg_t oa_head_ptr;
+	i915_reg_t oa_tail_ptr;
+	i915_reg_t oa_buffer;
+	i915_reg_t oa_ctx_ctrl;
+	i915_reg_t oa_ctrl;
+	i915_reg_t oa_debug;
+	i915_reg_t oa_status;
+	u32 oa_ctrl_counter_format_shift;
+};
+
+enum {
+	TYPE_OAG,
+	TYPE_OAM,
+};
+
 struct i915_oa_format {
 	u32 format;
 	int size;
+	int type;
 	enum report_header header;
 };
 
@@ -317,6 +337,11 @@  struct i915_perf_stream {
 		 * @tail: The last verified tail that can be read by userspace.
 		 */
 		u32 tail;
+
+		/**
+		 * @group: The group object for this OA buffer.
+		 */
+		struct i915_perf_group *group;
 	} oa_buffer;
 
 	/**
@@ -431,6 +456,21 @@  struct i915_perf_group {
 	 * @engine_mask: A mask of engines using a single OA buffer.
 	 */
 	intel_engine_mask_t engine_mask;
+
+	/*
+	 * @regs: OA buffer register group for programming the OA unit.
+	 */
+	struct i915_perf_regs regs;
+
+	/*
+	 * @type: Type of OA buffer, OAM, OAG etc.
+	 */
+	int type;
+
+	/*
+	 * @fw_domains: forcewake domains required for this group.
+	 */
+	enum forcewake_domains fw_domains;
 };
 
 struct i915_perf_gt {
diff --git a/drivers/gpu/drm/i915/intel_device_info.h b/drivers/gpu/drm/i915/intel_device_info.h
index 80bda653d61b..45e218327f44 100644
--- a/drivers/gpu/drm/i915/intel_device_info.h
+++ b/drivers/gpu/drm/i915/intel_device_info.h
@@ -166,6 +166,7 @@  enum intel_ppgtt_type {
 	func(has_mslice_steering); \
 	func(has_oa_bpc_reporting); \
 	func(has_oa_slice_contrib_limits); \
+	func(has_oam); \
 	func(has_one_eu_per_fuse_bit); \
 	func(has_pxp); \
 	func(has_rc6); \
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index b6922b52d85c..70bfa6530dbc 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -2676,6 +2676,10 @@  enum drm_i915_oa_format {
 	I915_OAR_FORMAT_A32u40_A4u32_B8_C8,
 	I915_OA_FORMAT_A24u40_A14u32_B8_C8,
 
+	/* MTL OAM */
+	I915_OAM_FORMAT_MPEC8u64_B8_C8,
+	I915_OAM_FORMAT_MPEC8u32_B8_C8,
+
 	I915_OA_FORMAT_MAX	    /* non-ABI */
 };