diff mbox series

[2/3] drm/i915/tgl: Add perf support on TGL

Message ID 20191014185531.62855-2-umesh.nerlige.ramappa@intel.com (mailing list archive)
State New, archived
Headers show
Series [1/3] drm/i915/perf: Add helper macros for comparing with whitelisted registers | expand

Commit Message

Umesh Nerlige Ramappa Oct. 14, 2019, 6:55 p.m. UTC
From: Lionel Landwerlin <lionel.g.landwerlin@intel.com>

The design of the OA unit has been split into several units. We now
have a global unit (OAG) and a render specific unit (OAR). This leads
to some changes on how we program things. Some details :

OAR:
  - has its own set of counter registers, they are per-context
    saved/restored
  - counters are not written to the circular OA buffer
  - a snapshot of the counters can be acquired with
    MI_RECORD_PERF_COUNT, or a single counter can be read with
    MI_STORE_REGISTER_MEM.

OAG:
  - has global counters that increment across context switches
  - counters are written into the circular OA buffer (if requested)

v2: Fix checkpatch warnings on code style (Lucas)
v3: (Umesh)
  - Update register from which tail, status and head are read
  - Update logic to sample context reports
  - Update whitelist mux and b counter regs

BSpec: 28727, 30021

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 drivers/gpu/drm/i915/Makefile         |   3 +-
 drivers/gpu/drm/i915/i915_perf.c      | 280 +++++++++++++++++++++++---
 drivers/gpu/drm/i915/i915_reg.h       | 103 ++++++++++
 drivers/gpu/drm/i915/oa/i915_oa_tgl.c | 121 +++++++++++
 drivers/gpu/drm/i915/oa/i915_oa_tgl.h |  16 ++
 5 files changed, 492 insertions(+), 31 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/oa/i915_oa_tgl.c
 create mode 100644 drivers/gpu/drm/i915/oa/i915_oa_tgl.h
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index e791d9323b51..0ec9fee58baa 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -242,7 +242,8 @@  i915-y += \
 	oa/i915_oa_cflgt2.o \
 	oa/i915_oa_cflgt3.o \
 	oa/i915_oa_cnl.o \
-	oa/i915_oa_icl.o
+	oa/i915_oa_icl.o \
+	oa/i915_oa_tgl.o
 i915-y += i915_perf.o
 
 # Post-mortem debug and GPU hang state capture
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index d54d3a41ceca..df3b6976ba5b 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -217,6 +217,7 @@ 
 #include "oa/i915_oa_cflgt3.h"
 #include "oa/i915_oa_cnl.h"
 #include "oa/i915_oa_icl.h"
+#include "oa/i915_oa_tgl.h"
 
 /* HW requires this to be a power of two, between 128k and 16M, though driver
  * is currently generally designed assuming the largest 16M size is used such
@@ -292,7 +293,8 @@  static u32 i915_perf_stream_paranoid = true;
 #define INVALID_CTX_ID 0xffffffff
 
 /* On Gen8+ automatically triggered OA reports include a 'reason' field... */
-#define OAREPORT_REASON_MASK           0x3f
+#define OAREPORT_REASON_MASK           (IS_GEN(stream->perf->i915, 12) ? \
+					0x7f : 0x3f)
 #define OAREPORT_REASON_SHIFT          19
 #define OAREPORT_REASON_TIMER          (1<<0)
 #define OAREPORT_REASON_CTX_SWITCH     (1<<3)
@@ -338,6 +340,10 @@  static const struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = {
 	[I915_OA_FORMAT_C4_B8]		    = { 7, 64 },
 };
 
+static const struct i915_oa_format gen12_oa_formats[I915_OA_FORMAT_MAX] = {
+	[I915_OA_FORMAT_A32u40_A4u32_B8_C8] = { 5, 256 },
+};
+
 #define SAMPLE_OA_REPORT      (1<<0)
 
 /**
@@ -415,6 +421,14 @@  static void free_oa_config_bo(struct i915_oa_config_bo *oa_bo)
 	kfree(oa_bo);
 }
 
+static u32 gen12_oa_hw_tail_read(struct i915_perf_stream *stream)
+{
+	struct intel_uncore *uncore = stream->uncore;
+
+	return intel_uncore_read(uncore, GEN12_OAG_OATAILPTR) &
+	       GEN12_OAG_OATAILPTR_MASK;
+}
+
 static u32 gen8_oa_hw_tail_read(struct i915_perf_stream *stream)
 {
 	struct intel_uncore *uncore = stream->uncore;
@@ -535,7 +549,7 @@  static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
 				aging_tail = hw_tail;
 			stream->oa_buffer.aging_timestamp = now;
 		} else {
-			DRM_ERROR("Ignoring spurious out of range OA buffer tail pointer = %u\n",
+			DRM_ERROR("Ignoring spurious out of range OA buffer tail pointer = %x\n",
 				  hw_tail);
 		}
 	}
@@ -754,7 +768,8 @@  static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 		 * Note: that we don't clear the valid_ctx_bit so userspace can
 		 * understand that the ID has been squashed by the kernel.
 		 */
-		if (!(report32[0] & stream->perf->gen8_valid_ctx_bit))
+		if (!(report32[0] & stream->perf->gen8_valid_ctx_bit) &&
+		    INTEL_GEN(stream->perf->i915) <= 11)
 			ctx_id = report32[2] = INVALID_CTX_ID;
 
 		/*
@@ -821,6 +836,11 @@  static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 	}
 
 	if (start_offset != *offset) {
+		i915_reg_t oaheadptr;
+
+		oaheadptr = IS_GEN(stream->perf->i915, 12) ?
+			    GEN12_OAG_OAHEADPTR : GEN8_OAHEADPTR;
+
 		spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
 
 		/*
@@ -828,9 +848,8 @@  static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 		 * relative to oa_buf_base so put back here...
 		 */
 		head += gtt_offset;
-
-		intel_uncore_write(uncore, GEN8_OAHEADPTR,
-				   head & GEN8_OAHEADPTR_MASK);
+		intel_uncore_write(uncore, oaheadptr,
+				   head & GEN12_OAG_OAHEADPTR_MASK);
 		stream->oa_buffer.head = head;
 
 		spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);
@@ -866,12 +885,16 @@  static int gen8_oa_read(struct i915_perf_stream *stream,
 {
 	struct intel_uncore *uncore = stream->uncore;
 	u32 oastatus;
+	i915_reg_t oastatus_reg;
 	int ret;
 
 	if (WARN_ON(!stream->oa_buffer.vaddr))
 		return -EIO;
 
-	oastatus = intel_uncore_read(uncore, GEN8_OASTATUS);
+	oastatus_reg = IS_GEN(stream->perf->i915, 12) ?
+		       GEN12_OAG_OASTATUS : GEN8_OASTATUS;
+
+	oastatus = intel_uncore_read(uncore, oastatus_reg);
 
 	/*
 	 * We treat OABUFFER_OVERFLOW as a significant error:
@@ -903,7 +926,7 @@  static int gen8_oa_read(struct i915_perf_stream *stream,
 		 * Note: .oa_enable() is expected to re-init the oabuffer and
 		 * reset GEN8_OASTATUS for us
 		 */
-		oastatus = intel_uncore_read(uncore, GEN8_OASTATUS);
+		oastatus = intel_uncore_read(uncore, oastatus_reg);
 	}
 
 	if (oastatus & GEN8_OASTATUS_REPORT_LOST) {
@@ -911,7 +934,7 @@  static int gen8_oa_read(struct i915_perf_stream *stream,
 				       DRM_I915_PERF_RECORD_OA_REPORT_LOST);
 		if (ret)
 			return ret;
-		intel_uncore_write(uncore, GEN8_OASTATUS,
+		intel_uncore_write(uncore, oastatus_reg,
 				   oastatus & ~GEN8_OASTATUS_REPORT_LOST);
 	}
 
@@ -1485,6 +1508,67 @@  static void gen8_init_oa_buffer(struct i915_perf_stream *stream)
 	stream->pollin = false;
 }
 
+static void gen12_init_oa_buffer(struct i915_perf_stream *stream)
+{
+	struct intel_uncore *uncore = stream->uncore;
+	u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
+	unsigned long flags;
+
+	spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
+
+	intel_uncore_write(uncore, GEN12_OAG_OASTATUS, 0);
+	intel_uncore_write(uncore, GEN12_OAG_OAHEADPTR,
+			   gtt_offset & GEN12_OAG_OAHEADPTR_MASK);
+	stream->oa_buffer.head = gtt_offset;
+
+	/*
+	 * PRM says:
+	 *
+	 *  "This MMIO must be set before the OATAILPTR
+	 *  register and after the OAHEADPTR register. This is
+	 *  to enable proper functionality of the overflow
+	 *  bit."
+	 */
+	intel_uncore_write(uncore, GEN12_OAG_OABUFFER, gtt_offset |
+			   OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT);
+	intel_uncore_write(uncore, GEN12_OAG_OATAILPTR,
+			   gtt_offset & GEN12_OAG_OATAILPTR_MASK);
+
+	/* Mark that we need updated tail pointers to read from... */
+	stream->oa_buffer.tails[0].offset = INVALID_TAIL_PTR;
+	stream->oa_buffer.tails[1].offset = INVALID_TAIL_PTR;
+
+	/*
+	 * Reset state used to recognise context switches, affecting which
+	 * reports we will forward to userspace while filtering for a single
+	 * context.
+	 */
+	stream->oa_buffer.last_ctx_id = INVALID_CTX_ID;
+
+	spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);
+
+	/*
+	 * NB: although the OA buffer will initially be allocated
+	 * zeroed via shmfs (and so this memset is redundant when
+	 * first allocating), we may re-init the OA buffer, either
+	 * when re-enabling a stream or in error/reset paths.
+	 *
+	 * The reason we clear the buffer for each re-init is for the
+	 * sanity check in gen8_append_oa_reports() that looks at the
+	 * reason field to make sure it's non-zero which relies on
+	 * the assumption that new reports are being written to zeroed
+	 * memory...
+	 */
+	memset(stream->oa_buffer.vaddr, 0,
+	       stream->oa_buffer.vma->size);
+
+	/*
+	 * Maybe make ->pollin per-stream state if we support multiple
+	 * concurrent streams in the future.
+	 */
+	stream->pollin = false;
+}
+
 static int alloc_oa_buffer(struct i915_perf_stream *stream)
 {
 	struct drm_i915_gem_object *bo;
@@ -1991,7 +2075,7 @@  gen8_update_reg_state_unlocked(const struct intel_context *ce,
 		(stream->periodic ? GEN8_OA_TIMER_ENABLE : 0) |
 		GEN8_OA_COUNTER_RESUME;
 
-	for (i = 0; i < ARRAY_SIZE(flex_regs); i++)
+	for (i = 0; !!ctx_flexeu0 && i < ARRAY_SIZE(flex_regs); i++)
 		reg_state[ctx_flexeu0 + i * 2 + 1] =
 			oa_config_flex_reg(stream->oa_config, flex_regs[i]);
 
@@ -2148,8 +2232,8 @@  static int gen8_configure_context(struct i915_gem_context *ctx,
  *
  * Note: it's only the RCS/Render context that has any OA state.
  */
-static int gen8_configure_all_contexts(struct i915_perf_stream *stream,
-				       const struct i915_oa_config *oa_config)
+static int lrc_configure_all_contexts(struct i915_perf_stream *stream,
+				      const struct i915_oa_config *oa_config)
 {
 	struct drm_i915_private *i915 = stream->perf->i915;
 	/* The MMIO offsets for Flex EU registers aren't contiguous */
@@ -2161,11 +2245,9 @@  static int gen8_configure_all_contexts(struct i915_perf_stream *stream,
 			CTX_R_PWR_CLK_STATE,
 		},
 		{
-			GEN8_OACTXCONTROL,
+			IS_GEN(i915, 12) ?
+			GEN12_OAR_OACONTROL: GEN8_OACTXCONTROL,
 			stream->perf->ctx_oactxctrl_offset + 1,
-			((stream->period_exponent << GEN8_OA_TIMER_PERIOD_SHIFT) |
-			 (stream->periodic ? GEN8_OA_TIMER_ENABLE : 0) |
-			 GEN8_OA_COUNTER_RESUME)
 		},
 		{ EU_PERF_CNTL0, ctx_flexeuN(0) },
 		{ EU_PERF_CNTL1, ctx_flexeuN(1) },
@@ -2178,9 +2260,23 @@  static int gen8_configure_all_contexts(struct i915_perf_stream *stream,
 #undef ctx_flexeuN
 	struct intel_engine_cs *engine;
 	struct i915_gem_context *ctx, *cn;
+	size_t array_size = IS_GEN(i915, 12) ? 2 : ARRAY_SIZE(regs);
 	int i, err;
 
-	for (i = 2; i < ARRAY_SIZE(regs); i++)
+	if (IS_GEN(i915, 12)) {
+		u32 format = stream->oa_buffer.format;
+
+		regs[1].value =
+			(format << GEN12_OAR_OACONTROL_COUNTER_FORMAT_SHIFT) |
+			(oa_config ? GEN12_OAR_OACONTROL_COUNTER_ENABLE : 0);
+	} else {
+		regs[1].value =
+			(stream->period_exponent << GEN8_OA_TIMER_PERIOD_SHIFT) |
+			(stream->periodic ? GEN8_OA_TIMER_ENABLE : 0) |
+			GEN8_OA_COUNTER_RESUME;
+	}
+
+	for (i = 2; !!ctx_flexeu0 && i < array_size; i++)
 		regs[i].value = oa_config_flex_reg(oa_config, regs[i].reg);
 
 	lockdep_assert_held(&stream->perf->lock);
@@ -2211,7 +2307,7 @@  static int gen8_configure_all_contexts(struct i915_perf_stream *stream,
 
 		spin_unlock(&i915->gem.contexts.lock);
 
-		err = gen8_configure_context(ctx, regs, ARRAY_SIZE(regs));
+		err = gen8_configure_context(ctx, regs, array_size);
 		if (err) {
 			i915_gem_context_put(ctx);
 			return err;
@@ -2236,7 +2332,7 @@  static int gen8_configure_all_contexts(struct i915_perf_stream *stream,
 
 		regs[0].value = intel_sseu_make_rpcs(i915, &ce->sseu);
 
-		err = gen8_modify_self(ce, regs, ARRAY_SIZE(regs));
+		err = gen8_modify_self(ce, regs, array_size);
 		if (err)
 			return err;
 	}
@@ -2284,7 +2380,45 @@  static int gen8_enable_metric_set(struct i915_perf_stream *stream)
 	 * to make sure all slices/subslices are ON before writing to NOA
 	 * registers.
 	 */
-	ret = gen8_configure_all_contexts(stream, oa_config);
+	ret = lrc_configure_all_contexts(stream, oa_config);
+	if (ret)
+		return ret;
+
+	return emit_oa_config(stream, oa_context(stream));
+}
+
+static int gen12_enable_metric_set(struct i915_perf_stream *stream)
+{
+	struct intel_uncore *uncore = stream->uncore;
+	struct i915_oa_config *oa_config = stream->oa_config;
+	bool periodic = stream->periodic;
+	u32 period_exponent = stream->period_exponent;
+	int ret;
+
+	intel_uncore_write(uncore, GEN12_OAG_OA_DEBUG,
+			   /* Disable clk ratio reports, like previous Gens. */
+			   _MASKED_BIT_ENABLE(GEN12_OAG_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS |
+					      GEN12_OAG_OA_DEBUG_INCLUDE_CLK_RATIO) |
+			   /*
+			    * If the user didn't require OA reports, instruct the
+			    * hardware not to emit ctx switch reports.
+			    */
+			   !(stream->sample_flags & SAMPLE_OA_REPORT) ?
+			   _MASKED_BIT_ENABLE(GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS) :
+			   _MASKED_BIT_DISABLE(GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS));
+
+	intel_uncore_write(uncore, GEN12_OAG_OAGLBCTXCTRL, periodic ?
+			   (GEN12_OAG_OAGLBCTXCTRL_COUNTER_RESUME |
+			    GEN12_OAG_OAGLBCTXCTRL_TIMER_ENABLE |
+			    (period_exponent << GEN12_OAG_OAGLBCTXCTRL_TIMER_PERIOD_SHIFT))
+			    : 0);
+
+	/*
+	 * Update all contexts prior writing the mux configurations as we need
+	 * to make sure all slices/subslices are ON before writing to NOA
+	 * registers.
+	 */
+	ret = lrc_configure_all_contexts(stream, oa_config);
 	if (ret)
 		return ret;
 
@@ -2296,7 +2430,7 @@  static void gen8_disable_metric_set(struct i915_perf_stream *stream)
 	struct intel_uncore *uncore = stream->uncore;
 
 	/* Reset all contexts' slices/subslices configurations. */
-	gen8_configure_all_contexts(stream, NULL);
+	lrc_configure_all_contexts(stream, NULL);
 
 	intel_uncore_rmw(uncore, GDT_CHICKEN_BITS, GT_NOA_ENABLE, 0);
 }
@@ -2306,7 +2440,7 @@  static void gen10_disable_metric_set(struct i915_perf_stream *stream)
 	struct intel_uncore *uncore = stream->uncore;
 
 	/* Reset all contexts' slices/subslices configurations. */
-	gen8_configure_all_contexts(stream, NULL);
+	lrc_configure_all_contexts(stream, NULL);
 
 	/* Make sure we disable noa to save power. */
 	intel_uncore_rmw(uncore, RPM_CONFIG1, GEN10_GT_NOA_ENABLE, 0);
@@ -2368,6 +2502,25 @@  static void gen8_oa_enable(struct i915_perf_stream *stream)
 			   GEN8_OA_COUNTER_ENABLE);
 }
 
+static void gen12_oa_enable(struct i915_perf_stream *stream)
+{
+	struct intel_uncore *uncore = stream->uncore;
+	u32 report_format = stream->oa_buffer.format;
+
+	/*
+	 * If we don't want OA reports from the OA buffer, then we don't even
+	 * need to program the OAG unit.
+	 */
+	if (!(stream->sample_flags & SAMPLE_OA_REPORT))
+		return;
+
+	gen12_init_oa_buffer(stream);
+
+	intel_uncore_write(uncore, GEN12_OAG_OACONTROL,
+			   (report_format << GEN12_OAG_OACONTROL_OA_COUNTER_FORMAT_SHIFT) |
+			   GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE);
+}
+
 /**
  * i915_oa_stream_enable - handle `I915_PERF_IOCTL_ENABLE` for OA stream
  * @stream: An i915 perf stream opened for OA metrics
@@ -2409,6 +2562,18 @@  static void gen8_oa_disable(struct i915_perf_stream *stream)
 		DRM_ERROR("wait for OA to be disabled timed out\n");
 }
 
+static void gen12_oa_disable(struct i915_perf_stream *stream)
+{
+	struct intel_uncore *uncore = stream->uncore;
+
+	intel_uncore_write(uncore, GEN12_OAG_OACONTROL, 0);
+	if (intel_wait_for_register(uncore,
+				    GEN12_OAG_OACONTROL,
+				    GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE, 0,
+				    50))
+		DRM_ERROR("wait for OA to be disabled timed out\n");
+}
+
 /**
  * i915_oa_stream_disable - handle `I915_PERF_IOCTL_DISABLE` for OA stream
  * @stream: An i915 perf stream opened for OA metrics
@@ -2608,7 +2773,7 @@  void i915_oa_init_reg_state(const struct intel_context *ce,
 {
 	struct i915_perf_stream *stream;
 
-	/* perf.exclusive_stream serialised by gen8_configure_all_contexts() */
+	/* perf.exclusive_stream serialised by lrc_configure_all_contexts() */
 	lockdep_assert_held(&ce->pin_mutex);
 
 	if (engine->class != RENDER_CLASS)
@@ -3037,16 +3202,24 @@  i915_perf_open_ioctl_locked(struct i915_perf *perf,
 	 * rest of the system, which we consider acceptable for a
 	 * non-privileged client.
 	 *
-	 * For Gen8+ the OA unit no longer supports clock gating off for a
+	 * For Gen8->11 the OA unit no longer supports clock gating off for a
 	 * specific context and the kernel can't securely stop the counters
 	 * from updating as system-wide / global values. Even though we can
 	 * filter reports based on the included context ID we can't block
 	 * clients from seeing the raw / global counter values via
 	 * MI_REPORT_PERF_COUNT commands and so consider it a privileged op to
 	 * enable the OA unit by default.
+	 *
+	 * For Gen12+ we gain a new OAR unit that only monitors the RCS on a
+	 * per context basis. So we can relax requirements there if the user
+	 * doesn't request global stream access (i.e. query based sampling
+	 * using MI_RECORD_PERF_COUNT.
 	 */
 	if (IS_HASWELL(perf->i915) && specific_ctx)
 		privileged_op = false;
+	else if (IS_GEN(perf->i915, 12) && specific_ctx &&
+		 (props->sample_flags & SAMPLE_OA_REPORT) == 0)
+		privileged_op = false;
 
 	/* Similar to perf's kernel.perf_paranoid_cpu sysctl option
 	 * we check a dev.i915.perf_stream_paranoid sysctl option
@@ -3358,7 +3531,9 @@  void i915_perf_register(struct drm_i915_private *i915)
 
 	sysfs_attr_init(&perf->test_config.sysfs_metric_id.attr);
 
-	if (INTEL_GEN(i915) >= 11) {
+	if (IS_TIGERLAKE(i915)) {
+		i915_perf_load_test_config_tgl(i915);
+	} else if (INTEL_GEN(i915) >= 11) {
 		i915_perf_load_test_config_icl(i915);
 	} else if (IS_CANNONLAKE(i915)) {
 		i915_perf_load_test_config_cnl(i915);
@@ -3509,6 +3684,28 @@  static bool chv_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
 	       ADDR_IN_RANGE(addr, 0x182300, 0x1823A4);
 }
 
+static bool gen12_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
+{
+	return REG_IN_RANGE(addr, GEN12_OAG_OASTARTTRIG1, GEN12_OAG_OASTARTTRIG8) ||
+	       REG_IN_RANGE(addr, GEN12_OAG_OAREPORTTRIG1, GEN12_OAG_OAREPORTTRIG8) ||
+	       REG_IN_RANGE(addr, GEN12_OAG_CEC0_0, GEN12_OAG_CEC7_1) ||
+	       REG_IN_RANGE(addr, GEN12_OAG_SCEC0_0, GEN12_OAG_SCEC7_1) ||
+	       REG_EQUAL(addr, GEN12_OAA_DBG_REG) ||
+	       REG_EQUAL(addr, GEN12_OAG_OA_PESS) ||
+	       REG_EQUAL(addr, GEN12_OAG_SPCTR_CNF);
+}
+
+static bool gen12_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
+{
+	return REG_EQUAL(addr, NOA_WRITE) ||
+	       REG_EQUAL(addr, GEN10_NOA_WRITE_HIGH) ||
+	       REG_EQUAL(addr, GDT_CHICKEN_BITS) ||
+	       REG_EQUAL(addr, WAIT_FOR_RC6_EXIT) ||
+	       REG_EQUAL(addr, RPM_CONFIG0) ||
+	       REG_EQUAL(addr, RPM_CONFIG1) ||
+	       REG_IN_RANGE(addr, NOA_CONFIG(0), NOA_CONFIG(8));
+}
+
 static u32 mask_reg_value(u32 reg, u32 val)
 {
 	/* HALF_SLICE_CHICKEN2 is programmed with a the
@@ -3902,14 +4099,11 @@  void i915_perf_init(struct drm_i915_private *i915)
 		 * worth the complexity to maintain now that BDW+ enable
 		 * execlist mode by default.
 		 */
-		perf->oa_formats = gen8_plus_oa_formats;
-
-		perf->ops.oa_enable = gen8_oa_enable;
-		perf->ops.oa_disable = gen8_oa_disable;
 		perf->ops.read = gen8_oa_read;
-		perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
 
 		if (IS_GEN_RANGE(i915, 8, 9)) {
+			perf->oa_formats = gen8_plus_oa_formats;
+
 			perf->ops.is_valid_b_counter_reg =
 				gen7_is_valid_b_counter_addr;
 			perf->ops.is_valid_mux_reg =
@@ -3922,8 +4116,11 @@  void i915_perf_init(struct drm_i915_private *i915)
 					chv_is_valid_mux_addr;
 			}
 
+			perf->ops.oa_enable = gen8_oa_enable;
+			perf->ops.oa_disable = gen8_oa_disable;
 			perf->ops.enable_metric_set = gen8_enable_metric_set;
 			perf->ops.disable_metric_set = gen8_disable_metric_set;
+			perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
 
 			if (IS_GEN(i915, 8)) {
 				perf->ctx_oactxctrl_offset = 0x120;
@@ -3937,6 +4134,8 @@  void i915_perf_init(struct drm_i915_private *i915)
 				perf->gen8_valid_ctx_bit = BIT(16);
 			}
 		} else if (IS_GEN_RANGE(i915, 10, 11)) {
+			perf->oa_formats = gen8_plus_oa_formats;
+
 			perf->ops.is_valid_b_counter_reg =
 				gen7_is_valid_b_counter_addr;
 			perf->ops.is_valid_mux_reg =
@@ -3944,8 +4143,11 @@  void i915_perf_init(struct drm_i915_private *i915)
 			perf->ops.is_valid_flex_reg =
 				gen8_is_valid_flex_addr;
 
+			perf->ops.oa_enable = gen8_oa_enable;
+			perf->ops.oa_disable = gen8_oa_disable;
 			perf->ops.enable_metric_set = gen8_enable_metric_set;
 			perf->ops.disable_metric_set = gen10_disable_metric_set;
+			perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
 
 			if (IS_GEN(i915, 10)) {
 				perf->ctx_oactxctrl_offset = 0x128;
@@ -3955,6 +4157,24 @@  void i915_perf_init(struct drm_i915_private *i915)
 				perf->ctx_flexeu0_offset = 0x78e;
 			}
 			perf->gen8_valid_ctx_bit = BIT(16);
+		} else if (IS_GEN(i915, 12)) {
+			perf->oa_formats = gen12_oa_formats;
+
+			perf->ops.is_valid_b_counter_reg =
+				gen12_is_valid_b_counter_addr;
+			perf->ops.is_valid_mux_reg =
+				gen12_is_valid_mux_addr;
+			perf->ops.is_valid_flex_reg =
+				gen8_is_valid_flex_addr;
+
+			perf->ops.oa_enable = gen12_oa_enable;
+			perf->ops.oa_disable = gen12_oa_disable;
+			perf->ops.enable_metric_set = gen12_enable_metric_set;
+			perf->ops.disable_metric_set = gen10_disable_metric_set;
+			perf->ops.oa_hw_tail_read = gen12_oa_hw_tail_read;
+
+			perf->ctx_flexeu0_offset = 0;
+			perf->ctx_oactxctrl_offset = 0x144;
 		}
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index e24991e54897..2e11963f3eb5 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -684,6 +684,45 @@  static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define OABUFFER_SIZE_8M    (6 << 3)
 #define OABUFFER_SIZE_16M   (7 << 3)
 
+/* Gen12 OAR unit */
+#define GEN12_OAR_OACONTROL _MMIO(0x2960)
+#define  GEN12_OAR_OACONTROL_COUNTER_FORMAT_SHIFT 1
+#define  GEN12_OAR_OACONTROL_COUNTER_ENABLE       (1 << 0)
+
+#define GEN12_OACTXCONTROL _MMIO(0x2360)
+#define GEN12_OAR_OASTATUS _MMIO(0x2968)
+
+/* Gen12 OAG unit */
+#define GEN12_OAG_OAHEADPTR _MMIO(0xdb00)
+#define  GEN12_OAG_OAHEADPTR_MASK 0xffffffc0
+#define GEN12_OAG_OATAILPTR _MMIO(0xdb04)
+#define  GEN12_OAG_OATAILPTR_MASK 0xffffffc0
+
+#define GEN12_OAG_OABUFFER  _MMIO(0xdb08)
+#define  GEN12_OAG_OABUFFER_BUFFER_SIZE_MASK  (0x7)
+#define  GEN12_OAG_OABUFFER_BUFFER_SIZE_SHIFT (3)
+#define  GEN12_OAG_OABUFFER_MEMORY_SELECT     (1 << 0) /* 0: PPGTT, 1: GGTT */
+
+#define GEN12_OAG_OAGLBCTXCTRL _MMIO(0x2b28)
+#define  GEN12_OAG_OAGLBCTXCTRL_TIMER_PERIOD_SHIFT 2
+#define  GEN12_OAG_OAGLBCTXCTRL_TIMER_ENABLE       (1 << 1)
+#define  GEN12_OAG_OAGLBCTXCTRL_COUNTER_RESUME     (1 << 0)
+
+#define GEN12_OAG_OACONTROL _MMIO(0xdaf4)
+#define  GEN12_OAG_OACONTROL_OA_COUNTER_FORMAT_SHIFT 2
+#define  GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE       (1 << 0)
+
+#define GEN12_OAG_OA_DEBUG _MMIO(0xdaf8)
+#define  GEN12_OAG_OA_DEBUG_INCLUDE_CLK_RATIO          (1 << 6)
+#define  GEN12_OAG_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS  (1 << 5)
+#define  GEN12_OAG_OA_DEBUG_DISABLE_GO_1_0_REPORTS     (1 << 2)
+#define  GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS (1 << 1)
+
+#define GEN12_OAG_OASTATUS _MMIO(0xdafc)
+#define  GEN12_OAG_OASTATUS_COUNTER_OVERFLOW (1 << 2)
+#define  GEN12_OAG_OASTATUS_BUFFER_OVERFLOW  (1 << 1)
+#define  GEN12_OAG_OASTATUS_REPORT_LOST      (1 << 0)
+
 /*
  * Flexible, Aggregate EU Counter Registers.
  * Note: these aren't contiguous
@@ -920,6 +959,26 @@  static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define OAREPORTTRIG8_NOA_SELECT_6_SHIFT    24
 #define OAREPORTTRIG8_NOA_SELECT_7_SHIFT    28
 
+/* Same layout as OASTARTTRIGX */
+#define GEN12_OAG_OASTARTTRIG1 _MMIO(0xd900)
+#define GEN12_OAG_OASTARTTRIG2 _MMIO(0xd904)
+#define GEN12_OAG_OASTARTTRIG3 _MMIO(0xd908)
+#define GEN12_OAG_OASTARTTRIG4 _MMIO(0xd90c)
+#define GEN12_OAG_OASTARTTRIG5 _MMIO(0xd910)
+#define GEN12_OAG_OASTARTTRIG6 _MMIO(0xd914)
+#define GEN12_OAG_OASTARTTRIG7 _MMIO(0xd918)
+#define GEN12_OAG_OASTARTTRIG8 _MMIO(0xd91c)
+
+/* Same layout as OAREPORTTRIGX */
+#define GEN12_OAG_OAREPORTTRIG1 _MMIO(0xd920)
+#define GEN12_OAG_OAREPORTTRIG2 _MMIO(0xd924)
+#define GEN12_OAG_OAREPORTTRIG3 _MMIO(0xd928)
+#define GEN12_OAG_OAREPORTTRIG4 _MMIO(0xd92c)
+#define GEN12_OAG_OAREPORTTRIG5 _MMIO(0xd930)
+#define GEN12_OAG_OAREPORTTRIG6 _MMIO(0xd934)
+#define GEN12_OAG_OAREPORTTRIG7 _MMIO(0xd938)
+#define GEN12_OAG_OAREPORTTRIG8 _MMIO(0xd93c)
+
 /* CECX_0 */
 #define OACEC_COMPARE_LESS_OR_EQUAL	6
 #define OACEC_COMPARE_NOT_EQUAL		5
@@ -936,6 +995,10 @@  static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define OACEC_SELECT_PREV	(1 << 19)
 #define OACEC_SELECT_BOOLEAN	(2 << 19)
 
+/* 11-bit array 0: pass-through, 1: negated */
+#define GEN12_OASCEC_NEGATE_MASK  0x7ff
+#define GEN12_OASCEC_NEGATE_SHIFT 21
+
 /* CECX_1 */
 #define OACEC_MASK_MASK		    0xffff
 #define OACEC_CONSIDERATIONS_MASK   0xffff
@@ -958,6 +1021,42 @@  static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define OACEC7_0 _MMIO(0x27a8)
 #define OACEC7_1 _MMIO(0x27ac)
 
+/* Same layout as CECX_Y */
+#define GEN12_OAG_CEC0_0 _MMIO(0xd940)
+#define GEN12_OAG_CEC0_1 _MMIO(0xd944)
+#define GEN12_OAG_CEC1_0 _MMIO(0xd948)
+#define GEN12_OAG_CEC1_1 _MMIO(0xd94c)
+#define GEN12_OAG_CEC2_0 _MMIO(0xd950)
+#define GEN12_OAG_CEC2_1 _MMIO(0xd954)
+#define GEN12_OAG_CEC3_0 _MMIO(0xd958)
+#define GEN12_OAG_CEC3_1 _MMIO(0xd95c)
+#define GEN12_OAG_CEC4_0 _MMIO(0xd960)
+#define GEN12_OAG_CEC4_1 _MMIO(0xd964)
+#define GEN12_OAG_CEC5_0 _MMIO(0xd968)
+#define GEN12_OAG_CEC5_1 _MMIO(0xd96c)
+#define GEN12_OAG_CEC6_0 _MMIO(0xd970)
+#define GEN12_OAG_CEC6_1 _MMIO(0xd974)
+#define GEN12_OAG_CEC7_0 _MMIO(0xd978)
+#define GEN12_OAG_CEC7_1 _MMIO(0xd97c)
+
+/* Same layout as CECX_Y + negate 11-bit array */
+#define GEN12_OAG_SCEC0_0 _MMIO(0xdc00)
+#define GEN12_OAG_SCEC0_1 _MMIO(0xdc04)
+#define GEN12_OAG_SCEC1_0 _MMIO(0xdc08)
+#define GEN12_OAG_SCEC1_1 _MMIO(0xdc0c)
+#define GEN12_OAG_SCEC2_0 _MMIO(0xdc10)
+#define GEN12_OAG_SCEC2_1 _MMIO(0xdc14)
+#define GEN12_OAG_SCEC3_0 _MMIO(0xdc18)
+#define GEN12_OAG_SCEC3_1 _MMIO(0xdc1c)
+#define GEN12_OAG_SCEC4_0 _MMIO(0xdc20)
+#define GEN12_OAG_SCEC4_1 _MMIO(0xdc24)
+#define GEN12_OAG_SCEC5_0 _MMIO(0xdc28)
+#define GEN12_OAG_SCEC5_1 _MMIO(0xdc2c)
+#define GEN12_OAG_SCEC6_0 _MMIO(0xdc30)
+#define GEN12_OAG_SCEC6_1 _MMIO(0xdc34)
+#define GEN12_OAG_SCEC7_0 _MMIO(0xdc38)
+#define GEN12_OAG_SCEC7_1 _MMIO(0xdc3c)
+
 /* OA perf counters */
 #define OA_PERFCNT1_LO      _MMIO(0x91B8)
 #define OA_PERFCNT1_HI      _MMIO(0x91BC)
@@ -1038,6 +1137,10 @@  static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define MICRO_BP3_COUNT_STATUS23	_MMIO(0x9838)
 #define MICRO_BP_FIRED_ARMED		_MMIO(0x983C)
 
+#define GEN12_OAA_DBG_REG _MMIO(0xdc44)
+#define GEN12_OAG_OA_PESS _MMIO(0x2b2c)
+#define GEN12_OAG_SPCTR_CNF _MMIO(0xdc40)
+
 #define GDT_CHICKEN_BITS    _MMIO(0x9840)
 #define   GT_NOA_ENABLE	    0x00000080
 
diff --git a/drivers/gpu/drm/i915/oa/i915_oa_tgl.c b/drivers/gpu/drm/i915/oa/i915_oa_tgl.c
new file mode 100644
index 000000000000..a29d93707345
--- /dev/null
+++ b/drivers/gpu/drm/i915/oa/i915_oa_tgl.c
@@ -0,0 +1,121 @@ 
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Autogenerated file by GPU Top : https://github.com/rib/gputop
+ * DO NOT EDIT manually!
+ */
+
+#include <linux/sysfs.h>
+
+#include "i915_drv.h"
+#include "i915_oa_tgl.h"
+
+static const struct i915_oa_reg b_counter_config_test_oa[] = {
+	{ _MMIO(0xD920), 0x00000000 },
+	{ _MMIO(0xD900), 0x00000000 },
+	{ _MMIO(0xD904), 0xF0800000 },
+	{ _MMIO(0xD910), 0x00000000 },
+	{ _MMIO(0xD914), 0xF0800000 },
+	{ _MMIO(0xDC40), 0x00FF0000 },
+	{ _MMIO(0xD940), 0x00000004 },
+	{ _MMIO(0xD944), 0x0000FFFF },
+	{ _MMIO(0xDC00), 0x00000004 },
+	{ _MMIO(0xDC04), 0x0000FFFF },
+	{ _MMIO(0xD948), 0x00000003 },
+	{ _MMIO(0xD94C), 0x0000FFFF },
+	{ _MMIO(0xDC08), 0x00000003 },
+	{ _MMIO(0xDC0C), 0x0000FFFF },
+	{ _MMIO(0xD950), 0x00000007 },
+	{ _MMIO(0xD954), 0x0000FFFF },
+	{ _MMIO(0xDC10), 0x00000007 },
+	{ _MMIO(0xDC14), 0x0000FFFF },
+	{ _MMIO(0xD958), 0x00100002 },
+	{ _MMIO(0xD95C), 0x0000FFF7 },
+	{ _MMIO(0xDC18), 0x00100002 },
+	{ _MMIO(0xDC1C), 0x0000FFF7 },
+	{ _MMIO(0xD960), 0x00100002 },
+	{ _MMIO(0xD964), 0x0000FFCF },
+	{ _MMIO(0xDC20), 0x00100002 },
+	{ _MMIO(0xDC24), 0x0000FFCF },
+	{ _MMIO(0xD968), 0x00100082 },
+	{ _MMIO(0xD96C), 0x0000FFEF },
+	{ _MMIO(0xDC28), 0x00100082 },
+	{ _MMIO(0xDC2C), 0x0000FFEF },
+	{ _MMIO(0xD970), 0x001000C2 },
+	{ _MMIO(0xD974), 0x0000FFE7 },
+	{ _MMIO(0xDC30), 0x001000C2 },
+	{ _MMIO(0xDC34), 0x0000FFE7 },
+	{ _MMIO(0xD978), 0x00100001 },
+	{ _MMIO(0xD97C), 0x0000FFE7 },
+	{ _MMIO(0xDC38), 0x00100001 },
+	{ _MMIO(0xDC3C), 0x0000FFE7 },
+};
+
+static const struct i915_oa_reg flex_eu_config_test_oa[] = {
+};
+
+static const struct i915_oa_reg mux_config_test_oa[] = {
+	{ _MMIO(0x0D04), 0x00000200 },
+	{ _MMIO(0x9840), 0x00000000 },
+	{ _MMIO(0x9884), 0x00000000 },
+	{ _MMIO(0x9888), 0x280E0000 },
+	{ _MMIO(0x9888), 0x1E0E0147 },
+	{ _MMIO(0x9888), 0x180E0000 },
+	{ _MMIO(0x9888), 0x160E0000 },
+	{ _MMIO(0x9888), 0x1E0F1000 },
+	{ _MMIO(0x9888), 0x1E104000 },
+	{ _MMIO(0x9888), 0x2E020100 },
+	{ _MMIO(0x9888), 0x2C030004 },
+	{ _MMIO(0x9888), 0x38003000 },
+	{ _MMIO(0x9888), 0x1E0A8000 },
+	{ _MMIO(0x9884), 0x00000003 },
+	{ _MMIO(0x9888), 0x49110000 },
+	{ _MMIO(0x9888), 0x5D101400 },
+	{ _MMIO(0x9888), 0x1D140020 },
+	{ _MMIO(0x9888), 0x1D1103A3 },
+	{ _MMIO(0x9888), 0x01110000 },
+	{ _MMIO(0x9888), 0x61111000 },
+	{ _MMIO(0x9888), 0x1F128000 },
+	{ _MMIO(0x9888), 0x17100000 },
+	{ _MMIO(0x9888), 0x55100630 },
+	{ _MMIO(0x9888), 0x57100000 },
+	{ _MMIO(0x9888), 0x31100000 },
+	{ _MMIO(0x9884), 0x00000003 },
+	{ _MMIO(0x9888), 0x65100002 },
+	{ _MMIO(0x9884), 0x00000000 },
+	{ _MMIO(0x9888), 0x42000001 },
+};
+
+static ssize_t
+show_test_oa_id(struct device *kdev, struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "1\n");
+}
+
+void
+i915_perf_load_test_config_tgl(struct drm_i915_private *dev_priv)
+{
+	strlcpy(dev_priv->perf.test_config.uuid,
+		"80a833f0-2504-4321-8894-e9277844ce7b",
+		sizeof(dev_priv->perf.test_config.uuid));
+	dev_priv->perf.test_config.id = 1;
+
+	dev_priv->perf.test_config.mux_regs = mux_config_test_oa;
+	dev_priv->perf.test_config.mux_regs_len = ARRAY_SIZE(mux_config_test_oa);
+
+	dev_priv->perf.test_config.b_counter_regs = b_counter_config_test_oa;
+	dev_priv->perf.test_config.b_counter_regs_len = ARRAY_SIZE(b_counter_config_test_oa);
+
+	dev_priv->perf.test_config.flex_regs = flex_eu_config_test_oa;
+	dev_priv->perf.test_config.flex_regs_len = ARRAY_SIZE(flex_eu_config_test_oa);
+
+	dev_priv->perf.test_config.sysfs_metric.name = "80a833f0-2504-4321-8894-e9277844ce7b";
+	dev_priv->perf.test_config.sysfs_metric.attrs = dev_priv->perf.test_config.attrs;
+
+	dev_priv->perf.test_config.attrs[0] = &dev_priv->perf.test_config.sysfs_metric_id.attr;
+
+	dev_priv->perf.test_config.sysfs_metric_id.attr.name = "id";
+	dev_priv->perf.test_config.sysfs_metric_id.attr.mode = 0444;
+	dev_priv->perf.test_config.sysfs_metric_id.show = show_test_oa_id;
+}
diff --git a/drivers/gpu/drm/i915/oa/i915_oa_tgl.h b/drivers/gpu/drm/i915/oa/i915_oa_tgl.h
new file mode 100644
index 000000000000..4c25f0be825c
--- /dev/null
+++ b/drivers/gpu/drm/i915/oa/i915_oa_tgl.h
@@ -0,0 +1,16 @@ 
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Autogenerated file by GPU Top : https://github.com/rib/gputop
+ * DO NOT EDIT manually!
+ */
+
+#ifndef __I915_OA_TGL_H__
+#define __I915_OA_TGL_H__
+
+struct drm_i915_private;
+
+void i915_perf_load_test_config_tgl(struct drm_i915_private *dev_priv);
+
+#endif