diff mbox series

[v5,3/4] drm/i915/perf: prepare driver to receive multiple ctx handles

Message ID 20200409091706.690045-3-lionel.g.landwerlin@intel.com (mailing list archive)
State New, archived
Headers show
Series [v5,1/4] drm/i915/perf: break OA config buffer object in 2 | expand

Commit Message

Lionel Landwerlin April 9, 2020, 9:17 a.m. UTC
Make all the internal necessary changes before we flip the switch.

v2: Use an unlimited number of intel contexts (Chris)

v3: Handle GEM context with multiple RCS0 logical contexts (Chris)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
---
 drivers/gpu/drm/i915/i915_perf.c       | 556 +++++++++++++++----------
 drivers/gpu/drm/i915/i915_perf_types.h |  37 +-
 2 files changed, 359 insertions(+), 234 deletions(-)

Comments

Umesh Nerlige Ramappa April 13, 2020, 11:34 p.m. UTC | #1
On Thu, Apr 09, 2020 at 12:17:05PM +0300, Lionel Landwerlin wrote:
>Make all the internal necessary changes before we flip the switch.
>
>v2: Use an unlimited number of intel contexts (Chris)
>
>v3: Handle GEM context with multiple RCS0 logical contexts (Chris)
>
>Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
>---
> drivers/gpu/drm/i915/i915_perf.c       | 556 +++++++++++++++----------
> drivers/gpu/drm/i915/i915_perf_types.h |  37 +-
> 2 files changed, 359 insertions(+), 234 deletions(-)
>
>diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
>index db526e0c160a..543d29cd5c14 100644
>--- a/drivers/gpu/drm/i915/i915_perf.c
>+++ b/drivers/gpu/drm/i915/i915_perf.c
>@@ -192,6 +192,7 @@
>  */
>
> #include <linux/anon_inodes.h>
>+#include <linux/bsearch.h>
> #include <linux/sizes.h>
> #include <linux/uuid.h>
>
>@@ -329,7 +330,8 @@ static const struct i915_oa_format gen12_oa_formats[I915_OA_FORMAT_MAX] = {
>  * @single_context: Whether a single or all gpu contexts should be monitored
>  * @hold_preemption: Whether the preemption is disabled for the filtered
>  *                   context
>- * @ctx_handle: A gem ctx handle for use with @single_context
>+ * @n_ctx_handles: Length of @ctx_handles
>+ * @ctx_handles: An array of gem context handles
>  * @metrics_set: An ID for an OA unit metric set advertised via sysfs
>  * @oa_format: An OA unit HW report format
>  * @oa_periodic: Whether to enable periodic OA unit sampling
>@@ -349,9 +351,10 @@ static const struct i915_oa_format gen12_oa_formats[I915_OA_FORMAT_MAX] = {
> struct perf_open_properties {
> 	u32 sample_flags;
>
>-	u64 single_context:1;
> 	u64 hold_preemption:1;
>-	u64 ctx_handle;
>+
>+	u32 n_ctx_handles;
>+	u32 *ctx_handles;
>
> 	/* OA sampling state */
> 	int metrics_set;
>@@ -625,6 +628,23 @@ static int append_oa_sample(struct i915_perf_stream *stream,
> 	return 0;
> }
>
>+static int ctx_id_equal(const void *key, const void *elem)
>+{
>+	const struct i915_perf_context_detail *details = elem;
>+
>+	return ((int)details->id) - *((int *)key);
>+}
>+
>+static inline bool ctx_id_match(struct i915_perf_stream *stream,
>+				u32 masked_ctx_id)
>+{
>+	return bsearch(&masked_ctx_id,
>+		       stream->pinned_ctxs,
>+		       stream->n_pinned_ctxs,
>+		       sizeof(*stream->pinned_ctxs),
>+		       ctx_id_equal) != NULL;
>+}
>+
> /**
>  * Copies all buffered OA reports into userspace read() buffer.
>  * @stream: An i915-perf stream opened for OA metrics
>@@ -736,7 +756,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
> 			continue;
> 		}
>
>-		ctx_id = report32[2] & stream->specific_ctx_id_mask;
>+		ctx_id = report32[2] & stream->ctx_id_mask;
>
> 		/*
> 		 * Squash whatever is in the CTX_ID field if it's marked as
>@@ -781,26 +801,32 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
> 		 * switches since it's not-uncommon for periodic samples to
> 		 * identify a switch before any 'context switch' report.
> 		 */
>-		if (!stream->perf->exclusive_stream->ctx ||
>-		    stream->specific_ctx_id == ctx_id ||
>-		    stream->oa_buffer.last_ctx_id == stream->specific_ctx_id ||
>-		    reason & OAREPORT_REASON_CTX_SWITCH) {
>-
>-			/*
>-			 * While filtering for a single context we avoid
>-			 * leaking the IDs of other contexts.
>-			 */
>-			if (stream->perf->exclusive_stream->ctx &&
>-			    stream->specific_ctx_id != ctx_id) {
>-				report32[2] = INVALID_CTX_ID;
>-			}
>-
>+		if (!stream->perf->exclusive_stream->n_ctxs) {
> 			ret = append_oa_sample(stream, buf, count, offset,
> 					       report);
> 			if (ret)
> 				break;
>+		} else {
>+			bool ctx_match = ctx_id != INVALID_CTX_ID &&
>+				ctx_id_match(stream, ctx_id);
>+
>+			if (ctx_match ||
>+			    stream->oa_buffer.last_ctx_match ||
>+			    reason & OAREPORT_REASON_CTX_SWITCH) {
>+				/*
>+				 * While filtering for a single context we avoid
>+				 * leaking the IDs of other contexts.
>+				 */
>+				if (!ctx_match)
>+					report32[2] = INVALID_CTX_ID;
>+
>+				ret = append_oa_sample(stream, buf, count, offset,
>+						       report);
>+				if (ret)
>+					break;
>+			}
>
>-			stream->oa_buffer.last_ctx_id = ctx_id;
>+			stream->oa_buffer.last_ctx_match = ctx_match;
> 		}
>
> 		/*
>@@ -1191,138 +1217,176 @@ static int i915_oa_read(struct i915_perf_stream *stream,
> 	return stream->perf->ops.read(stream, buf, count, offset);
> }
>
>-static struct intel_context *oa_pin_context(struct i915_perf_stream *stream)
>+static u32 get_ctx_id_mask(struct intel_engine_cs *engine)
> {
>-	struct i915_gem_engines_iter it;
>-	struct i915_gem_context *ctx = stream->ctx;
>-	struct intel_context *ce;
>-	int err;
>+	switch (INTEL_GEN(engine->i915)) {
>+	case 7:
>+		/*
>+		 * On Haswell we don't do any post processing of the reports
>+		 * and don't need to use the mask.
>+		 */
>+		return 0;
>
>-	for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it) {
>-		if (ce->engine != stream->engine) /* first match! */
>-			continue;
>+	case 8:
>+	case 9:
>+	case 10:
>+		if (intel_engine_in_execlists_submission_mode(engine))
>+			return (1U << GEN8_CTX_ID_WIDTH) - 1;
>
> 		/*
>-		 * As the ID is the gtt offset of the context's vma we
>-		 * pin the vma to ensure the ID remains fixed.
>+		 * GuC uses the top bit to signal proxy submission, so ignore
>+		 * that bit.
> 		 */
>-		err = intel_context_pin(ce);
>-		if (err == 0) {
>-			stream->pinned_ctx = ce;
>-			break;
>-		}
>-	}
>-	i915_gem_context_unlock_engines(ctx);
>+		return (1U << (GEN8_CTX_ID_WIDTH - 1)) - 1;
>
>-	return stream->pinned_ctx;
>+	case 11:
>+	case 12:
>+		/*
>+		 * 0x7ff is used by idle context.
>+		 */
>+		BUILD_BUG_ON((GEN12_MAX_CONTEXT_HW_ID - 1) < NUM_CONTEXT_TAG);
>+		return ((1U << GEN11_SW_CTX_ID_WIDTH) - 1) << (GEN11_SW_CTX_ID_SHIFT - 32);
>+
>+	default:
>+		MISSING_CASE(INTEL_GEN(engine->i915));
>+		return 0;
>+	}
> }
>
>-/**
>- * oa_get_render_ctx_id - determine and hold ctx hw id
>- * @stream: An i915-perf stream opened for OA metrics
>- *
>- * Determine the render context hw id, and ensure it remains fixed for the
>- * lifetime of the stream. This ensures that we don't have to worry about
>- * updating the context ID in OACONTROL on the fly.
>- *
>- * Returns: zero on success or a negative error code
>- */
>-static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
>+static u32 get_ctx_id(struct intel_context *ce, int idx)
> {
>-	struct intel_context *ce;
>-
>-	ce = oa_pin_context(stream);
>-	if (IS_ERR(ce))
>-		return PTR_ERR(ce);
>
> 	switch (INTEL_GEN(ce->engine->i915)) {
>-	case 7: {
>-		/*
>-		 * On Haswell we don't do any post processing of the reports
>-		 * and don't need to use the mask.
>-		 */
>-		stream->specific_ctx_id = i915_ggtt_offset(ce->state);
>-		stream->specific_ctx_id_mask = 0;
>-		break;
>-	}
>+	case 7:
>+		return i915_ggtt_offset(ce->state);
>
> 	case 8:
> 	case 9:
> 	case 10:
>-		if (intel_engine_in_execlists_submission_mode(ce->engine)) {
>-			stream->specific_ctx_id_mask =
>-				(1U << GEN8_CTX_ID_WIDTH) - 1;
>-			stream->specific_ctx_id = stream->specific_ctx_id_mask;
>-		} else {
>-			/*
>-			 * When using GuC, the context descriptor we write in
>-			 * i915 is read by GuC and rewritten before it's
>-			 * actually written into the hardware. The LRCA is
>-			 * what is put into the context id field of the
>-			 * context descriptor by GuC. Because it's aligned to
>-			 * a page, the lower 12bits are always at 0 and
>-			 * dropped by GuC. They won't be part of the context
>-			 * ID in the OA reports, so squash those lower bits.
>-			 */
>-			stream->specific_ctx_id =
>-				lower_32_bits(ce->lrc_desc) >> 12;
>+		if (intel_engine_in_execlists_submission_mode(ce->engine))
>+			return (1U << GEN8_CTX_ID_WIDTH) - 1 - idx;
>
>-			/*
>-			 * GuC uses the top bit to signal proxy submission, so
>-			 * ignore that bit.
>-			 */
>-			stream->specific_ctx_id_mask =
>-				(1U << (GEN8_CTX_ID_WIDTH - 1)) - 1;
>-		}
>-		break;
>+		/*
>+		 * When using GuC, the context descriptor we write in i915 is
>+		 * read by GuC and rewritten before it's actually written into
>+		 * the hardware. The LRCA is what is put into the context id
>+		 * field of the context descriptor by GuC. Because it's
>+		 * aligned to a page, the lower 12bits are always at 0 and
>+		 * dropped by GuC. They won't be part of the context ID in the
>+		 * OA reports, so squash those lower bits.
>+		 */
>+		return lower_32_bits(ce->lrc_desc) >> 12;
>
> 	case 11:
>-	case 12: {
>-		stream->specific_ctx_id_mask =
>-			((1U << GEN11_SW_CTX_ID_WIDTH) - 1) << (GEN11_SW_CTX_ID_SHIFT - 32);
>+	case 12:
> 		/*
>-		 * Pick an unused context id
>-		 * 0 - (NUM_CONTEXT_TAG - 1) are used by other contexts
>-		 * GEN12_MAX_CONTEXT_HW_ID (0x7ff) is used by idle context
>+		 * Pick an unused context id 0 -
>+		 * (NUM_CONTEXT_TAG - 1) are used by other
>+		 * contexts GEN12_MAX_CONTEXT_HW_ID (0x7ff) is
>+		 * used by idle context
> 		 */
>-		stream->specific_ctx_id = (GEN12_MAX_CONTEXT_HW_ID - 1) << (GEN11_SW_CTX_ID_SHIFT - 32);
>-		BUILD_BUG_ON((GEN12_MAX_CONTEXT_HW_ID - 1) < NUM_CONTEXT_TAG);
>-		break;
>-	}
>+		return ((GEN12_MAX_CONTEXT_HW_ID - 1) - idx) << (GEN11_SW_CTX_ID_SHIFT - 32);
>
> 	default:
> 		MISSING_CASE(INTEL_GEN(ce->engine->i915));
>+		return 0;
> 	}
>-
>-	ce->tag = stream->specific_ctx_id;
>-
>-	drm_dbg(&stream->perf->i915->drm,
>-		"filtering on ctx_id=0x%x ctx_id_mask=0x%x\n",
>-		stream->specific_ctx_id,
>-		stream->specific_ctx_id_mask);
>-
>-	return 0;
> }
>
> /**
>- * oa_put_render_ctx_id - counterpart to oa_get_render_ctx_id releases hold
>+ * oa_put_render_ctx_id - counterpart to oa_get_render_ctx_ids releases hold
>  * @stream: An i915-perf stream opened for OA metrics
>  *
>  * In case anything needed doing to ensure the context HW ID would remain valid
>  * for the lifetime of the stream, then that can be undone here.
>  */
>-static void oa_put_render_ctx_id(struct i915_perf_stream *stream)
>+static void oa_put_render_ctx_ids(struct i915_perf_stream *stream)
>+{
>+	int i;
>+
>+	for (i = 0; i < stream->n_pinned_ctxs; i++) {
>+		struct intel_context *ce;
>+
>+		ce = fetch_and_zero(&stream->pinned_ctxs[i].ce);
>+		if (ce) {
>+			ce->tag = 0; /* recomputed on next submission after parking */
>+			intel_context_unpin(ce);
>+		}
>+
>+		stream->pinned_ctxs[i].id = INVALID_CTX_ID;
>+	}
>+
>+	stream->ctx_id_mask = 0;
>+	stream->n_pinned_ctxs = 0;
>+
>+	kfree(stream->pinned_ctxs);
>+}
>+
>+static int oa_get_render_ctx_ids(struct i915_perf_stream *stream)
> {
> 	struct intel_context *ce;
>+	int i, err = 0;
>+	u32 n_allocated_ctxs = 0;
>
>-	ce = fetch_and_zero(&stream->pinned_ctx);
>-	if (ce) {
>-		ce->tag = 0; /* recomputed on next submission after parking */
>-		intel_context_unpin(ce);
>+	stream->ctx_id_mask = get_ctx_id_mask(stream->engine);
>+
>+	for (i = 0; i < stream->n_ctxs; i++) {
>+		struct i915_gem_context *ctx = stream->ctxs[i];
>+		struct i915_gem_engines_iter it;
>+
>+		for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it) {
>+			if (ce->engine != stream->engine) /* first match! */
>+				continue;
>+
>+			/*
>+			 * As the ID is the gtt offset of the context's vma we
>+			 * pin the vma to ensure the ID remains fixed.
>+			 */
>+			err = intel_context_pin(ce);
>+			if (err) {
>+				i915_gem_context_unlock_engines(ctx);
>+				break;
>+			}
>+
>+			if (stream->n_pinned_ctxs >= n_allocated_ctxs) {
>+				u32 new_allocated_len = max(n_allocated_ctxs * 2, 2u);
>+				struct i915_perf_context_detail *new_ctxs =
>+					krealloc(stream->pinned_ctxs,
>+						 sizeof(*stream->pinned_ctxs) *
>+						 new_allocated_len,
>+						 GFP_KERNEL);
>+
>+				if (!new_ctxs) {
>+					err = -ENOMEM;
>+					break;
>+				}
>+
>+				n_allocated_ctxs = new_allocated_len;
>+				stream->pinned_ctxs = new_ctxs;
>+			}
>+
>+			stream->pinned_ctxs[stream->n_pinned_ctxs].ce = ce;
>+			stream->pinned_ctxs[stream->n_pinned_ctxs].id = get_ctx_id(ce, i);
>+
>+			drm_dbg(&stream->perf->i915->drm,
>+				"filtering on ctx_id%i=0x%x ctx_id_mask=0x%x\n",
>+				i, stream->pinned_ctxs[i].id, stream->ctx_id_mask);
>+
>+			ce->tag = stream->pinned_ctxs[stream->n_pinned_ctxs].id;
>+
>+			stream->n_pinned_ctxs++;
>+		}
>+		i915_gem_context_unlock_engines(ctx);
>+		if (err)
>+			goto err;
> 	}
>
>-	stream->specific_ctx_id = INVALID_CTX_ID;
>-	stream->specific_ctx_id_mask = 0;
>+	return 0;
>+
>+err:
>+	oa_put_render_ctx_ids(stream);
>+
>+	return err;
> }
>
> static void
>@@ -1399,8 +1463,7 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
> 	intel_uncore_forcewake_put(stream->uncore, FORCEWAKE_ALL);
> 	intel_engine_pm_put(stream->engine);
>
>-	if (stream->ctx)
>-		oa_put_render_ctx_id(stream);
>+	oa_put_render_ctx_ids(stream);
>
> 	free_oa_configs(stream);
> 	free_noa_wait(stream);
>@@ -1492,7 +1555,7 @@ static void gen8_init_oa_buffer(struct i915_perf_stream *stream)
> 	 * reports we will forward to userspace while filtering for a single
> 	 * context.
> 	 */
>-	stream->oa_buffer.last_ctx_id = INVALID_CTX_ID;
>+	stream->oa_buffer.last_ctx_match = false;
>
> 	spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);
>
>@@ -1546,7 +1609,7 @@ static void gen12_init_oa_buffer(struct i915_perf_stream *stream)
> 	 * reports we will forward to userspace while filtering for a single
> 	 * context.
> 	 */
>-	stream->oa_buffer.last_ctx_id = INVALID_CTX_ID;
>+	stream->oa_buffer.last_ctx_match = false;
>
> 	spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);
>
>@@ -2262,11 +2325,10 @@ static int gen8_configure_context(struct i915_perf_stream *stream,
> 	return err;
> }
>
>-static int gen12_configure_oar_context(struct i915_perf_stream *stream,
>-				       struct i915_active *active)
>+static int gen12_configure_oar_contexts(struct i915_perf_stream *stream,
>+					struct i915_active *active)
> {
>-	int err;
>-	struct intel_context *ce = stream->pinned_ctx;
>+	int i;
> 	u32 format = stream->oa_buffer.format;
> 	struct flex regs_context[] = {
> 		{
>@@ -2287,7 +2349,7 @@ static int gen12_configure_oar_context(struct i915_perf_stream *stream,
> 			(active ? GEN12_OAR_OACONTROL_COUNTER_ENABLE : 0)
> 		},
> 		{
>-			RING_CONTEXT_CONTROL(ce->engine->mmio_base),
>+			RING_CONTEXT_CONTROL(stream->engine->mmio_base),
> 			CTX_CONTEXT_CONTROL,
> 			_MASKED_FIELD(GEN12_CTX_CTRL_OAR_CONTEXT_ENABLE,
> 				      active ?
>@@ -2296,18 +2358,28 @@ static int gen12_configure_oar_context(struct i915_perf_stream *stream,
> 		},
> 	};
>
>-	/* Modify the context image of pinned context with regs_context*/
>-	err = intel_context_lock_pinned(ce);
>-	if (err)
>-		return err;
>+	for (i = 0; i < stream->n_pinned_ctxs; i++) {
>+		struct intel_context *ce = stream->pinned_ctxs[i].ce;
>+		int err;
>
>-	err = gen8_modify_context(stream, ce, regs_context, ARRAY_SIZE(regs_context));
>-	intel_context_unlock_pinned(ce);
>-	if (err)
>-		return err;
>+		/* Modify the context image of pinned context with regs_context*/
>+		err = intel_context_lock_pinned(ce);
>+		if (err)
>+			return err;
>+
>+		err = gen8_modify_context(stream, ce, regs_context, ARRAY_SIZE(regs_context));
>+		intel_context_unlock_pinned(ce);
>+		if (err)
>+			return err;
>+
>+		/* Apply regs_lri using LRI with pinned context */
>+		err = gen8_modify_self(ce, regs_lri, ARRAY_SIZE(regs_lri),
>+				       active);
>+		if (err)
>+			return err;
>+	}
>
>-	/* Apply regs_lri using LRI with pinned context */
>-	return gen8_modify_self(ce, regs_lri, ARRAY_SIZE(regs_lri), active);
>+	return 0;
> }
>
> /*
>@@ -2568,11 +2640,9 @@ gen12_enable_metric_set(struct i915_perf_stream *stream,
> 	 * another set of performance registers. Configure the unit dealing
> 	 * with those.
> 	 */
>-	if (stream->ctx) {
>-		ret = gen12_configure_oar_context(stream, active);
>-		if (ret)
>-			return ret;
>-	}
>+	ret = gen12_configure_oar_contexts(stream, active);
>+	if (ret)
>+		return ret;
>
> 	ret = emit_oa_config(stream, oa_config,
> 			     stream->config_context,
>@@ -2619,8 +2689,7 @@ static void gen12_disable_metric_set(struct i915_perf_stream *stream,
> 	gen12_configure_all_contexts(stream, NULL, active);
>
> 	/* disable the context save/restore or OAR counters */
>-	if (stream->ctx)
>-		gen12_configure_oar_context(stream, active);
>+	gen12_configure_oar_contexts(stream, active);
>
> 	/* Make sure we disable noa to save power. */
> 	intel_uncore_rmw(uncore, RPM_CONFIG1, GEN10_GT_NOA_ENABLE, 0);
>@@ -2629,8 +2698,7 @@ static void gen12_disable_metric_set(struct i915_perf_stream *stream,
> static void gen7_oa_enable(struct i915_perf_stream *stream)
> {
> 	struct intel_uncore *uncore = stream->uncore;
>-	struct i915_gem_context *ctx = stream->ctx;
>-	u32 ctx_id = stream->specific_ctx_id;
>+	u32 ctx_id = stream->n_pinned_ctxs ? stream->pinned_ctxs[0].id : 0;
> 	bool periodic = stream->periodic;
> 	u32 period_exponent = stream->period_exponent;
> 	u32 report_format = stream->oa_buffer.format;
>@@ -2652,7 +2720,7 @@ static void gen7_oa_enable(struct i915_perf_stream *stream)
> 			    GEN7_OACONTROL_TIMER_PERIOD_SHIFT) |
> 			   (periodic ? GEN7_OACONTROL_TIMER_ENABLE : 0) |
> 			   (report_format << GEN7_OACONTROL_FORMAT_SHIFT) |
>-			   (ctx ? GEN7_OACONTROL_PER_CTX_ENABLE : 0) |
>+			   (stream->n_ctxs ? GEN7_OACONTROL_PER_CTX_ENABLE : 0) |
> 			   GEN7_OACONTROL_ENABLE);
> }
>
>@@ -2869,7 +2937,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
> 	}
>
> 	if (!(props->sample_flags & SAMPLE_OA_REPORT) &&
>-	    (INTEL_GEN(perf->i915) < 12 || !stream->ctx)) {
>+	    (INTEL_GEN(perf->i915) < 12 || !stream->n_ctxs)) {
> 		DRM_DEBUG("Only OA report sampling supported\n");
> 		return -EINVAL;
> 	}
>@@ -2917,12 +2985,10 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
> 	if (stream->periodic)
> 		stream->period_exponent = props->oa_period_exponent;
>
>-	if (stream->ctx) {
>-		ret = oa_get_render_ctx_id(stream);
>-		if (ret) {
>-			DRM_DEBUG("Invalid context id to filter with\n");
>-			return ret;
>-		}
>+	ret = oa_get_render_ctx_ids(stream);
>+	if (ret) {
>+		DRM_DEBUG("Invalid context id to filter with\n");
>+		return ret;
> 	}
>
> 	ret = alloc_noa_wait(stream);
>@@ -3021,8 +3087,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
> 	free_noa_wait(stream);
>
> err_noa_wait_alloc:
>-	if (stream->ctx)
>-		oa_put_render_ctx_id(stream);
>+	oa_put_render_ctx_ids(stream);
>
> 	return ret;
> }
>@@ -3215,8 +3280,12 @@ static void i915_perf_enable_locked(struct i915_perf_stream *stream)
> 	if (stream->ops->enable)
> 		stream->ops->enable(stream);
>
>-	if (stream->hold_preemption)
>-		intel_context_set_nopreempt(stream->pinned_ctx);
>+	if (stream->hold_preemption) {
>+		int i;
>+
>+		for (i = 0; i < stream->n_pinned_ctxs; i++)
>+			intel_context_set_nopreempt(stream->pinned_ctxs[i].ce);
>+	}
> }
>
> /**
>@@ -3241,8 +3310,12 @@ static void i915_perf_disable_locked(struct i915_perf_stream *stream)
> 	/* Allow stream->ops->disable() to refer to this */
> 	stream->enabled = false;
>
>-	if (stream->hold_preemption)
>-		intel_context_clear_nopreempt(stream->pinned_ctx);
>+	if (stream->hold_preemption) {
>+		int i;
>+
>+		for (i = 0; i < stream->n_pinned_ctxs; i++)
>+			intel_context_clear_nopreempt(stream->pinned_ctxs[i].ce);
>+	}
>
> 	if (stream->ops->disable)
> 		stream->ops->disable(stream);
>@@ -3260,7 +3333,7 @@ static long i915_perf_config_locked(struct i915_perf_stream *stream,
> 		return -EINVAL;
>
> 	if (config != stream->oa_config) {
>-		struct intel_context *ce = stream->pinned_ctx ?: stream->config_context;
>+		int i;
>
> 		active = i915_active_create();
> 		if (!active) {
>@@ -3268,30 +3341,32 @@ static long i915_perf_config_locked(struct i915_perf_stream *stream,
> 			goto err_config;
> 		}
>
>-		/*
>-		 * If OA is bound to a specific context, emit the
>-		 * reconfiguration inline from that context. The update
>-		 * will then be ordered with respect to submission on that
>-		 * context.
>-		 *
>-		 * When set globally, we use a low priority kernel context,
>-		 * so it will effectively take effect when idle.
>-		 */
>-		ret = emit_oa_config(stream, config, ce,
>+		ret = emit_oa_config(stream, config,
>+				     stream->config_context,
> 				     active, false /* global */);
> 		if (ret)
> 			goto err_active;
>
>-		ret = emit_oa_config(stream, config, ce,
>+		ret = emit_oa_config(stream, config,
>+				     stream->config_context,
> 				     active, true /* global */);

I see emit_oa_config is called twice in all places (except the for loop 
below). Maybe all those places, these 2 calls can be wrapped into one 
function.

Thanks,
Umesh

> 		if (ret)
> 			goto err_active;
>
>+		for (i = 0; i < stream->n_pinned_ctxs; i++) {
>+			ret = emit_oa_config(stream, config,
>+					     stream->pinned_ctxs[i].ce,
>+					     active, false /* global */);
>+			if (ret)
>+				goto err_active;
>+		}
>+
> 		config = xchg(&stream->oa_config, config);
> 	}
>
> err_active:
>-	i915_active_put(active);
>+	if (active)
>+		i915_active_put(active);
> err_config:
> 	i915_oa_config_put(config);
>
>@@ -3372,9 +3447,10 @@ static void i915_perf_destroy_locked(struct i915_perf_stream *stream)
> 	if (stream->ops->destroy)
> 		stream->ops->destroy(stream);
>
>-	if (stream->ctx)
>-		i915_gem_context_put(stream->ctx);
>+	while (stream->n_ctxs--)
>+		i915_gem_context_put(stream->ctxs[stream->n_ctxs]);
>
>+	kfree(stream->ctxs);
> 	kfree(stream);
> }
>
>@@ -3449,25 +3525,12 @@ i915_perf_open_ioctl_locked(struct i915_perf *perf,
> 			    struct perf_open_properties *props,
> 			    struct drm_file *file)
> {
>-	struct i915_gem_context *specific_ctx = NULL;
>+	struct drm_i915_file_private *file_priv = file->driver_priv;
> 	struct i915_perf_stream *stream = NULL;
> 	unsigned long f_flags = 0;
> 	bool privileged_op = true;
> 	int stream_fd;
>-	int ret;
>-
>-	if (props->single_context) {
>-		u32 ctx_handle = props->ctx_handle;
>-		struct drm_i915_file_private *file_priv = file->driver_priv;
>-
>-		specific_ctx = i915_gem_context_lookup(file_priv, ctx_handle);
>-		if (!specific_ctx) {
>-			DRM_DEBUG("Failed to look up context with ID %u for opening perf stream\n",
>-				  ctx_handle);
>-			ret = -ENOENT;
>-			goto err;
>-		}
>-	}
>+	int i, ret;
>
> 	/*
> 	 * On Haswell the OA unit supports clock gating off for a specific
>@@ -3488,17 +3551,16 @@ i915_perf_open_ioctl_locked(struct i915_perf *perf,
> 	 * doesn't request global stream access (i.e. query based sampling
> 	 * using MI_RECORD_PERF_COUNT.
> 	 */
>-	if (IS_HASWELL(perf->i915) && specific_ctx)
>+	if (IS_HASWELL(perf->i915) && props->n_ctx_handles > 0)
> 		privileged_op = false;
>-	else if (IS_GEN(perf->i915, 12) && specific_ctx &&
>+	else if (IS_GEN(perf->i915, 12) && (props->n_ctx_handles > 0) &&
> 		 (props->sample_flags & SAMPLE_OA_REPORT) == 0)
> 		privileged_op = false;
>
> 	if (props->hold_preemption) {
>-		if (!props->single_context) {
>+		if (!props->n_ctx_handles) {
> 			DRM_DEBUG("preemption disable with no context\n");
>-			ret = -EINVAL;
>-			goto err;
>+			return -EINVAL;
> 		}
> 		privileged_op = true;
> 	}
>@@ -3519,23 +3581,43 @@ i915_perf_open_ioctl_locked(struct i915_perf *perf,
> 	if (privileged_op &&
> 	    i915_perf_stream_paranoid && !capable(CAP_SYS_ADMIN)) {
> 		DRM_DEBUG("Insufficient privileges to open i915 perf stream\n");
>-		ret = -EACCES;
>-		goto err_ctx;
>+		return -EACCES;
> 	}
>
> 	stream = kzalloc(sizeof(*stream), GFP_KERNEL);
>-	if (!stream) {
>-		ret = -ENOMEM;
>-		goto err_ctx;
>+	if (!stream)
>+		return -ENOMEM;
>+
>+	if (props->n_ctx_handles) {
>+		gfp_t alloc_flags = GFP_KERNEL | __GFP_ZERO;
>+
>+		stream->ctxs = kmalloc_array(props->n_ctx_handles,
>+					     sizeof(*stream->ctxs),
>+					     alloc_flags);
>+		if (!stream->ctxs)
>+			goto err_ctx;
> 	}
>
> 	stream->perf = perf;
>-	stream->ctx = specific_ctx;
> 	stream->poll_oa_period = props->poll_oa_period;
>
>+	for (i = 0; i < props->n_ctx_handles; i++) {
>+		stream->ctxs[i] = i915_gem_context_lookup(file_priv,
>+							  props->ctx_handles[i]);
>+		if (!stream->ctxs[i]) {
>+			DRM_DEBUG("Failed to look up context with ID %u for opening perf stream\n",
>+				  props->ctx_handles[i]);
>+
>+			ret = -ENOENT;
>+			goto err_ctx;
>+		}
>+
>+		stream->n_ctxs++;
>+	}
>+
> 	ret = i915_oa_stream_init(stream, param, props);
> 	if (ret)
>-		goto err_alloc;
>+		goto err_ctx;
>
> 	/* we avoid simply assigning stream->sample_flags = props->sample_flags
> 	 * to have _stream_init check the combination of sample flags more
>@@ -3570,12 +3652,11 @@ i915_perf_open_ioctl_locked(struct i915_perf *perf,
> err_flags:
> 	if (stream->ops->destroy)
> 		stream->ops->destroy(stream);
>-err_alloc:
>-	kfree(stream);
> err_ctx:
>-	if (specific_ctx)
>-		i915_gem_context_put(specific_ctx);
>-err:
>+	while (stream->n_ctxs--)
>+		i915_gem_context_put(stream->ctxs[stream->n_ctxs]);
>+	kfree(stream->ctxs);
>+	kfree(stream);
> 	return ret;
> }
>
>@@ -3607,7 +3688,7 @@ static int read_properties_unlocked(struct i915_perf *perf,
> {
> 	u64 __user *uprop = uprops;
> 	u32 i;
>-	int ret;
>+	int err;
>
> 	memset(props, 0, sizeof(struct perf_open_properties));
> 	props->poll_oa_period = DEFAULT_POLL_PERIOD_NS;
>@@ -3641,23 +3722,36 @@ static int read_properties_unlocked(struct i915_perf *perf,
> 		u64 oa_period, oa_freq_hz;
> 		u64 id, value;
>
>-		ret = get_user(id, uprop);
>-		if (ret)
>-			return ret;
>+		err = get_user(id, uprop);
>+		if (err)
>+			goto error;
>
>-		ret = get_user(value, uprop + 1);
>-		if (ret)
>-			return ret;
>+		err = get_user(value, uprop + 1);
>+		if (err)
>+			goto error;
>
> 		if (id == 0 || id >= DRM_I915_PERF_PROP_MAX) {
> 			DRM_DEBUG("Unknown i915 perf property ID\n");
>-			return -EINVAL;
>+			err = -EINVAL;
>+			goto error;
> 		}
>
> 		switch ((enum drm_i915_perf_property_id)id) {
> 		case DRM_I915_PERF_PROP_CTX_HANDLE:
>-			props->single_context = 1;
>-			props->ctx_handle = value;
>+			if (props->n_ctx_handles > 0) {
>+				DRM_DEBUG("Context handle specified multiple times\n");
>+				err = -EINVAL;
>+				goto error;
>+			}
>+			props->ctx_handles =
>+				kmalloc_array(1, sizeof(*props->ctx_handles),
>+					      GFP_KERNEL);
>+			if (!props->ctx_handles) {
>+				err = -ENOMEM;
>+				goto error;
>+			}
>+			props->ctx_handles[0] = value;
>+			props->n_ctx_handles = 1;
> 			break;
> 		case DRM_I915_PERF_PROP_SAMPLE_OA:
> 			if (value)
>@@ -3666,7 +3760,8 @@ static int read_properties_unlocked(struct i915_perf *perf,
> 		case DRM_I915_PERF_PROP_OA_METRICS_SET:
> 			if (value == 0) {
> 				DRM_DEBUG("Unknown OA metric set ID\n");
>-				return -EINVAL;
>+				err = -EINVAL;
>+				goto error;
> 			}
> 			props->metrics_set = value;
> 			break;
>@@ -3674,12 +3769,14 @@ static int read_properties_unlocked(struct i915_perf *perf,
> 			if (value == 0 || value >= I915_OA_FORMAT_MAX) {
> 				DRM_DEBUG("Out-of-range OA report format %llu\n",
> 					  value);
>-				return -EINVAL;
>+				err = -EINVAL;
>+				goto error;
> 			}
> 			if (!perf->oa_formats[value].size) {
> 				DRM_DEBUG("Unsupported OA report format %llu\n",
> 					  value);
>-				return -EINVAL;
>+				err = -EINVAL;
>+				goto error;
> 			}
> 			props->oa_format = value;
> 			break;
>@@ -3687,7 +3784,8 @@ static int read_properties_unlocked(struct i915_perf *perf,
> 			if (value > OA_EXPONENT_MAX) {
> 				DRM_DEBUG("OA timer exponent too high (> %u)\n",
> 					 OA_EXPONENT_MAX);
>-				return -EINVAL;
>+				err = -EINVAL;
>+				goto error;
> 			}
>
> 			/* Theoretically we can program the OA unit to sample
>@@ -3716,7 +3814,8 @@ static int read_properties_unlocked(struct i915_perf *perf,
> 			    !capable(CAP_SYS_ADMIN)) {
> 				DRM_DEBUG("OA exponent would exceed the max sampling frequency (sysctl dev.i915.oa_max_sample_rate) %uHz without root privileges\n",
> 					  i915_oa_max_sample_rate);
>-				return -EACCES;
>+				err = -EACCES;
>+				goto error;
> 			}
>
> 			props->oa_periodic = true;
>@@ -3732,13 +3831,14 @@ static int read_properties_unlocked(struct i915_perf *perf,
> 					   u64_to_user_ptr(value),
> 					   sizeof(user_sseu))) {
> 				DRM_DEBUG("Unable to copy global sseu parameter\n");
>-				return -EFAULT;
>+				err = -EFAULT;
>+				goto error;
> 			}
>
>-			ret = get_sseu_config(&props->sseu, props->engine, &user_sseu);
>-			if (ret) {
>+			err = get_sseu_config(&props->sseu, props->engine, &user_sseu);
>+			if (err) {
> 				DRM_DEBUG("Invalid SSEU configuration\n");
>-				return ret;
>+				goto error;
> 			}
> 			props->has_sseu = true;
> 			break;
>@@ -3747,19 +3847,25 @@ static int read_properties_unlocked(struct i915_perf *perf,
> 			if (value < 100000 /* 100us */) {
> 				DRM_DEBUG("OA availability timer too small (%lluns < 100us)\n",
> 					  value);
>-				return -EINVAL;
>+				err = -EINVAL;
>+				goto error;
> 			}
> 			props->poll_oa_period = value;
> 			break;
> 		case DRM_I915_PERF_PROP_MAX:
> 			MISSING_CASE(id);
>-			return -EINVAL;
>+			err = -EINVAL;
>+			goto error;
> 		}
>
> 		uprop += 2;
> 	}
>
> 	return 0;
>+
>+error:
>+	kfree(props->ctx_handles);
>+	return err;
> }
>
> /**
>@@ -3819,6 +3925,8 @@ int i915_perf_open_ioctl(struct drm_device *dev, void *data,
> 	ret = i915_perf_open_ioctl_locked(perf, param, &props, file);
> 	mutex_unlock(&perf->lock);
>
>+	kfree(props.ctx_handles);
>+
> 	return ret;
> }
>
>diff --git a/drivers/gpu/drm/i915/i915_perf_types.h b/drivers/gpu/drm/i915/i915_perf_types.h
>index a8b903592a39..278defe0b456 100644
>--- a/drivers/gpu/drm/i915/i915_perf_types.h
>+++ b/drivers/gpu/drm/i915/i915_perf_types.h
>@@ -161,10 +161,15 @@ struct i915_perf_stream {
> 	int sample_size;
>
> 	/**
>-	 * @ctx: %NULL if measuring system-wide across all contexts or a
>-	 * specific context that is being monitored.
>+	 * @n_ctxs: Number of contexts pinned for the recording.
> 	 */
>-	struct i915_gem_context *ctx;
>+	u32 n_ctxs;
>+
>+	/**
>+	 * @ctxs: All to %NULL if measuring system-wide across all contexts or
>+	 * a list specific contexts that are being monitored.
>+	 */
>+	struct i915_gem_context **ctxs;
>
> 	/**
> 	 * @enabled: Whether the stream is currently enabled, considering
>@@ -199,19 +204,31 @@ struct i915_perf_stream {
> 	struct llist_head oa_config_bos;
>
> 	/**
>-	 * @pinned_ctx: The OA context specific information.
>+	 * @pinned_ctxs: A array of logical context details needed for
>+	 * filtering and their associated pinned ID.
> 	 */
>-	struct intel_context *pinned_ctx;
>+	struct i915_perf_context_detail {
>+		/**
>+		 * @ce: The OA context specific information.
>+		 */
>+		struct intel_context *ce;
>+
>+		/**
>+		 * @id: The ids of the specific contexts.
>+		 */
>+		u32 id;
>+	} *pinned_ctxs;
>
> 	/**
>-	 * @specific_ctx_id: The id of the specific context.
>+	 * @n_pinned_ctxs: Length of the @pinned_ctxs array, 0 if measuring
>+	 * system-wide across all contexts.
> 	 */
>-	u32 specific_ctx_id;
>+	u32 n_pinned_ctxs;
>
> 	/**
>-	 * @specific_ctx_id_mask: The mask used to masking specific_ctx_id bits.
>+	 * @ctx_id_mask: The mask used to masking specific_ctx_id bits.
> 	 */
>-	u32 specific_ctx_id_mask;
>+	u32 ctx_id_mask;
>
> 	/**
> 	 * @poll_check_timer: High resolution timer that will periodically
>@@ -247,7 +264,7 @@ struct i915_perf_stream {
> 	struct {
> 		struct i915_vma *vma;
> 		u8 *vaddr;
>-		u32 last_ctx_id;
>+		bool last_ctx_match;
> 		int format;
> 		int format_size;
> 		int size_exponent;
>-- 
>2.26.0
>
>_______________________________________________
>Intel-gfx mailing list
>Intel-gfx@lists.freedesktop.org
>https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Lionel Landwerlin April 14, 2020, 9:50 a.m. UTC | #2
On 14/04/2020 02:34, Umesh Nerlige Ramappa wrote:
>> -         * When set globally, we use a low priority kernel context,
>> -         * so it will effectively take effect when idle.
>> -         */
>> -        ret = emit_oa_config(stream, config, ce,
>> +        ret = emit_oa_config(stream, config,
>> +                     stream->config_context,
>>                      active, false /* global */);
>>         if (ret)
>>             goto err_active;
>>
>> -        ret = emit_oa_config(stream, config, ce,
>> +        ret = emit_oa_config(stream, config,
>> +                     stream->config_context,
>>                      active, true /* global */);
>
> I see emit_oa_config is called twice in all places (except the for 
> loop below). Maybe all those places, these 2 calls can be wrapped into 
> one function.
>
> Thanks,
> Umesh 

Good point, I'll add a helper.


Thanks,


-Lionel
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index db526e0c160a..543d29cd5c14 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -192,6 +192,7 @@ 
  */
 
 #include <linux/anon_inodes.h>
+#include <linux/bsearch.h>
 #include <linux/sizes.h>
 #include <linux/uuid.h>
 
@@ -329,7 +330,8 @@  static const struct i915_oa_format gen12_oa_formats[I915_OA_FORMAT_MAX] = {
  * @single_context: Whether a single or all gpu contexts should be monitored
  * @hold_preemption: Whether the preemption is disabled for the filtered
  *                   context
- * @ctx_handle: A gem ctx handle for use with @single_context
+ * @n_ctx_handles: Length of @ctx_handles
+ * @ctx_handles: An array of gem context handles
  * @metrics_set: An ID for an OA unit metric set advertised via sysfs
  * @oa_format: An OA unit HW report format
  * @oa_periodic: Whether to enable periodic OA unit sampling
@@ -349,9 +351,10 @@  static const struct i915_oa_format gen12_oa_formats[I915_OA_FORMAT_MAX] = {
 struct perf_open_properties {
 	u32 sample_flags;
 
-	u64 single_context:1;
 	u64 hold_preemption:1;
-	u64 ctx_handle;
+
+	u32 n_ctx_handles;
+	u32 *ctx_handles;
 
 	/* OA sampling state */
 	int metrics_set;
@@ -625,6 +628,23 @@  static int append_oa_sample(struct i915_perf_stream *stream,
 	return 0;
 }
 
+static int ctx_id_equal(const void *key, const void *elem)
+{
+	const struct i915_perf_context_detail *details = elem;
+
+	return ((int)details->id) - *((int *)key);
+}
+
+static inline bool ctx_id_match(struct i915_perf_stream *stream,
+				u32 masked_ctx_id)
+{
+	return bsearch(&masked_ctx_id,
+		       stream->pinned_ctxs,
+		       stream->n_pinned_ctxs,
+		       sizeof(*stream->pinned_ctxs),
+		       ctx_id_equal) != NULL;
+}
+
 /**
  * Copies all buffered OA reports into userspace read() buffer.
  * @stream: An i915-perf stream opened for OA metrics
@@ -736,7 +756,7 @@  static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 			continue;
 		}
 
-		ctx_id = report32[2] & stream->specific_ctx_id_mask;
+		ctx_id = report32[2] & stream->ctx_id_mask;
 
 		/*
 		 * Squash whatever is in the CTX_ID field if it's marked as
@@ -781,26 +801,32 @@  static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 		 * switches since it's not-uncommon for periodic samples to
 		 * identify a switch before any 'context switch' report.
 		 */
-		if (!stream->perf->exclusive_stream->ctx ||
-		    stream->specific_ctx_id == ctx_id ||
-		    stream->oa_buffer.last_ctx_id == stream->specific_ctx_id ||
-		    reason & OAREPORT_REASON_CTX_SWITCH) {
-
-			/*
-			 * While filtering for a single context we avoid
-			 * leaking the IDs of other contexts.
-			 */
-			if (stream->perf->exclusive_stream->ctx &&
-			    stream->specific_ctx_id != ctx_id) {
-				report32[2] = INVALID_CTX_ID;
-			}
-
+		if (!stream->perf->exclusive_stream->n_ctxs) {
 			ret = append_oa_sample(stream, buf, count, offset,
 					       report);
 			if (ret)
 				break;
+		} else {
+			bool ctx_match = ctx_id != INVALID_CTX_ID &&
+				ctx_id_match(stream, ctx_id);
+
+			if (ctx_match ||
+			    stream->oa_buffer.last_ctx_match ||
+			    reason & OAREPORT_REASON_CTX_SWITCH) {
+				/*
+				 * While filtering for a single context we avoid
+				 * leaking the IDs of other contexts.
+				 */
+				if (!ctx_match)
+					report32[2] = INVALID_CTX_ID;
+
+				ret = append_oa_sample(stream, buf, count, offset,
+						       report);
+				if (ret)
+					break;
+			}
 
-			stream->oa_buffer.last_ctx_id = ctx_id;
+			stream->oa_buffer.last_ctx_match = ctx_match;
 		}
 
 		/*
@@ -1191,138 +1217,176 @@  static int i915_oa_read(struct i915_perf_stream *stream,
 	return stream->perf->ops.read(stream, buf, count, offset);
 }
 
-static struct intel_context *oa_pin_context(struct i915_perf_stream *stream)
+static u32 get_ctx_id_mask(struct intel_engine_cs *engine)
 {
-	struct i915_gem_engines_iter it;
-	struct i915_gem_context *ctx = stream->ctx;
-	struct intel_context *ce;
-	int err;
+	switch (INTEL_GEN(engine->i915)) {
+	case 7:
+		/*
+		 * On Haswell we don't do any post processing of the reports
+		 * and don't need to use the mask.
+		 */
+		return 0;
 
-	for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it) {
-		if (ce->engine != stream->engine) /* first match! */
-			continue;
+	case 8:
+	case 9:
+	case 10:
+		if (intel_engine_in_execlists_submission_mode(engine))
+			return (1U << GEN8_CTX_ID_WIDTH) - 1;
 
 		/*
-		 * As the ID is the gtt offset of the context's vma we
-		 * pin the vma to ensure the ID remains fixed.
+		 * GuC uses the top bit to signal proxy submission, so ignore
+		 * that bit.
 		 */
-		err = intel_context_pin(ce);
-		if (err == 0) {
-			stream->pinned_ctx = ce;
-			break;
-		}
-	}
-	i915_gem_context_unlock_engines(ctx);
+		return (1U << (GEN8_CTX_ID_WIDTH - 1)) - 1;
 
-	return stream->pinned_ctx;
+	case 11:
+	case 12:
+		/*
+		 * 0x7ff is used by idle context.
+		 */
+		BUILD_BUG_ON((GEN12_MAX_CONTEXT_HW_ID - 1) < NUM_CONTEXT_TAG);
+		return ((1U << GEN11_SW_CTX_ID_WIDTH) - 1) << (GEN11_SW_CTX_ID_SHIFT - 32);
+
+	default:
+		MISSING_CASE(INTEL_GEN(engine->i915));
+		return 0;
+	}
 }
 
-/**
- * oa_get_render_ctx_id - determine and hold ctx hw id
- * @stream: An i915-perf stream opened for OA metrics
- *
- * Determine the render context hw id, and ensure it remains fixed for the
- * lifetime of the stream. This ensures that we don't have to worry about
- * updating the context ID in OACONTROL on the fly.
- *
- * Returns: zero on success or a negative error code
- */
-static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
+static u32 get_ctx_id(struct intel_context *ce, int idx)
 {
-	struct intel_context *ce;
-
-	ce = oa_pin_context(stream);
-	if (IS_ERR(ce))
-		return PTR_ERR(ce);
 
 	switch (INTEL_GEN(ce->engine->i915)) {
-	case 7: {
-		/*
-		 * On Haswell we don't do any post processing of the reports
-		 * and don't need to use the mask.
-		 */
-		stream->specific_ctx_id = i915_ggtt_offset(ce->state);
-		stream->specific_ctx_id_mask = 0;
-		break;
-	}
+	case 7:
+		return i915_ggtt_offset(ce->state);
 
 	case 8:
 	case 9:
 	case 10:
-		if (intel_engine_in_execlists_submission_mode(ce->engine)) {
-			stream->specific_ctx_id_mask =
-				(1U << GEN8_CTX_ID_WIDTH) - 1;
-			stream->specific_ctx_id = stream->specific_ctx_id_mask;
-		} else {
-			/*
-			 * When using GuC, the context descriptor we write in
-			 * i915 is read by GuC and rewritten before it's
-			 * actually written into the hardware. The LRCA is
-			 * what is put into the context id field of the
-			 * context descriptor by GuC. Because it's aligned to
-			 * a page, the lower 12bits are always at 0 and
-			 * dropped by GuC. They won't be part of the context
-			 * ID in the OA reports, so squash those lower bits.
-			 */
-			stream->specific_ctx_id =
-				lower_32_bits(ce->lrc_desc) >> 12;
+		if (intel_engine_in_execlists_submission_mode(ce->engine))
+			return (1U << GEN8_CTX_ID_WIDTH) - 1 - idx;
 
-			/*
-			 * GuC uses the top bit to signal proxy submission, so
-			 * ignore that bit.
-			 */
-			stream->specific_ctx_id_mask =
-				(1U << (GEN8_CTX_ID_WIDTH - 1)) - 1;
-		}
-		break;
+		/*
+		 * When using GuC, the context descriptor we write in i915 is
+		 * read by GuC and rewritten before it's actually written into
+		 * the hardware. The LRCA is what is put into the context id
+		 * field of the context descriptor by GuC. Because it's
+		 * aligned to a page, the lower 12bits are always at 0 and
+		 * dropped by GuC. They won't be part of the context ID in the
+		 * OA reports, so squash those lower bits.
+		 */
+		return lower_32_bits(ce->lrc_desc) >> 12;
 
 	case 11:
-	case 12: {
-		stream->specific_ctx_id_mask =
-			((1U << GEN11_SW_CTX_ID_WIDTH) - 1) << (GEN11_SW_CTX_ID_SHIFT - 32);
+	case 12:
 		/*
-		 * Pick an unused context id
-		 * 0 - (NUM_CONTEXT_TAG - 1) are used by other contexts
-		 * GEN12_MAX_CONTEXT_HW_ID (0x7ff) is used by idle context
+		 * Pick an unused context id 0 -
+		 * (NUM_CONTEXT_TAG - 1) are used by other
+		 * contexts GEN12_MAX_CONTEXT_HW_ID (0x7ff) is
+		 * used by idle context
 		 */
-		stream->specific_ctx_id = (GEN12_MAX_CONTEXT_HW_ID - 1) << (GEN11_SW_CTX_ID_SHIFT - 32);
-		BUILD_BUG_ON((GEN12_MAX_CONTEXT_HW_ID - 1) < NUM_CONTEXT_TAG);
-		break;
-	}
+		return ((GEN12_MAX_CONTEXT_HW_ID - 1) - idx) << (GEN11_SW_CTX_ID_SHIFT - 32);
 
 	default:
 		MISSING_CASE(INTEL_GEN(ce->engine->i915));
+		return 0;
 	}
-
-	ce->tag = stream->specific_ctx_id;
-
-	drm_dbg(&stream->perf->i915->drm,
-		"filtering on ctx_id=0x%x ctx_id_mask=0x%x\n",
-		stream->specific_ctx_id,
-		stream->specific_ctx_id_mask);
-
-	return 0;
 }
 
 /**
- * oa_put_render_ctx_id - counterpart to oa_get_render_ctx_id releases hold
+ * oa_put_render_ctx_id - counterpart to oa_get_render_ctx_ids releases hold
  * @stream: An i915-perf stream opened for OA metrics
  *
  * In case anything needed doing to ensure the context HW ID would remain valid
  * for the lifetime of the stream, then that can be undone here.
  */
-static void oa_put_render_ctx_id(struct i915_perf_stream *stream)
+static void oa_put_render_ctx_ids(struct i915_perf_stream *stream)
+{
+	int i;
+
+	for (i = 0; i < stream->n_pinned_ctxs; i++) {
+		struct intel_context *ce;
+
+		ce = fetch_and_zero(&stream->pinned_ctxs[i].ce);
+		if (ce) {
+			ce->tag = 0; /* recomputed on next submission after parking */
+			intel_context_unpin(ce);
+		}
+
+		stream->pinned_ctxs[i].id = INVALID_CTX_ID;
+	}
+
+	stream->ctx_id_mask = 0;
+	stream->n_pinned_ctxs = 0;
+
+	kfree(stream->pinned_ctxs);
+}
+
+static int oa_get_render_ctx_ids(struct i915_perf_stream *stream)
 {
 	struct intel_context *ce;
+	int i, err = 0;
+	u32 n_allocated_ctxs = 0;
 
-	ce = fetch_and_zero(&stream->pinned_ctx);
-	if (ce) {
-		ce->tag = 0; /* recomputed on next submission after parking */
-		intel_context_unpin(ce);
+	stream->ctx_id_mask = get_ctx_id_mask(stream->engine);
+
+	for (i = 0; i < stream->n_ctxs; i++) {
+		struct i915_gem_context *ctx = stream->ctxs[i];
+		struct i915_gem_engines_iter it;
+
+		for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it) {
+			if (ce->engine != stream->engine) /* first match! */
+				continue;
+
+			/*
+			 * As the ID is the gtt offset of the context's vma we
+			 * pin the vma to ensure the ID remains fixed.
+			 */
+			err = intel_context_pin(ce);
+			if (err) {
+				i915_gem_context_unlock_engines(ctx);
+				break;
+			}
+
+			if (stream->n_pinned_ctxs >= n_allocated_ctxs) {
+				u32 new_allocated_len = max(n_allocated_ctxs * 2, 2u);
+				struct i915_perf_context_detail *new_ctxs =
+					krealloc(stream->pinned_ctxs,
+						 sizeof(*stream->pinned_ctxs) *
+						 new_allocated_len,
+						 GFP_KERNEL);
+
+				if (!new_ctxs) {
+					err = -ENOMEM;
+					break;
+				}
+
+				n_allocated_ctxs = new_allocated_len;
+				stream->pinned_ctxs = new_ctxs;
+			}
+
+			stream->pinned_ctxs[stream->n_pinned_ctxs].ce = ce;
+			stream->pinned_ctxs[stream->n_pinned_ctxs].id = get_ctx_id(ce, i);
+
+			drm_dbg(&stream->perf->i915->drm,
+				"filtering on ctx_id%i=0x%x ctx_id_mask=0x%x\n",
+				i, stream->pinned_ctxs[i].id, stream->ctx_id_mask);
+
+			ce->tag = stream->pinned_ctxs[stream->n_pinned_ctxs].id;
+
+			stream->n_pinned_ctxs++;
+		}
+		i915_gem_context_unlock_engines(ctx);
+		if (err)
+			goto err;
 	}
 
-	stream->specific_ctx_id = INVALID_CTX_ID;
-	stream->specific_ctx_id_mask = 0;
+	return 0;
+
+err:
+	oa_put_render_ctx_ids(stream);
+
+	return err;
 }
 
 static void
@@ -1399,8 +1463,7 @@  static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
 	intel_uncore_forcewake_put(stream->uncore, FORCEWAKE_ALL);
 	intel_engine_pm_put(stream->engine);
 
-	if (stream->ctx)
-		oa_put_render_ctx_id(stream);
+	oa_put_render_ctx_ids(stream);
 
 	free_oa_configs(stream);
 	free_noa_wait(stream);
@@ -1492,7 +1555,7 @@  static void gen8_init_oa_buffer(struct i915_perf_stream *stream)
 	 * reports we will forward to userspace while filtering for a single
 	 * context.
 	 */
-	stream->oa_buffer.last_ctx_id = INVALID_CTX_ID;
+	stream->oa_buffer.last_ctx_match = false;
 
 	spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);
 
@@ -1546,7 +1609,7 @@  static void gen12_init_oa_buffer(struct i915_perf_stream *stream)
 	 * reports we will forward to userspace while filtering for a single
 	 * context.
 	 */
-	stream->oa_buffer.last_ctx_id = INVALID_CTX_ID;
+	stream->oa_buffer.last_ctx_match = false;
 
 	spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);
 
@@ -2262,11 +2325,10 @@  static int gen8_configure_context(struct i915_perf_stream *stream,
 	return err;
 }
 
-static int gen12_configure_oar_context(struct i915_perf_stream *stream,
-				       struct i915_active *active)
+static int gen12_configure_oar_contexts(struct i915_perf_stream *stream,
+					struct i915_active *active)
 {
-	int err;
-	struct intel_context *ce = stream->pinned_ctx;
+	int i;
 	u32 format = stream->oa_buffer.format;
 	struct flex regs_context[] = {
 		{
@@ -2287,7 +2349,7 @@  static int gen12_configure_oar_context(struct i915_perf_stream *stream,
 			(active ? GEN12_OAR_OACONTROL_COUNTER_ENABLE : 0)
 		},
 		{
-			RING_CONTEXT_CONTROL(ce->engine->mmio_base),
+			RING_CONTEXT_CONTROL(stream->engine->mmio_base),
 			CTX_CONTEXT_CONTROL,
 			_MASKED_FIELD(GEN12_CTX_CTRL_OAR_CONTEXT_ENABLE,
 				      active ?
@@ -2296,18 +2358,28 @@  static int gen12_configure_oar_context(struct i915_perf_stream *stream,
 		},
 	};
 
-	/* Modify the context image of pinned context with regs_context*/
-	err = intel_context_lock_pinned(ce);
-	if (err)
-		return err;
+	for (i = 0; i < stream->n_pinned_ctxs; i++) {
+		struct intel_context *ce = stream->pinned_ctxs[i].ce;
+		int err;
 
-	err = gen8_modify_context(stream, ce, regs_context, ARRAY_SIZE(regs_context));
-	intel_context_unlock_pinned(ce);
-	if (err)
-		return err;
+		/* Modify the context image of pinned context with regs_context*/
+		err = intel_context_lock_pinned(ce);
+		if (err)
+			return err;
+
+		err = gen8_modify_context(stream, ce, regs_context, ARRAY_SIZE(regs_context));
+		intel_context_unlock_pinned(ce);
+		if (err)
+			return err;
+
+		/* Apply regs_lri using LRI with pinned context */
+		err = gen8_modify_self(ce, regs_lri, ARRAY_SIZE(regs_lri),
+				       active);
+		if (err)
+			return err;
+	}
 
-	/* Apply regs_lri using LRI with pinned context */
-	return gen8_modify_self(ce, regs_lri, ARRAY_SIZE(regs_lri), active);
+	return 0;
 }
 
 /*
@@ -2568,11 +2640,9 @@  gen12_enable_metric_set(struct i915_perf_stream *stream,
 	 * another set of performance registers. Configure the unit dealing
 	 * with those.
 	 */
-	if (stream->ctx) {
-		ret = gen12_configure_oar_context(stream, active);
-		if (ret)
-			return ret;
-	}
+	ret = gen12_configure_oar_contexts(stream, active);
+	if (ret)
+		return ret;
 
 	ret = emit_oa_config(stream, oa_config,
 			     stream->config_context,
@@ -2619,8 +2689,7 @@  static void gen12_disable_metric_set(struct i915_perf_stream *stream,
 	gen12_configure_all_contexts(stream, NULL, active);
 
 	/* disable the context save/restore or OAR counters */
-	if (stream->ctx)
-		gen12_configure_oar_context(stream, active);
+	gen12_configure_oar_contexts(stream, active);
 
 	/* Make sure we disable noa to save power. */
 	intel_uncore_rmw(uncore, RPM_CONFIG1, GEN10_GT_NOA_ENABLE, 0);
@@ -2629,8 +2698,7 @@  static void gen12_disable_metric_set(struct i915_perf_stream *stream,
 static void gen7_oa_enable(struct i915_perf_stream *stream)
 {
 	struct intel_uncore *uncore = stream->uncore;
-	struct i915_gem_context *ctx = stream->ctx;
-	u32 ctx_id = stream->specific_ctx_id;
+	u32 ctx_id = stream->n_pinned_ctxs ? stream->pinned_ctxs[0].id : 0;
 	bool periodic = stream->periodic;
 	u32 period_exponent = stream->period_exponent;
 	u32 report_format = stream->oa_buffer.format;
@@ -2652,7 +2720,7 @@  static void gen7_oa_enable(struct i915_perf_stream *stream)
 			    GEN7_OACONTROL_TIMER_PERIOD_SHIFT) |
 			   (periodic ? GEN7_OACONTROL_TIMER_ENABLE : 0) |
 			   (report_format << GEN7_OACONTROL_FORMAT_SHIFT) |
-			   (ctx ? GEN7_OACONTROL_PER_CTX_ENABLE : 0) |
+			   (stream->n_ctxs ? GEN7_OACONTROL_PER_CTX_ENABLE : 0) |
 			   GEN7_OACONTROL_ENABLE);
 }
 
@@ -2869,7 +2937,7 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 	}
 
 	if (!(props->sample_flags & SAMPLE_OA_REPORT) &&
-	    (INTEL_GEN(perf->i915) < 12 || !stream->ctx)) {
+	    (INTEL_GEN(perf->i915) < 12 || !stream->n_ctxs)) {
 		DRM_DEBUG("Only OA report sampling supported\n");
 		return -EINVAL;
 	}
@@ -2917,12 +2985,10 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 	if (stream->periodic)
 		stream->period_exponent = props->oa_period_exponent;
 
-	if (stream->ctx) {
-		ret = oa_get_render_ctx_id(stream);
-		if (ret) {
-			DRM_DEBUG("Invalid context id to filter with\n");
-			return ret;
-		}
+	ret = oa_get_render_ctx_ids(stream);
+	if (ret) {
+		DRM_DEBUG("Invalid context id to filter with\n");
+		return ret;
 	}
 
 	ret = alloc_noa_wait(stream);
@@ -3021,8 +3087,7 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 	free_noa_wait(stream);
 
 err_noa_wait_alloc:
-	if (stream->ctx)
-		oa_put_render_ctx_id(stream);
+	oa_put_render_ctx_ids(stream);
 
 	return ret;
 }
@@ -3215,8 +3280,12 @@  static void i915_perf_enable_locked(struct i915_perf_stream *stream)
 	if (stream->ops->enable)
 		stream->ops->enable(stream);
 
-	if (stream->hold_preemption)
-		intel_context_set_nopreempt(stream->pinned_ctx);
+	if (stream->hold_preemption) {
+		int i;
+
+		for (i = 0; i < stream->n_pinned_ctxs; i++)
+			intel_context_set_nopreempt(stream->pinned_ctxs[i].ce);
+	}
 }
 
 /**
@@ -3241,8 +3310,12 @@  static void i915_perf_disable_locked(struct i915_perf_stream *stream)
 	/* Allow stream->ops->disable() to refer to this */
 	stream->enabled = false;
 
-	if (stream->hold_preemption)
-		intel_context_clear_nopreempt(stream->pinned_ctx);
+	if (stream->hold_preemption) {
+		int i;
+
+		for (i = 0; i < stream->n_pinned_ctxs; i++)
+			intel_context_clear_nopreempt(stream->pinned_ctxs[i].ce);
+	}
 
 	if (stream->ops->disable)
 		stream->ops->disable(stream);
@@ -3260,7 +3333,7 @@  static long i915_perf_config_locked(struct i915_perf_stream *stream,
 		return -EINVAL;
 
 	if (config != stream->oa_config) {
-		struct intel_context *ce = stream->pinned_ctx ?: stream->config_context;
+		int i;
 
 		active = i915_active_create();
 		if (!active) {
@@ -3268,30 +3341,32 @@  static long i915_perf_config_locked(struct i915_perf_stream *stream,
 			goto err_config;
 		}
 
-		/*
-		 * If OA is bound to a specific context, emit the
-		 * reconfiguration inline from that context. The update
-		 * will then be ordered with respect to submission on that
-		 * context.
-		 *
-		 * When set globally, we use a low priority kernel context,
-		 * so it will effectively take effect when idle.
-		 */
-		ret = emit_oa_config(stream, config, ce,
+		ret = emit_oa_config(stream, config,
+				     stream->config_context,
 				     active, false /* global */);
 		if (ret)
 			goto err_active;
 
-		ret = emit_oa_config(stream, config, ce,
+		ret = emit_oa_config(stream, config,
+				     stream->config_context,
 				     active, true /* global */);
 		if (ret)
 			goto err_active;
 
+		for (i = 0; i < stream->n_pinned_ctxs; i++) {
+			ret = emit_oa_config(stream, config,
+					     stream->pinned_ctxs[i].ce,
+					     active, false /* global */);
+			if (ret)
+				goto err_active;
+		}
+
 		config = xchg(&stream->oa_config, config);
 	}
 
 err_active:
-	i915_active_put(active);
+	if (active)
+		i915_active_put(active);
 err_config:
 	i915_oa_config_put(config);
 
@@ -3372,9 +3447,10 @@  static void i915_perf_destroy_locked(struct i915_perf_stream *stream)
 	if (stream->ops->destroy)
 		stream->ops->destroy(stream);
 
-	if (stream->ctx)
-		i915_gem_context_put(stream->ctx);
+	while (stream->n_ctxs--)
+		i915_gem_context_put(stream->ctxs[stream->n_ctxs]);
 
+	kfree(stream->ctxs);
 	kfree(stream);
 }
 
@@ -3449,25 +3525,12 @@  i915_perf_open_ioctl_locked(struct i915_perf *perf,
 			    struct perf_open_properties *props,
 			    struct drm_file *file)
 {
-	struct i915_gem_context *specific_ctx = NULL;
+	struct drm_i915_file_private *file_priv = file->driver_priv;
 	struct i915_perf_stream *stream = NULL;
 	unsigned long f_flags = 0;
 	bool privileged_op = true;
 	int stream_fd;
-	int ret;
-
-	if (props->single_context) {
-		u32 ctx_handle = props->ctx_handle;
-		struct drm_i915_file_private *file_priv = file->driver_priv;
-
-		specific_ctx = i915_gem_context_lookup(file_priv, ctx_handle);
-		if (!specific_ctx) {
-			DRM_DEBUG("Failed to look up context with ID %u for opening perf stream\n",
-				  ctx_handle);
-			ret = -ENOENT;
-			goto err;
-		}
-	}
+	int i, ret;
 
 	/*
 	 * On Haswell the OA unit supports clock gating off for a specific
@@ -3488,17 +3551,16 @@  i915_perf_open_ioctl_locked(struct i915_perf *perf,
 	 * doesn't request global stream access (i.e. query based sampling
 	 * using MI_RECORD_PERF_COUNT.
 	 */
-	if (IS_HASWELL(perf->i915) && specific_ctx)
+	if (IS_HASWELL(perf->i915) && props->n_ctx_handles > 0)
 		privileged_op = false;
-	else if (IS_GEN(perf->i915, 12) && specific_ctx &&
+	else if (IS_GEN(perf->i915, 12) && (props->n_ctx_handles > 0) &&
 		 (props->sample_flags & SAMPLE_OA_REPORT) == 0)
 		privileged_op = false;
 
 	if (props->hold_preemption) {
-		if (!props->single_context) {
+		if (!props->n_ctx_handles) {
 			DRM_DEBUG("preemption disable with no context\n");
-			ret = -EINVAL;
-			goto err;
+			return -EINVAL;
 		}
 		privileged_op = true;
 	}
@@ -3519,23 +3581,43 @@  i915_perf_open_ioctl_locked(struct i915_perf *perf,
 	if (privileged_op &&
 	    i915_perf_stream_paranoid && !capable(CAP_SYS_ADMIN)) {
 		DRM_DEBUG("Insufficient privileges to open i915 perf stream\n");
-		ret = -EACCES;
-		goto err_ctx;
+		return -EACCES;
 	}
 
 	stream = kzalloc(sizeof(*stream), GFP_KERNEL);
-	if (!stream) {
-		ret = -ENOMEM;
-		goto err_ctx;
+	if (!stream)
+		return -ENOMEM;
+
+	if (props->n_ctx_handles) {
+		gfp_t alloc_flags = GFP_KERNEL | __GFP_ZERO;
+
+		stream->ctxs = kmalloc_array(props->n_ctx_handles,
+					     sizeof(*stream->ctxs),
+					     alloc_flags);
+		if (!stream->ctxs)
+			goto err_ctx;
 	}
 
 	stream->perf = perf;
-	stream->ctx = specific_ctx;
 	stream->poll_oa_period = props->poll_oa_period;
 
+	for (i = 0; i < props->n_ctx_handles; i++) {
+		stream->ctxs[i] = i915_gem_context_lookup(file_priv,
+							  props->ctx_handles[i]);
+		if (!stream->ctxs[i]) {
+			DRM_DEBUG("Failed to look up context with ID %u for opening perf stream\n",
+				  props->ctx_handles[i]);
+
+			ret = -ENOENT;
+			goto err_ctx;
+		}
+
+		stream->n_ctxs++;
+	}
+
 	ret = i915_oa_stream_init(stream, param, props);
 	if (ret)
-		goto err_alloc;
+		goto err_ctx;
 
 	/* we avoid simply assigning stream->sample_flags = props->sample_flags
 	 * to have _stream_init check the combination of sample flags more
@@ -3570,12 +3652,11 @@  i915_perf_open_ioctl_locked(struct i915_perf *perf,
 err_flags:
 	if (stream->ops->destroy)
 		stream->ops->destroy(stream);
-err_alloc:
-	kfree(stream);
 err_ctx:
-	if (specific_ctx)
-		i915_gem_context_put(specific_ctx);
-err:
+	while (stream->n_ctxs--)
+		i915_gem_context_put(stream->ctxs[stream->n_ctxs]);
+	kfree(stream->ctxs);
+	kfree(stream);
 	return ret;
 }
 
@@ -3607,7 +3688,7 @@  static int read_properties_unlocked(struct i915_perf *perf,
 {
 	u64 __user *uprop = uprops;
 	u32 i;
-	int ret;
+	int err;
 
 	memset(props, 0, sizeof(struct perf_open_properties));
 	props->poll_oa_period = DEFAULT_POLL_PERIOD_NS;
@@ -3641,23 +3722,36 @@  static int read_properties_unlocked(struct i915_perf *perf,
 		u64 oa_period, oa_freq_hz;
 		u64 id, value;
 
-		ret = get_user(id, uprop);
-		if (ret)
-			return ret;
+		err = get_user(id, uprop);
+		if (err)
+			goto error;
 
-		ret = get_user(value, uprop + 1);
-		if (ret)
-			return ret;
+		err = get_user(value, uprop + 1);
+		if (err)
+			goto error;
 
 		if (id == 0 || id >= DRM_I915_PERF_PROP_MAX) {
 			DRM_DEBUG("Unknown i915 perf property ID\n");
-			return -EINVAL;
+			err = -EINVAL;
+			goto error;
 		}
 
 		switch ((enum drm_i915_perf_property_id)id) {
 		case DRM_I915_PERF_PROP_CTX_HANDLE:
-			props->single_context = 1;
-			props->ctx_handle = value;
+			if (props->n_ctx_handles > 0) {
+				DRM_DEBUG("Context handle specified multiple times\n");
+				err = -EINVAL;
+				goto error;
+			}
+			props->ctx_handles =
+				kmalloc_array(1, sizeof(*props->ctx_handles),
+					      GFP_KERNEL);
+			if (!props->ctx_handles) {
+				err = -ENOMEM;
+				goto error;
+			}
+			props->ctx_handles[0] = value;
+			props->n_ctx_handles = 1;
 			break;
 		case DRM_I915_PERF_PROP_SAMPLE_OA:
 			if (value)
@@ -3666,7 +3760,8 @@  static int read_properties_unlocked(struct i915_perf *perf,
 		case DRM_I915_PERF_PROP_OA_METRICS_SET:
 			if (value == 0) {
 				DRM_DEBUG("Unknown OA metric set ID\n");
-				return -EINVAL;
+				err = -EINVAL;
+				goto error;
 			}
 			props->metrics_set = value;
 			break;
@@ -3674,12 +3769,14 @@  static int read_properties_unlocked(struct i915_perf *perf,
 			if (value == 0 || value >= I915_OA_FORMAT_MAX) {
 				DRM_DEBUG("Out-of-range OA report format %llu\n",
 					  value);
-				return -EINVAL;
+				err = -EINVAL;
+				goto error;
 			}
 			if (!perf->oa_formats[value].size) {
 				DRM_DEBUG("Unsupported OA report format %llu\n",
 					  value);
-				return -EINVAL;
+				err = -EINVAL;
+				goto error;
 			}
 			props->oa_format = value;
 			break;
@@ -3687,7 +3784,8 @@  static int read_properties_unlocked(struct i915_perf *perf,
 			if (value > OA_EXPONENT_MAX) {
 				DRM_DEBUG("OA timer exponent too high (> %u)\n",
 					 OA_EXPONENT_MAX);
-				return -EINVAL;
+				err = -EINVAL;
+				goto error;
 			}
 
 			/* Theoretically we can program the OA unit to sample
@@ -3716,7 +3814,8 @@  static int read_properties_unlocked(struct i915_perf *perf,
 			    !capable(CAP_SYS_ADMIN)) {
 				DRM_DEBUG("OA exponent would exceed the max sampling frequency (sysctl dev.i915.oa_max_sample_rate) %uHz without root privileges\n",
 					  i915_oa_max_sample_rate);
-				return -EACCES;
+				err = -EACCES;
+				goto error;
 			}
 
 			props->oa_periodic = true;
@@ -3732,13 +3831,14 @@  static int read_properties_unlocked(struct i915_perf *perf,
 					   u64_to_user_ptr(value),
 					   sizeof(user_sseu))) {
 				DRM_DEBUG("Unable to copy global sseu parameter\n");
-				return -EFAULT;
+				err = -EFAULT;
+				goto error;
 			}
 
-			ret = get_sseu_config(&props->sseu, props->engine, &user_sseu);
-			if (ret) {
+			err = get_sseu_config(&props->sseu, props->engine, &user_sseu);
+			if (err) {
 				DRM_DEBUG("Invalid SSEU configuration\n");
-				return ret;
+				goto error;
 			}
 			props->has_sseu = true;
 			break;
@@ -3747,19 +3847,25 @@  static int read_properties_unlocked(struct i915_perf *perf,
 			if (value < 100000 /* 100us */) {
 				DRM_DEBUG("OA availability timer too small (%lluns < 100us)\n",
 					  value);
-				return -EINVAL;
+				err = -EINVAL;
+				goto error;
 			}
 			props->poll_oa_period = value;
 			break;
 		case DRM_I915_PERF_PROP_MAX:
 			MISSING_CASE(id);
-			return -EINVAL;
+			err = -EINVAL;
+			goto error;
 		}
 
 		uprop += 2;
 	}
 
 	return 0;
+
+error:
+	kfree(props->ctx_handles);
+	return err;
 }
 
 /**
@@ -3819,6 +3925,8 @@  int i915_perf_open_ioctl(struct drm_device *dev, void *data,
 	ret = i915_perf_open_ioctl_locked(perf, param, &props, file);
 	mutex_unlock(&perf->lock);
 
+	kfree(props.ctx_handles);
+
 	return ret;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_perf_types.h b/drivers/gpu/drm/i915/i915_perf_types.h
index a8b903592a39..278defe0b456 100644
--- a/drivers/gpu/drm/i915/i915_perf_types.h
+++ b/drivers/gpu/drm/i915/i915_perf_types.h
@@ -161,10 +161,15 @@  struct i915_perf_stream {
 	int sample_size;
 
 	/**
-	 * @ctx: %NULL if measuring system-wide across all contexts or a
-	 * specific context that is being monitored.
+	 * @n_ctxs: Number of contexts pinned for the recording.
 	 */
-	struct i915_gem_context *ctx;
+	u32 n_ctxs;
+
+	/**
+	 * @ctxs: All to %NULL if measuring system-wide across all contexts or
+	 * a list specific contexts that are being monitored.
+	 */
+	struct i915_gem_context **ctxs;
 
 	/**
 	 * @enabled: Whether the stream is currently enabled, considering
@@ -199,19 +204,31 @@  struct i915_perf_stream {
 	struct llist_head oa_config_bos;
 
 	/**
-	 * @pinned_ctx: The OA context specific information.
+	 * @pinned_ctxs: A array of logical context details needed for
+	 * filtering and their associated pinned ID.
 	 */
-	struct intel_context *pinned_ctx;
+	struct i915_perf_context_detail {
+		/**
+		 * @ce: The OA context specific information.
+		 */
+		struct intel_context *ce;
+
+		/**
+		 * @id: The ids of the specific contexts.
+		 */
+		u32 id;
+	} *pinned_ctxs;
 
 	/**
-	 * @specific_ctx_id: The id of the specific context.
+	 * @n_pinned_ctxs: Length of the @pinned_ctxs array, 0 if measuring
+	 * system-wide across all contexts.
 	 */
-	u32 specific_ctx_id;
+	u32 n_pinned_ctxs;
 
 	/**
-	 * @specific_ctx_id_mask: The mask used to masking specific_ctx_id bits.
+	 * @ctx_id_mask: The mask used to masking specific_ctx_id bits.
 	 */
-	u32 specific_ctx_id_mask;
+	u32 ctx_id_mask;
 
 	/**
 	 * @poll_check_timer: High resolution timer that will periodically
@@ -247,7 +264,7 @@  struct i915_perf_stream {
 	struct {
 		struct i915_vma *vma;
 		u8 *vaddr;
-		u32 last_ctx_id;
+		bool last_ctx_match;
 		int format;
 		int format_size;
 		int size_exponent;