diff mbox

[2/2] drm/i915/perf: fix ctx_id read with GuC & ICL

Message ID 20180531195637.11087-3-lionel.g.landwerlin@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Lionel Landwerlin May 31, 2018, 7:56 p.m. UTC
One thing we didn't really understand about the OA report is that the
ContextID field (dword 2) is copy of the context descriptor (dword 1).

On Gen8->10 and without using GuC we didn't notice the issue because
we only checked the 21bits of the ContextID field in the OA reports
which matches exactly the hw_id stored into the context descriptor.

When using GuC submission we have an issue of a non matching hw_id
because GuC uses bit 20 of the hw_id to signal proxy submission. This
change makes introduces a mask to compare only the relevant bits.

On ICL the context descriptor format has changed and we failed to
address this. On top of using a mask we also need to shift the bits
properly.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Fixes: 1de401c08fa805 ("drm/i915/perf: enable perf support on ICL")
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104252
BSpec: 1237
Testcase: igt/perf/gen8-unprivileged-single-ctx-counters
---
 drivers/gpu/drm/i915/i915_drv.h  |   1 +
 drivers/gpu/drm/i915/i915_perf.c | 123 ++++++++++++++++++++++++-------
 drivers/gpu/drm/i915/intel_lrc.c |   5 ++
 3 files changed, 101 insertions(+), 28 deletions(-)

Comments

Michel Thierry May 31, 2018, 8:46 p.m. UTC | #1
On 5/31/2018 12:56 PM, Lionel Landwerlin wrote:
> One thing we didn't really understand about the OA report is that the
> ContextID field (dword 2) is copy of the context descriptor (dword 1).
> 
> On Gen8->10 and without using GuC we didn't notice the issue because
> we only checked the 21bits of the ContextID field in the OA reports
> which matches exactly the hw_id stored into the context descriptor.
> 
> When using GuC submission we have an issue of a non matching hw_id
> because GuC uses bit 20 of the hw_id to signal proxy submission. This
> change makes introduces a mask to compare only the relevant bits.
Choose one: makes or introduces ;)

> 
> On ICL the context descriptor format has changed and we failed to
> address this. On top of using a mask we also need to shift the bits
> properly.
> 

Someone is going to complain this should be two patches (one to address 
the GuC-ness and one for Gen11), but not me.

Reviewed-by: Michel Thierry <michel.thierry@intel.com>


> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
> Fixes: 1de401c08fa805 ("drm/i915/perf: enable perf support on ICL")
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104252
> BSpec: 1237
> Testcase: igt/perf/gen8-unprivileged-single-ctx-counters
> ---
>   drivers/gpu/drm/i915/i915_drv.h  |   1 +
>   drivers/gpu/drm/i915/i915_perf.c | 123 ++++++++++++++++++++++++-------
>   drivers/gpu/drm/i915/intel_lrc.c |   5 ++
>   3 files changed, 101 insertions(+), 28 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 58ab9259fb73..0ccda488a8db 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1953,6 +1953,7 @@ struct drm_i915_private {
>   
>   			struct intel_context *pinned_ctx;
>   			u32 specific_ctx_id;
> +			u32 specific_ctx_id_mask;
>   
>   			struct hrtimer poll_check_timer;
>   			wait_queue_head_t poll_wq;
> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> index 4a62024cbf85..d5e5d4635f1f 100644
> --- a/drivers/gpu/drm/i915/i915_perf.c
> +++ b/drivers/gpu/drm/i915/i915_perf.c
> @@ -738,12 +738,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
>   			continue;
>   		}
>   
> -		/*
> -		 * XXX: Just keep the lower 21 bits for now since I'm not
> -		 * entirely sure if the HW touches any of the higher bits in
> -		 * this field
> -		 */
> -		ctx_id = report32[2] & 0x1fffff;
> +		ctx_id = report32[2] & dev_priv->perf.oa.specific_ctx_id_mask;
>   
>   		/*
>   		 * Squash whatever is in the CTX_ID field if it's marked as
> @@ -1204,6 +1199,36 @@ static int i915_oa_read(struct i915_perf_stream *stream,
>   	return dev_priv->perf.oa.ops.read(stream, buf, count, offset);
>   }
>   
> +static int oa_get_render_lrca(struct drm_i915_private *i915,
> +			      struct i915_gem_context *ctx,
> +			      u32 *lrca)
> +{
> +	struct intel_engine_cs *engine = i915->engine[RCS];
> +	struct intel_context *ce;
> +	int ret;
> +
> +	ret = i915_mutex_lock_interruptible(&i915->drm);
> +	if (ret)
> +		return ret;
> +
> +	/*
> +	 * As the ID is the gtt offset of the context's vma we
> +	 * pin the vma to ensure the ID remains fixed.
> +	 *
> +	 * NB: implied RCS engine...
> +	 */
> +	ce = intel_context_pin(ctx, engine);
> +	mutex_unlock(&i915->drm.struct_mutex);
> +	if (IS_ERR(ce))
> +		return PTR_ERR(ce);
> +
> +	i915->perf.oa.pinned_ctx = ce;
> +
> +	*lrca = i915_ggtt_offset(ce->state);
> +
> +	return 0;
> +}
> +
>   /**
>    * oa_get_render_ctx_id - determine and hold ctx hw id
>    * @stream: An i915-perf stream opened for OA metrics
> @@ -1216,40 +1241,81 @@ static int i915_oa_read(struct i915_perf_stream *stream,
>    */
>   static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
>   {
> -	struct drm_i915_private *dev_priv = stream->dev_priv;
> +	struct drm_i915_private *i915 = stream->dev_priv;
>   
> -	if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
> -		dev_priv->perf.oa.specific_ctx_id = stream->ctx->hw_id;
> -	} else {
> -		struct intel_engine_cs *engine = dev_priv->engine[RCS];
> -		struct intel_context *ce;
> +	switch (INTEL_GEN(i915)) {
> +	case 7: {
>   		int ret;
>   
> -		ret = i915_mutex_lock_interruptible(&dev_priv->drm);
> +		ret = oa_get_render_lrca(i915, stream->ctx,
> +					 &i915->perf.oa.specific_ctx_id);
>   		if (ret)
>   			return ret;
>   
>   		/*
> -		 * As the ID is the gtt offset of the context's vma we
> -		 * pin the vma to ensure the ID remains fixed.
> -		 *
> -		 * NB: implied RCS engine...
> +		 * On Haswell we don't do any post processing of the reports
> +		 * and don't need to use the mask.
>   		 */
> -		ce = intel_context_pin(stream->ctx, engine);
> -		mutex_unlock(&dev_priv->drm.struct_mutex);
> -		if (IS_ERR(ce))
> -			return PTR_ERR(ce);
> +		i915->perf.oa.specific_ctx_id_mask = 0;
> +		break;
> +	}
>   
> -		dev_priv->perf.oa.pinned_ctx = ce;
> +	case 8:
> +	case 9:
> +	case 10:
> +		if (USES_GUC_SUBMISSION(i915)) {
> +			u32 lrca;
> +			int ret;
>   
> -		/*
> -		 * Explicitly track the ID (instead of calling
> -		 * i915_ggtt_offset() on the fly) considering the difference
> -		 * with gen8+ and execlists
> -		 */
> -		dev_priv->perf.oa.specific_ctx_id = i915_ggtt_offset(ce->state);
> +			ret = oa_get_render_lrca(i915, stream->ctx, &lrca);
> +			if (ret)
> +				return ret;
> +
> +			/*
> +			 * The LRCA is aligned to a page. As a result the
> +			 * lower 12bits are always at 0 and reused in the
> +			 * context descriptor for some flags. They won't be
> +			 * part of the context ID in the OA reports, so squash
> +			 * those lower bits.
> +			 */
> +			i915->perf.oa.specific_ctx_id =
> +				(lrca + LRC_HEADER_PAGES * PAGE_SIZE) >> 12;
> +
> +			/*
> +			 * GuC uses the top bit to signal proxy submission, so
> +			 * ignore that bit if using GuC.
> +			 */
> +			i915->perf.oa.specific_ctx_id_mask =
> +				(1U << (GEN8_CTX_ID_WIDTH - 1)) - 1;
> +		} else {
> +			i915->perf.oa.specific_ctx_id = stream->ctx->hw_id;
> +			i915->perf.oa.specific_ctx_id_mask =
> +				(1U << GEN8_CTX_ID_WIDTH) - 1;
> +		}
> +		break;
> +
> +	case 11: {
> +		struct intel_engine_cs *engine = i915->engine[RCS];
> +
> +		i915->perf.oa.specific_ctx_id =
> +			stream->ctx->hw_id << (GEN11_SW_CTX_ID_SHIFT - 32) |
> +			engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32) |
> +			engine->class << (GEN11_ENGINE_INSTANCE_SHIFT - 32);
> +		i915->perf.oa.specific_ctx_id_mask =
> +			((1U << GEN11_SW_CTX_ID_WIDTH) - 1) << (GEN11_SW_CTX_ID_SHIFT - 32) |
> +			((1U << GEN11_ENGINE_INSTANCE_WIDTH) - 1) << (GEN11_ENGINE_INSTANCE_SHIFT - 32) |
> +			((1 << GEN11_ENGINE_CLASS_WIDTH) - 1) << (GEN11_ENGINE_CLASS_SHIFT - 32);
> +		break;
>   	}
>   
> +	default:
> +		MISSING_CASE(INTEL_GEN(i915));
> +	}
> +
> +	DRM_DEBUG_DRIVER("filtering on ctx_id=0x%x ctx_id_mask=0x%x\n",
> +			 i915->perf.oa.specific_ctx_id,
> +			 i915->perf.oa.specific_ctx_id_mask);
> +
>   	return 0;
>   }
>   
> @@ -1266,6 +1332,7 @@ static void oa_put_render_ctx_id(struct i915_perf_stream *stream)
>   	struct intel_context *ce;
>   
>   	dev_priv->perf.oa.specific_ctx_id = INVALID_CTX_ID;
> +	dev_priv->perf.oa.specific_ctx_id_mask = 0;
>   
>   	ce = fetch_and_zero(&dev_priv->perf.oa.pinned_ctx);
>   	if (ce) {
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 94f9c4795190..7d63c6d2f687 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -233,6 +233,11 @@ intel_lr_context_descriptor_update(struct i915_gem_context *ctx,
>   								/* bits 12-31 */
>   	GEM_BUG_ON(desc & GENMASK_ULL(63, 32));
>   
> +	/*
> +	 * The following 32bits are copied into the OA reports (dword 2).
> +	 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
> +	 * anything below.
> +	 */
>   	if (INTEL_GEN(ctx->i915) >= 11) {
>   		GEM_BUG_ON(ctx->hw_id >= BIT(GEN11_SW_CTX_ID_WIDTH));
>   		desc |= (u64)ctx->hw_id << GEN11_SW_CTX_ID_SHIFT;
>
Lionel Landwerlin May 31, 2018, 9:11 p.m. UTC | #2
On 31/05/18 21:46, Michel Thierry wrote:
> On 5/31/2018 12:56 PM, Lionel Landwerlin wrote:
>> One thing we didn't really understand about the OA report is that the
>> ContextID field (dword 2) is copy of the context descriptor (dword 1).
>>
>> On Gen8->10 and without using GuC we didn't notice the issue because
>> we only checked the 21bits of the ContextID field in the OA reports
>> which matches exactly the hw_id stored into the context descriptor.
>>
>> When using GuC submission we have an issue of a non matching hw_id
>> because GuC uses bit 20 of the hw_id to signal proxy submission. This
>> change makes introduces a mask to compare only the relevant bits.
> Choose one: makes or introduces ;)
>
>>
>> On ICL the context descriptor format has changed and we failed to
>> address this. On top of using a mask we also need to shift the bits
>> properly.
>>
>
> Someone is going to complain this should be two patches (one to 
> address the GuC-ness and one for Gen11), but not me.
>
> Reviewed-by: Michel Thierry <michel.thierry@intel.com>
>

Kind of agree, but that's a pain.
We can put it as a fix of the first commit that enabled gen8 and I'll do 
the backport in stable versions.

Cheers,

-
Lionel

>
>> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
>> Fixes: 1de401c08fa805 ("drm/i915/perf: enable perf support on ICL")
>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104252
>> BSpec: 1237
>> Testcase: igt/perf/gen8-unprivileged-single-ctx-counters
>> ---
>>   drivers/gpu/drm/i915/i915_drv.h  |   1 +
>>   drivers/gpu/drm/i915/i915_perf.c | 123 ++++++++++++++++++++++++-------
>>   drivers/gpu/drm/i915/intel_lrc.c |   5 ++
>>   3 files changed, 101 insertions(+), 28 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h 
>> b/drivers/gpu/drm/i915/i915_drv.h
>> index 58ab9259fb73..0ccda488a8db 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -1953,6 +1953,7 @@ struct drm_i915_private {
>>                 struct intel_context *pinned_ctx;
>>               u32 specific_ctx_id;
>> +            u32 specific_ctx_id_mask;
>>                 struct hrtimer poll_check_timer;
>>               wait_queue_head_t poll_wq;
>> diff --git a/drivers/gpu/drm/i915/i915_perf.c 
>> b/drivers/gpu/drm/i915/i915_perf.c
>> index 4a62024cbf85..d5e5d4635f1f 100644
>> --- a/drivers/gpu/drm/i915/i915_perf.c
>> +++ b/drivers/gpu/drm/i915/i915_perf.c
>> @@ -738,12 +738,7 @@ static int gen8_append_oa_reports(struct 
>> i915_perf_stream *stream,
>>               continue;
>>           }
>>   -        /*
>> -         * XXX: Just keep the lower 21 bits for now since I'm not
>> -         * entirely sure if the HW touches any of the higher bits in
>> -         * this field
>> -         */
>> -        ctx_id = report32[2] & 0x1fffff;
>> +        ctx_id = report32[2] & dev_priv->perf.oa.specific_ctx_id_mask;
>>             /*
>>            * Squash whatever is in the CTX_ID field if it's marked as
>> @@ -1204,6 +1199,36 @@ static int i915_oa_read(struct 
>> i915_perf_stream *stream,
>>       return dev_priv->perf.oa.ops.read(stream, buf, count, offset);
>>   }
>>   +static int oa_get_render_lrca(struct drm_i915_private *i915,
>> +                  struct i915_gem_context *ctx,
>> +                  u32 *lrca)
>> +{
>> +    struct intel_engine_cs *engine = i915->engine[RCS];
>> +    struct intel_context *ce;
>> +    int ret;
>> +
>> +    ret = i915_mutex_lock_interruptible(&i915->drm);
>> +    if (ret)
>> +        return ret;
>> +
>> +    /*
>> +     * As the ID is the gtt offset of the context's vma we
>> +     * pin the vma to ensure the ID remains fixed.
>> +     *
>> +     * NB: implied RCS engine...
>> +     */
>> +    ce = intel_context_pin(ctx, engine);
>> +    mutex_unlock(&i915->drm.struct_mutex);
>> +    if (IS_ERR(ce))
>> +        return PTR_ERR(ce);
>> +
>> +    i915->perf.oa.pinned_ctx = ce;
>> +
>> +    *lrca = i915_ggtt_offset(ce->state);
>> +
>> +    return 0;
>> +}
>> +
>>   /**
>>    * oa_get_render_ctx_id - determine and hold ctx hw id
>>    * @stream: An i915-perf stream opened for OA metrics
>> @@ -1216,40 +1241,81 @@ static int i915_oa_read(struct 
>> i915_perf_stream *stream,
>>    */
>>   static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
>>   {
>> -    struct drm_i915_private *dev_priv = stream->dev_priv;
>> +    struct drm_i915_private *i915 = stream->dev_priv;
>>   -    if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
>> -        dev_priv->perf.oa.specific_ctx_id = stream->ctx->hw_id;
>> -    } else {
>> -        struct intel_engine_cs *engine = dev_priv->engine[RCS];
>> -        struct intel_context *ce;
>> +    switch (INTEL_GEN(i915)) {
>> +    case 7: {
>>           int ret;
>>   -        ret = i915_mutex_lock_interruptible(&dev_priv->drm);
>> +        ret = oa_get_render_lrca(i915, stream->ctx,
>> +                     &i915->perf.oa.specific_ctx_id);
>>           if (ret)
>>               return ret;
>>             /*
>> -         * As the ID is the gtt offset of the context's vma we
>> -         * pin the vma to ensure the ID remains fixed.
>> -         *
>> -         * NB: implied RCS engine...
>> +         * On Haswell we don't do any post processing of the reports
>> +         * and don't need to use the mask.
>>            */
>> -        ce = intel_context_pin(stream->ctx, engine);
>> -        mutex_unlock(&dev_priv->drm.struct_mutex);
>> -        if (IS_ERR(ce))
>> -            return PTR_ERR(ce);
>> +        i915->perf.oa.specific_ctx_id_mask = 0;
>> +        break;
>> +    }
>>   -        dev_priv->perf.oa.pinned_ctx = ce;
>> +    case 8:
>> +    case 9:
>> +    case 10:
>> +        if (USES_GUC_SUBMISSION(i915)) {
>> +            u32 lrca;
>> +            int ret;
>>   -        /*
>> -         * Explicitly track the ID (instead of calling
>> -         * i915_ggtt_offset() on the fly) considering the difference
>> -         * with gen8+ and execlists
>> -         */
>> -        dev_priv->perf.oa.specific_ctx_id = 
>> i915_ggtt_offset(ce->state);
>> +            ret = oa_get_render_lrca(i915, stream->ctx, &lrca);
>> +            if (ret)
>> +                return ret;
>> +
>> +            /*
>> +             * The LRCA is aligned to a page. As a result the
>> +             * lower 12bits are always at 0 and reused in the
>> +             * context descriptor for some flags. They won't be
>> +             * part of the context ID in the OA reports, so squash
>> +             * those lower bits.
>> +             */
>> +            i915->perf.oa.specific_ctx_id =
>> +                (lrca + LRC_HEADER_PAGES * PAGE_SIZE) >> 12;
>> +
>> +            /*
>> +             * GuC uses the top bit to signal proxy submission, so
>> +             * ignore that bit if using GuC.
>> +             */
>> +            i915->perf.oa.specific_ctx_id_mask =
>> +                (1U << (GEN8_CTX_ID_WIDTH - 1)) - 1;
>> +        } else {
>> +            i915->perf.oa.specific_ctx_id = stream->ctx->hw_id;
>> +            i915->perf.oa.specific_ctx_id_mask =
>> +                (1U << GEN8_CTX_ID_WIDTH) - 1;
>> +        }
>> +        break;
>> +
>> +    case 11: {
>> +        struct intel_engine_cs *engine = i915->engine[RCS];
>> +
>> +        i915->perf.oa.specific_ctx_id =
>> +            stream->ctx->hw_id << (GEN11_SW_CTX_ID_SHIFT - 32) |
>> +            engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32) |
>> +            engine->class << (GEN11_ENGINE_INSTANCE_SHIFT - 32);
>> +        i915->perf.oa.specific_ctx_id_mask =
>> +            ((1U << GEN11_SW_CTX_ID_WIDTH) - 1) << 
>> (GEN11_SW_CTX_ID_SHIFT - 32) |
>> +            ((1U << GEN11_ENGINE_INSTANCE_WIDTH) - 1) << 
>> (GEN11_ENGINE_INSTANCE_SHIFT - 32) |
>> +            ((1 << GEN11_ENGINE_CLASS_WIDTH) - 1) << 
>> (GEN11_ENGINE_CLASS_SHIFT - 32);
>> +        break;
>>       }
>>   +    default:
>> +        MISSING_CASE(INTEL_GEN(i915));
>> +    }
>> +
>> +    DRM_DEBUG_DRIVER("filtering on ctx_id=0x%x ctx_id_mask=0x%x\n",
>> +             i915->perf.oa.specific_ctx_id,
>> +             i915->perf.oa.specific_ctx_id_mask);
>> +
>>       return 0;
>>   }
>>   @@ -1266,6 +1332,7 @@ static void oa_put_render_ctx_id(struct 
>> i915_perf_stream *stream)
>>       struct intel_context *ce;
>>         dev_priv->perf.oa.specific_ctx_id = INVALID_CTX_ID;
>> +    dev_priv->perf.oa.specific_ctx_id_mask = 0;
>>         ce = fetch_and_zero(&dev_priv->perf.oa.pinned_ctx);
>>       if (ce) {
>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c 
>> b/drivers/gpu/drm/i915/intel_lrc.c
>> index 94f9c4795190..7d63c6d2f687 100644
>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>> @@ -233,6 +233,11 @@ intel_lr_context_descriptor_update(struct 
>> i915_gem_context *ctx,
>>                                   /* bits 12-31 */
>>       GEM_BUG_ON(desc & GENMASK_ULL(63, 32));
>>   +    /*
>> +     * The following 32bits are copied into the OA reports (dword 2).
>> +     * Consider updating oa_get_render_ctx_id in i915_perf.c when 
>> changing
>> +     * anything below.
>> +     */
>>       if (INTEL_GEN(ctx->i915) >= 11) {
>>           GEM_BUG_ON(ctx->hw_id >= BIT(GEN11_SW_CTX_ID_WIDTH));
>>           desc |= (u64)ctx->hw_id << GEN11_SW_CTX_ID_SHIFT;
>>
>
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 58ab9259fb73..0ccda488a8db 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1953,6 +1953,7 @@  struct drm_i915_private {
 
 			struct intel_context *pinned_ctx;
 			u32 specific_ctx_id;
+			u32 specific_ctx_id_mask;
 
 			struct hrtimer poll_check_timer;
 			wait_queue_head_t poll_wq;
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 4a62024cbf85..d5e5d4635f1f 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -738,12 +738,7 @@  static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 			continue;
 		}
 
-		/*
-		 * XXX: Just keep the lower 21 bits for now since I'm not
-		 * entirely sure if the HW touches any of the higher bits in
-		 * this field
-		 */
-		ctx_id = report32[2] & 0x1fffff;
+		ctx_id = report32[2] & dev_priv->perf.oa.specific_ctx_id_mask;
 
 		/*
 		 * Squash whatever is in the CTX_ID field if it's marked as
@@ -1204,6 +1199,36 @@  static int i915_oa_read(struct i915_perf_stream *stream,
 	return dev_priv->perf.oa.ops.read(stream, buf, count, offset);
 }
 
+static int oa_get_render_lrca(struct drm_i915_private *i915,
+			      struct i915_gem_context *ctx,
+			      u32 *lrca)
+{
+	struct intel_engine_cs *engine = i915->engine[RCS];
+	struct intel_context *ce;
+	int ret;
+
+	ret = i915_mutex_lock_interruptible(&i915->drm);
+	if (ret)
+		return ret;
+
+	/*
+	 * As the ID is the gtt offset of the context's vma we
+	 * pin the vma to ensure the ID remains fixed.
+	 *
+	 * NB: implied RCS engine...
+	 */
+	ce = intel_context_pin(ctx, engine);
+	mutex_unlock(&i915->drm.struct_mutex);
+	if (IS_ERR(ce))
+		return PTR_ERR(ce);
+
+	i915->perf.oa.pinned_ctx = ce;
+
+	*lrca = i915_ggtt_offset(ce->state);
+
+	return 0;
+}
+
 /**
  * oa_get_render_ctx_id - determine and hold ctx hw id
  * @stream: An i915-perf stream opened for OA metrics
@@ -1216,40 +1241,81 @@  static int i915_oa_read(struct i915_perf_stream *stream,
  */
 static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
 {
-	struct drm_i915_private *dev_priv = stream->dev_priv;
+	struct drm_i915_private *i915 = stream->dev_priv;
 
-	if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
-		dev_priv->perf.oa.specific_ctx_id = stream->ctx->hw_id;
-	} else {
-		struct intel_engine_cs *engine = dev_priv->engine[RCS];
-		struct intel_context *ce;
+	switch (INTEL_GEN(i915)) {
+	case 7: {
 		int ret;
 
-		ret = i915_mutex_lock_interruptible(&dev_priv->drm);
+		ret = oa_get_render_lrca(i915, stream->ctx,
+					 &i915->perf.oa.specific_ctx_id);
 		if (ret)
 			return ret;
 
 		/*
-		 * As the ID is the gtt offset of the context's vma we
-		 * pin the vma to ensure the ID remains fixed.
-		 *
-		 * NB: implied RCS engine...
+		 * On Haswell we don't do any post processing of the reports
+		 * and don't need to use the mask.
 		 */
-		ce = intel_context_pin(stream->ctx, engine);
-		mutex_unlock(&dev_priv->drm.struct_mutex);
-		if (IS_ERR(ce))
-			return PTR_ERR(ce);
+		i915->perf.oa.specific_ctx_id_mask = 0;
+		break;
+	}
 
-		dev_priv->perf.oa.pinned_ctx = ce;
+	case 8:
+	case 9:
+	case 10:
+		if (USES_GUC_SUBMISSION(i915)) {
+			u32 lrca;
+			int ret;
 
-		/*
-		 * Explicitly track the ID (instead of calling
-		 * i915_ggtt_offset() on the fly) considering the difference
-		 * with gen8+ and execlists
-		 */
-		dev_priv->perf.oa.specific_ctx_id = i915_ggtt_offset(ce->state);
+			ret = oa_get_render_lrca(i915, stream->ctx, &lrca);
+			if (ret)
+				return ret;
+
+			/*
+			 * The LRCA is aligned to a page. As a result the
+			 * lower 12bits are always at 0 and reused in the
+			 * context descriptor for some flags. They won't be
+			 * part of the context ID in the OA reports, so squash
+			 * those lower bits.
+			 */
+			i915->perf.oa.specific_ctx_id =
+				(lrca + LRC_HEADER_PAGES * PAGE_SIZE) >> 12;
+
+			/*
+			 * GuC uses the top bit to signal proxy submission, so
+			 * ignore that bit if using GuC.
+			 */
+			i915->perf.oa.specific_ctx_id_mask =
+				(1U << (GEN8_CTX_ID_WIDTH - 1)) - 1;
+		} else {
+			i915->perf.oa.specific_ctx_id = stream->ctx->hw_id;
+			i915->perf.oa.specific_ctx_id_mask =
+				(1U << GEN8_CTX_ID_WIDTH) - 1;
+		}
+		break;
+
+	case 11: {
+		struct intel_engine_cs *engine = i915->engine[RCS];
+
+		i915->perf.oa.specific_ctx_id =
+			stream->ctx->hw_id << (GEN11_SW_CTX_ID_SHIFT - 32) |
+			engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32) |
+			engine->class << (GEN11_ENGINE_INSTANCE_SHIFT - 32);
+		i915->perf.oa.specific_ctx_id_mask =
+			((1U << GEN11_SW_CTX_ID_WIDTH) - 1) << (GEN11_SW_CTX_ID_SHIFT - 32) |
+			((1U << GEN11_ENGINE_INSTANCE_WIDTH) - 1) << (GEN11_ENGINE_INSTANCE_SHIFT - 32) |
+			((1 << GEN11_ENGINE_CLASS_WIDTH) - 1) << (GEN11_ENGINE_CLASS_SHIFT - 32);
+		break;
 	}
 
+	default:
+		MISSING_CASE(INTEL_GEN(i915));
+	}
+
+	DRM_DEBUG_DRIVER("filtering on ctx_id=0x%x ctx_id_mask=0x%x\n",
+			 i915->perf.oa.specific_ctx_id,
+			 i915->perf.oa.specific_ctx_id_mask);
+
 	return 0;
 }
 
@@ -1266,6 +1332,7 @@  static void oa_put_render_ctx_id(struct i915_perf_stream *stream)
 	struct intel_context *ce;
 
 	dev_priv->perf.oa.specific_ctx_id = INVALID_CTX_ID;
+	dev_priv->perf.oa.specific_ctx_id_mask = 0;
 
 	ce = fetch_and_zero(&dev_priv->perf.oa.pinned_ctx);
 	if (ce) {
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 94f9c4795190..7d63c6d2f687 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -233,6 +233,11 @@  intel_lr_context_descriptor_update(struct i915_gem_context *ctx,
 								/* bits 12-31 */
 	GEM_BUG_ON(desc & GENMASK_ULL(63, 32));
 
+	/*
+	 * The following 32bits are copied into the OA reports (dword 2).
+	 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
+	 * anything below.
+	 */
 	if (INTEL_GEN(ctx->i915) >= 11) {
 		GEM_BUG_ON(ctx->hw_id >= BIT(GEN11_SW_CTX_ID_WIDTH));
 		desc |= (u64)ctx->hw_id << GEN11_SW_CTX_ID_SHIFT;