diff mbox series

[v3,04/16] drm/i915/perf: Determine gen12 oa ctx offset at runtime

Message ID 20221010181434.513477-5-umesh.nerlige.ramappa@intel.com (mailing list archive)
State New, archived
Headers show
Series Add DG2 OA support | expand

Commit Message

Umesh Nerlige Ramappa Oct. 10, 2022, 6:14 p.m. UTC
Some SKUs of same gen12 platform may have different oactxctrl
offsets. For gen12, determine oactxctrl offsets at runtime.

v2: (Lionel)
- Move MI definitions to intel_gpu_commands.h
- Ensure __find_reg_in_lri does read past context image size

v3: (Ashutosh)
- Drop unnecessary use of double underscores
- fix find_reg_in_lri
- Return error if oa context offset is U32_MAX
- Error out if oa_ctx_ctrl_offset does not find offset

Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_gpu_commands.h |   4 +
 drivers/gpu/drm/i915/i915_perf.c             | 154 +++++++++++++++----
 drivers/gpu/drm/i915/i915_perf_oa_regs.h     |   2 +-
 3 files changed, 129 insertions(+), 31 deletions(-)

Comments

Dixit, Ashutosh Oct. 11, 2022, 12:17 a.m. UTC | #1
On Mon, 10 Oct 2022 11:14:22 -0700, Umesh Nerlige Ramappa wrote:

Hi Umesh,

> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> index cd57b5836386..b292aa39633e 100644
> --- a/drivers/gpu/drm/i915/i915_perf.c
> +++ b/drivers/gpu/drm/i915/i915_perf.c
> @@ -1358,6 +1358,68 @@ static int gen12_get_render_context_id(struct i915_perf_stream *stream)
>	return 0;
>  }
>
> +#define valid_oactxctrl_offset(x) ((x) && (x) != U32_MAX)
> +static bool oa_find_reg_in_lri(u32 *state, u32 reg, u32 *offset, u32 end)
> +{
> +	u32 idx = *offset;
> +	u32 len = min(MI_LRI_LEN(state[idx]) + idx, end);
> +	bool found = false;

My recommendation would be to put something like this here:

	if (MI_LRI_LEN(state[idx]) & 0x1)
		drm_warn("MI_LRI instruction with odd length\n");

Because we expect the MI_LRI length to even and I am not sure what is in
the context image. But maybe not needed?

> @@ -2436,15 +2514,18 @@ static int gen12_configure_oar_context(struct i915_perf_stream *stream,
>		},
>	};
>
> -	/* Modify the context image of pinned context with regs_context*/
> -	err = intel_context_lock_pinned(ce);
> -	if (err)
> -		return err;
> +	/* Modify the context image of pinned context with regs_context */
> +	if (valid_oactxctrl_offset(offset)) {

As I said before, this check is not needed, if we didn't have valid a
offset we should return error from oa_get_render_ctx_id. For if we have
this check and offset is not valid, can we skip this code and will things
still work? Or do we need to return an error from
gen12_configure_oar_context?

> +		err = intel_context_lock_pinned(ce);
> +		if (err)
> +			return err;
>
> -	err = gen8_modify_context(ce, regs_context, ARRAY_SIZE(regs_context));
> -	intel_context_unlock_pinned(ce);
> -	if (err)
> -		return err;
> +		err = gen8_modify_context(ce, regs_context,
> +					  ARRAY_SIZE(regs_context));
> +		intel_context_unlock_pinned(ce);
> +		if (err)
> +			return err;
> +	}

With the if statement removed or handled otherwise, this is:

Reviewed-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
Umesh Nerlige Ramappa Oct. 11, 2022, 4:36 p.m. UTC | #2
On Mon, Oct 10, 2022 at 05:17:44PM -0700, Dixit, Ashutosh wrote:
>On Mon, 10 Oct 2022 11:14:22 -0700, Umesh Nerlige Ramappa wrote:
>
>Hi Umesh,
>
>> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
>> index cd57b5836386..b292aa39633e 100644
>> --- a/drivers/gpu/drm/i915/i915_perf.c
>> +++ b/drivers/gpu/drm/i915/i915_perf.c
>> @@ -1358,6 +1358,68 @@ static int gen12_get_render_context_id(struct i915_perf_stream *stream)
>>	return 0;
>>  }
>>
>> +#define valid_oactxctrl_offset(x) ((x) && (x) != U32_MAX)
>> +static bool oa_find_reg_in_lri(u32 *state, u32 reg, u32 *offset, u32 end)
>> +{
>> +	u32 idx = *offset;
>> +	u32 len = min(MI_LRI_LEN(state[idx]) + idx, end);
>> +	bool found = false;
>
>My recommendation would be to put something like this here:
>
>	if (MI_LRI_LEN(state[idx]) & 0x1)
>		drm_warn("MI_LRI instruction with odd length\n");
>
>Because we expect the MI_LRI length to even and I am not sure what is in
>the context image. But maybe not needed?

will add.

>
>> @@ -2436,15 +2514,18 @@ static int gen12_configure_oar_context(struct i915_perf_stream *stream,
>>		},
>>	};
>>
>> -	/* Modify the context image of pinned context with regs_context*/
>> -	err = intel_context_lock_pinned(ce);
>> -	if (err)
>> -		return err;
>> +	/* Modify the context image of pinned context with regs_context */
>> +	if (valid_oactxctrl_offset(offset)) {
>
>As I said before, this check is not needed, if we didn't have valid a
>offset we should return error from oa_get_render_ctx_id. For if we have
>this check and offset is not valid, can we skip this code and will things
>still work? Or do we need to return an error from
>gen12_configure_oar_context?

I missed that from the previous comments. Will add.

>
>> +		err = intel_context_lock_pinned(ce);
>> +		if (err)
>> +			return err;
>>
>> -	err = gen8_modify_context(ce, regs_context, ARRAY_SIZE(regs_context));
>> -	intel_context_unlock_pinned(ce);
>> -	if (err)
>> -		return err;
>> +		err = gen8_modify_context(ce, regs_context,
>> +					  ARRAY_SIZE(regs_context));
>> +		intel_context_unlock_pinned(ce);
>> +		if (err)
>> +			return err;
>> +	}
>
>With the if statement removed or handled otherwise, this is:
>
>Reviewed-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
Jani Nikula Oct. 11, 2022, 5:47 p.m. UTC | #3
On Mon, 10 Oct 2022, Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com> wrote:
> Some SKUs of same gen12 platform may have different oactxctrl
> offsets. For gen12, determine oactxctrl offsets at runtime.
>
> v2: (Lionel)
> - Move MI definitions to intel_gpu_commands.h
> - Ensure __find_reg_in_lri does read past context image size
>
> v3: (Ashutosh)
> - Drop unnecessary use of double underscores
> - fix find_reg_in_lri
> - Return error if oa context offset is U32_MAX
> - Error out if oa_ctx_ctrl_offset does not find offset
>
> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> ---
>  drivers/gpu/drm/i915/gt/intel_gpu_commands.h |   4 +
>  drivers/gpu/drm/i915/i915_perf.c             | 154 +++++++++++++++----
>  drivers/gpu/drm/i915/i915_perf_oa_regs.h     |   2 +-
>  3 files changed, 129 insertions(+), 31 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> index d4e9702d3c8e..f50ea92910d9 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> @@ -187,6 +187,10 @@
>  #define   MI_BATCH_RESOURCE_STREAMER REG_BIT(10)
>  #define   MI_BATCH_PREDICATE         REG_BIT(15) /* HSW+ on RCS only*/
>  
> +#define MI_OPCODE(x)		(((x) >> 23) & 0x3f)
> +#define IS_MI_LRI_CMD(x)	(MI_OPCODE(x) == MI_OPCODE(MI_INSTR(0x22, 0)))
> +#define MI_LRI_LEN(x)		(((x) & 0xff) + 1)
> +
>  /*
>   * 3D instructions used by the kernel
>   */
> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> index cd57b5836386..b292aa39633e 100644
> --- a/drivers/gpu/drm/i915/i915_perf.c
> +++ b/drivers/gpu/drm/i915/i915_perf.c
> @@ -1358,6 +1358,68 @@ static int gen12_get_render_context_id(struct i915_perf_stream *stream)
>  	return 0;
>  }
>  
> +#define valid_oactxctrl_offset(x) ((x) && (x) != U32_MAX)
> +static bool oa_find_reg_in_lri(u32 *state, u32 reg, u32 *offset, u32 end)
> +{
> +	u32 idx = *offset;
> +	u32 len = min(MI_LRI_LEN(state[idx]) + idx, end);

I don't understand any of this stuff, but why are the offset, index and
length u32 instead of just, say, int?

> +	bool found = false;
> +
> +	idx++;
> +	for (; idx < len; idx += 2) {

I find the initialization of idx and len confusing, and thereby the
entire loop too.

BR,
Jani.


> +		if (state[idx] == reg) {
> +			found = true;
> +			break;
> +		}
> +	}
> +
> +	*offset = idx;
> +	return found;
> +}
> +
> +static u32 oa_context_image_offset(struct intel_context *ce, u32 reg)
> +{
> +	u32 offset, len = (ce->engine->context_size - PAGE_SIZE) / 4;
> +	u32 *state = ce->lrc_reg_state;
> +
> +	for (offset = 0; offset < len; ) {
> +		if (IS_MI_LRI_CMD(state[offset])) {
> +			if (oa_find_reg_in_lri(state, reg, &offset, len))
> +				break;
> +		} else {
> +			offset++;
> +		}
> +	}
> +
> +	return offset < len ? offset : U32_MAX;
> +}
> +
> +static int set_oa_ctx_ctrl_offset(struct intel_context *ce)
> +{
> +	i915_reg_t reg = GEN12_OACTXCONTROL(ce->engine->mmio_base);
> +	struct i915_perf *perf = &ce->engine->i915->perf;
> +	u32 offset = perf->ctx_oactxctrl_offset;
> +
> +	/* Do this only once. Failure is stored as offset of U32_MAX */
> +	if (offset)
> +		goto exit;
> +
> +	offset = oa_context_image_offset(ce, i915_mmio_reg_offset(reg));
> +	perf->ctx_oactxctrl_offset = offset;
> +
> +	drm_dbg(&ce->engine->i915->drm,
> +		"%s oa ctx control at 0x%08x dword offset\n",
> +		ce->engine->name, offset);
> +
> +exit:
> +	return valid_oactxctrl_offset(offset) ? 0 : -ENODEV;
> +}
> +
> +static bool engine_supports_mi_query(struct intel_engine_cs *engine)
> +{
> +	return engine->class == RENDER_CLASS;
> +}
> +
>  /**
>   * oa_get_render_ctx_id - determine and hold ctx hw id
>   * @stream: An i915-perf stream opened for OA metrics
> @@ -1377,6 +1439,21 @@ static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
>  	if (IS_ERR(ce))
>  		return PTR_ERR(ce);
>  
> +	if (engine_supports_mi_query(stream->engine)) {
> +		/*
> +		 * We are enabling perf query here. If we don't find the context
> +		 * offset here, just return an error.
> +		 */
> +		ret = set_oa_ctx_ctrl_offset(ce);
> +		if (ret) {
> +			intel_context_unpin(ce);
> +			drm_err(&stream->perf->i915->drm,
> +				"Enabling perf query failed for %s\n",
> +				stream->engine->name);
> +			return ret;
> +		}
> +	}
> +
>  	switch (GRAPHICS_VER(ce->engine->i915)) {
>  	case 7: {
>  		/*
> @@ -2408,10 +2485,11 @@ static int gen12_configure_oar_context(struct i915_perf_stream *stream,
>  	int err;
>  	struct intel_context *ce = stream->pinned_ctx;
>  	u32 format = stream->oa_buffer.format;
> +	u32 offset = stream->perf->ctx_oactxctrl_offset;
>  	struct flex regs_context[] = {
>  		{
>  			GEN8_OACTXCONTROL,
> -			stream->perf->ctx_oactxctrl_offset + 1,
> +			offset + 1,
>  			active ? GEN8_OA_COUNTER_RESUME : 0,
>  		},
>  	};
> @@ -2436,15 +2514,18 @@ static int gen12_configure_oar_context(struct i915_perf_stream *stream,
>  		},
>  	};
>  
> -	/* Modify the context image of pinned context with regs_context*/
> -	err = intel_context_lock_pinned(ce);
> -	if (err)
> -		return err;
> +	/* Modify the context image of pinned context with regs_context */
> +	if (valid_oactxctrl_offset(offset)) {
> +		err = intel_context_lock_pinned(ce);
> +		if (err)
> +			return err;
>  
> -	err = gen8_modify_context(ce, regs_context, ARRAY_SIZE(regs_context));
> -	intel_context_unlock_pinned(ce);
> -	if (err)
> -		return err;
> +		err = gen8_modify_context(ce, regs_context,
> +					  ARRAY_SIZE(regs_context));
> +		intel_context_unlock_pinned(ce);
> +		if (err)
> +			return err;
> +	}
>  
>  	/* Apply regs_lri using LRI with pinned context */
>  	return gen8_modify_self(ce, regs_lri, ARRAY_SIZE(regs_lri), active);
> @@ -2566,6 +2647,7 @@ lrc_configure_all_contexts(struct i915_perf_stream *stream,
>  			   const struct i915_oa_config *oa_config,
>  			   struct i915_active *active)
>  {
> +	u32 ctx_oactxctrl = stream->perf->ctx_oactxctrl_offset;
>  	/* The MMIO offsets for Flex EU registers aren't contiguous */
>  	const u32 ctx_flexeu0 = stream->perf->ctx_flexeu0_offset;
>  #define ctx_flexeuN(N) (ctx_flexeu0 + 2 * (N) + 1)
> @@ -2576,7 +2658,7 @@ lrc_configure_all_contexts(struct i915_perf_stream *stream,
>  		},
>  		{
>  			GEN8_OACTXCONTROL,
> -			stream->perf->ctx_oactxctrl_offset + 1,
> +			ctx_oactxctrl + 1,
>  		},
>  		{ EU_PERF_CNTL0, ctx_flexeuN(0) },
>  		{ EU_PERF_CNTL1, ctx_flexeuN(1) },
> @@ -4545,6 +4627,37 @@ static void oa_init_supported_formats(struct i915_perf *perf)
>  	}
>  }
>  
> +static void i915_perf_init_info(struct drm_i915_private *i915)
> +{
> +	struct i915_perf *perf = &i915->perf;
> +
> +	switch (GRAPHICS_VER(i915)) {
> +	case 8:
> +		perf->ctx_oactxctrl_offset = 0x120;
> +		perf->ctx_flexeu0_offset = 0x2ce;
> +		perf->gen8_valid_ctx_bit = BIT(25);
> +		break;
> +	case 9:
> +		perf->ctx_oactxctrl_offset = 0x128;
> +		perf->ctx_flexeu0_offset = 0x3de;
> +		perf->gen8_valid_ctx_bit = BIT(16);
> +		break;
> +	case 11:
> +		perf->ctx_oactxctrl_offset = 0x124;
> +		perf->ctx_flexeu0_offset = 0x78e;
> +		perf->gen8_valid_ctx_bit = BIT(16);
> +		break;
> +	case 12:
> +		/*
> +		 * Calculate offset at runtime in oa_pin_context for gen12 and
> +		 * cache the value in perf->ctx_oactxctrl_offset.
> +		 */
> +		break;
> +	default:
> +		MISSING_CASE(GRAPHICS_VER(i915));
> +	}
> +}
> +
>  /**
>   * i915_perf_init - initialize i915-perf state on module bind
>   * @i915: i915 device instance
> @@ -4583,6 +4696,7 @@ void i915_perf_init(struct drm_i915_private *i915)
>  		 * execlist mode by default.
>  		 */
>  		perf->ops.read = gen8_oa_read;
> +		i915_perf_init_info(i915);
>  
>  		if (IS_GRAPHICS_VER(i915, 8, 9)) {
>  			perf->ops.is_valid_b_counter_reg =
> @@ -4602,18 +4716,6 @@ void i915_perf_init(struct drm_i915_private *i915)
>  			perf->ops.enable_metric_set = gen8_enable_metric_set;
>  			perf->ops.disable_metric_set = gen8_disable_metric_set;
>  			perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
> -
> -			if (GRAPHICS_VER(i915) == 8) {
> -				perf->ctx_oactxctrl_offset = 0x120;
> -				perf->ctx_flexeu0_offset = 0x2ce;
> -
> -				perf->gen8_valid_ctx_bit = BIT(25);
> -			} else {
> -				perf->ctx_oactxctrl_offset = 0x128;
> -				perf->ctx_flexeu0_offset = 0x3de;
> -
> -				perf->gen8_valid_ctx_bit = BIT(16);
> -			}
>  		} else if (GRAPHICS_VER(i915) == 11) {
>  			perf->ops.is_valid_b_counter_reg =
>  				gen7_is_valid_b_counter_addr;
> @@ -4627,11 +4729,6 @@ void i915_perf_init(struct drm_i915_private *i915)
>  			perf->ops.enable_metric_set = gen8_enable_metric_set;
>  			perf->ops.disable_metric_set = gen11_disable_metric_set;
>  			perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
> -
> -			perf->ctx_oactxctrl_offset = 0x124;
> -			perf->ctx_flexeu0_offset = 0x78e;
> -
> -			perf->gen8_valid_ctx_bit = BIT(16);
>  		} else if (GRAPHICS_VER(i915) == 12) {
>  			perf->ops.is_valid_b_counter_reg =
>  				gen12_is_valid_b_counter_addr;
> @@ -4645,9 +4742,6 @@ void i915_perf_init(struct drm_i915_private *i915)
>  			perf->ops.enable_metric_set = gen12_enable_metric_set;
>  			perf->ops.disable_metric_set = gen12_disable_metric_set;
>  			perf->ops.oa_hw_tail_read = gen12_oa_hw_tail_read;
> -
> -			perf->ctx_flexeu0_offset = 0;
> -			perf->ctx_oactxctrl_offset = 0x144;
>  		}
>  	}
>  
> diff --git a/drivers/gpu/drm/i915/i915_perf_oa_regs.h b/drivers/gpu/drm/i915/i915_perf_oa_regs.h
> index f31c9f13a9fc..0ef3562ff4aa 100644
> --- a/drivers/gpu/drm/i915/i915_perf_oa_regs.h
> +++ b/drivers/gpu/drm/i915/i915_perf_oa_regs.h
> @@ -97,7 +97,7 @@
>  #define  GEN12_OAR_OACONTROL_COUNTER_FORMAT_SHIFT 1
>  #define  GEN12_OAR_OACONTROL_COUNTER_ENABLE       (1 << 0)
>  
> -#define GEN12_OACTXCONTROL _MMIO(0x2360)
> +#define GEN12_OACTXCONTROL(base) _MMIO((base) + 0x360)
>  #define GEN12_OAR_OASTATUS _MMIO(0x2968)
>  
>  /* Gen12 OAG unit */
Umesh Nerlige Ramappa Oct. 11, 2022, 6:19 p.m. UTC | #4
On Tue, Oct 11, 2022 at 08:47:00PM +0300, Jani Nikula wrote:
>On Mon, 10 Oct 2022, Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com> wrote:
>> Some SKUs of same gen12 platform may have different oactxctrl
>> offsets. For gen12, determine oactxctrl offsets at runtime.
>>
>> v2: (Lionel)
>> - Move MI definitions to intel_gpu_commands.h
>> - Ensure __find_reg_in_lri does read past context image size
>>
>> v3: (Ashutosh)
>> - Drop unnecessary use of double underscores
>> - fix find_reg_in_lri
>> - Return error if oa context offset is U32_MAX
>> - Error out if oa_ctx_ctrl_offset does not find offset
>>
>> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>> ---
>>  drivers/gpu/drm/i915/gt/intel_gpu_commands.h |   4 +
>>  drivers/gpu/drm/i915/i915_perf.c             | 154 +++++++++++++++----
>>  drivers/gpu/drm/i915/i915_perf_oa_regs.h     |   2 +-
>>  3 files changed, 129 insertions(+), 31 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
>> index d4e9702d3c8e..f50ea92910d9 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
>> +++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
>> @@ -187,6 +187,10 @@
>>  #define   MI_BATCH_RESOURCE_STREAMER REG_BIT(10)
>>  #define   MI_BATCH_PREDICATE         REG_BIT(15) /* HSW+ on RCS only*/
>>
>> +#define MI_OPCODE(x)		(((x) >> 23) & 0x3f)
>> +#define IS_MI_LRI_CMD(x)	(MI_OPCODE(x) == MI_OPCODE(MI_INSTR(0x22, 0)))
>> +#define MI_LRI_LEN(x)		(((x) & 0xff) + 1)
>> +
>>  /*
>>   * 3D instructions used by the kernel
>>   */
>> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
>> index cd57b5836386..b292aa39633e 100644
>> --- a/drivers/gpu/drm/i915/i915_perf.c
>> +++ b/drivers/gpu/drm/i915/i915_perf.c
>> @@ -1358,6 +1358,68 @@ static int gen12_get_render_context_id(struct i915_perf_stream *stream)
>>  	return 0;
>>  }
>>
>> +#define valid_oactxctrl_offset(x) ((x) && (x) != U32_MAX)
>> +static bool oa_find_reg_in_lri(u32 *state, u32 reg, u32 *offset, u32 end)
>> +{
>> +	u32 idx = *offset;
>> +	u32 len = min(MI_LRI_LEN(state[idx]) + idx, end);
>
>I don't understand any of this stuff, but why are the offset, index and
>length u32 instead of just, say, int?

I can use int, but the ce->engine->context_size is u32 and we are 
parsing the context image here, so I have just used the same type for 
offset, index, length.

Any guideline/recommendation to choose int over u32?

>
>> +	bool found = false;
>> +
>> +	idx++;
>> +	for (; idx < len; idx += 2) {
>
>I find the initialization of idx and len confusing, and thereby the
>entire loop too.

Considering that the context image is a collection of MI_LRI commands 
with each command having this format:

dword 0: MI_LRI
dword 1: reg offset
dword 2: reg value
dword 3: reg offset
dword 4: reg value
...

oa_context_image_offset() and oa_find_reg_in_lri() are parsing this 
context image to look for a specific reg_offset. Once the offset is 
known, the OA code programs the reg_value for the context.

Thanks,
Umesh

>
>BR,
>Jani.
>
>
>> +		if (state[idx] == reg) {
>> +			found = true;
>> +			break;
>> +		}
>> +	}
>> +
>> +	*offset = idx;
>> +	return found;
>> +}
>> +
>> +static u32 oa_context_image_offset(struct intel_context *ce, u32 reg)
>> +{
>> +	u32 offset, len = (ce->engine->context_size - PAGE_SIZE) / 4;
>> +	u32 *state = ce->lrc_reg_state;
>> +
>> +	for (offset = 0; offset < len; ) {
>> +		if (IS_MI_LRI_CMD(state[offset])) {
>> +			if (oa_find_reg_in_lri(state, reg, &offset, len))
>> +				break;
>> +		} else {
>> +			offset++;
>> +		}
>> +	}
>> +
>> +	return offset < len ? offset : U32_MAX;
>> +}
>> +
>> +static int set_oa_ctx_ctrl_offset(struct intel_context *ce)
>> +{
>> +	i915_reg_t reg = GEN12_OACTXCONTROL(ce->engine->mmio_base);
>> +	struct i915_perf *perf = &ce->engine->i915->perf;
>> +	u32 offset = perf->ctx_oactxctrl_offset;
>> +
>> +	/* Do this only once. Failure is stored as offset of U32_MAX */
>> +	if (offset)
>> +		goto exit;
>> +
>> +	offset = oa_context_image_offset(ce, i915_mmio_reg_offset(reg));
>> +	perf->ctx_oactxctrl_offset = offset;
>> +
>> +	drm_dbg(&ce->engine->i915->drm,
>> +		"%s oa ctx control at 0x%08x dword offset\n",
>> +		ce->engine->name, offset);
>> +
>> +exit:
>> +	return valid_oactxctrl_offset(offset) ? 0 : -ENODEV;
>> +}
>> +
>> +static bool engine_supports_mi_query(struct intel_engine_cs *engine)
>> +{
>> +	return engine->class == RENDER_CLASS;
>> +}
>> +
>>  /**
>>   * oa_get_render_ctx_id - determine and hold ctx hw id
>>   * @stream: An i915-perf stream opened for OA metrics
>> @@ -1377,6 +1439,21 @@ static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
>>  	if (IS_ERR(ce))
>>  		return PTR_ERR(ce);
>>
>> +	if (engine_supports_mi_query(stream->engine)) {
>> +		/*
>> +		 * We are enabling perf query here. If we don't find the context
>> +		 * offset here, just return an error.
>> +		 */
>> +		ret = set_oa_ctx_ctrl_offset(ce);
>> +		if (ret) {
>> +			intel_context_unpin(ce);
>> +			drm_err(&stream->perf->i915->drm,
>> +				"Enabling perf query failed for %s\n",
>> +				stream->engine->name);
>> +			return ret;
>> +		}
>> +	}
>> +
>>  	switch (GRAPHICS_VER(ce->engine->i915)) {
>>  	case 7: {
>>  		/*
>> @@ -2408,10 +2485,11 @@ static int gen12_configure_oar_context(struct i915_perf_stream *stream,
>>  	int err;
>>  	struct intel_context *ce = stream->pinned_ctx;
>>  	u32 format = stream->oa_buffer.format;
>> +	u32 offset = stream->perf->ctx_oactxctrl_offset;
>>  	struct flex regs_context[] = {
>>  		{
>>  			GEN8_OACTXCONTROL,
>> -			stream->perf->ctx_oactxctrl_offset + 1,
>> +			offset + 1,
>>  			active ? GEN8_OA_COUNTER_RESUME : 0,
>>  		},
>>  	};
>> @@ -2436,15 +2514,18 @@ static int gen12_configure_oar_context(struct i915_perf_stream *stream,
>>  		},
>>  	};
>>
>> -	/* Modify the context image of pinned context with regs_context*/
>> -	err = intel_context_lock_pinned(ce);
>> -	if (err)
>> -		return err;
>> +	/* Modify the context image of pinned context with regs_context */
>> +	if (valid_oactxctrl_offset(offset)) {
>> +		err = intel_context_lock_pinned(ce);
>> +		if (err)
>> +			return err;
>>
>> -	err = gen8_modify_context(ce, regs_context, ARRAY_SIZE(regs_context));
>> -	intel_context_unlock_pinned(ce);
>> -	if (err)
>> -		return err;
>> +		err = gen8_modify_context(ce, regs_context,
>> +					  ARRAY_SIZE(regs_context));
>> +		intel_context_unlock_pinned(ce);
>> +		if (err)
>> +			return err;
>> +	}
>>
>>  	/* Apply regs_lri using LRI with pinned context */
>>  	return gen8_modify_self(ce, regs_lri, ARRAY_SIZE(regs_lri), active);
>> @@ -2566,6 +2647,7 @@ lrc_configure_all_contexts(struct i915_perf_stream *stream,
>>  			   const struct i915_oa_config *oa_config,
>>  			   struct i915_active *active)
>>  {
>> +	u32 ctx_oactxctrl = stream->perf->ctx_oactxctrl_offset;
>>  	/* The MMIO offsets for Flex EU registers aren't contiguous */
>>  	const u32 ctx_flexeu0 = stream->perf->ctx_flexeu0_offset;
>>  #define ctx_flexeuN(N) (ctx_flexeu0 + 2 * (N) + 1)
>> @@ -2576,7 +2658,7 @@ lrc_configure_all_contexts(struct i915_perf_stream *stream,
>>  		},
>>  		{
>>  			GEN8_OACTXCONTROL,
>> -			stream->perf->ctx_oactxctrl_offset + 1,
>> +			ctx_oactxctrl + 1,
>>  		},
>>  		{ EU_PERF_CNTL0, ctx_flexeuN(0) },
>>  		{ EU_PERF_CNTL1, ctx_flexeuN(1) },
>> @@ -4545,6 +4627,37 @@ static void oa_init_supported_formats(struct i915_perf *perf)
>>  	}
>>  }
>>
>> +static void i915_perf_init_info(struct drm_i915_private *i915)
>> +{
>> +	struct i915_perf *perf = &i915->perf;
>> +
>> +	switch (GRAPHICS_VER(i915)) {
>> +	case 8:
>> +		perf->ctx_oactxctrl_offset = 0x120;
>> +		perf->ctx_flexeu0_offset = 0x2ce;
>> +		perf->gen8_valid_ctx_bit = BIT(25);
>> +		break;
>> +	case 9:
>> +		perf->ctx_oactxctrl_offset = 0x128;
>> +		perf->ctx_flexeu0_offset = 0x3de;
>> +		perf->gen8_valid_ctx_bit = BIT(16);
>> +		break;
>> +	case 11:
>> +		perf->ctx_oactxctrl_offset = 0x124;
>> +		perf->ctx_flexeu0_offset = 0x78e;
>> +		perf->gen8_valid_ctx_bit = BIT(16);
>> +		break;
>> +	case 12:
>> +		/*
>> +		 * Calculate offset at runtime in oa_pin_context for gen12 and
>> +		 * cache the value in perf->ctx_oactxctrl_offset.
>> +		 */
>> +		break;
>> +	default:
>> +		MISSING_CASE(GRAPHICS_VER(i915));
>> +	}
>> +}
>> +
>>  /**
>>   * i915_perf_init - initialize i915-perf state on module bind
>>   * @i915: i915 device instance
>> @@ -4583,6 +4696,7 @@ void i915_perf_init(struct drm_i915_private *i915)
>>  		 * execlist mode by default.
>>  		 */
>>  		perf->ops.read = gen8_oa_read;
>> +		i915_perf_init_info(i915);
>>
>>  		if (IS_GRAPHICS_VER(i915, 8, 9)) {
>>  			perf->ops.is_valid_b_counter_reg =
>> @@ -4602,18 +4716,6 @@ void i915_perf_init(struct drm_i915_private *i915)
>>  			perf->ops.enable_metric_set = gen8_enable_metric_set;
>>  			perf->ops.disable_metric_set = gen8_disable_metric_set;
>>  			perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
>> -
>> -			if (GRAPHICS_VER(i915) == 8) {
>> -				perf->ctx_oactxctrl_offset = 0x120;
>> -				perf->ctx_flexeu0_offset = 0x2ce;
>> -
>> -				perf->gen8_valid_ctx_bit = BIT(25);
>> -			} else {
>> -				perf->ctx_oactxctrl_offset = 0x128;
>> -				perf->ctx_flexeu0_offset = 0x3de;
>> -
>> -				perf->gen8_valid_ctx_bit = BIT(16);
>> -			}
>>  		} else if (GRAPHICS_VER(i915) == 11) {
>>  			perf->ops.is_valid_b_counter_reg =
>>  				gen7_is_valid_b_counter_addr;
>> @@ -4627,11 +4729,6 @@ void i915_perf_init(struct drm_i915_private *i915)
>>  			perf->ops.enable_metric_set = gen8_enable_metric_set;
>>  			perf->ops.disable_metric_set = gen11_disable_metric_set;
>>  			perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
>> -
>> -			perf->ctx_oactxctrl_offset = 0x124;
>> -			perf->ctx_flexeu0_offset = 0x78e;
>> -
>> -			perf->gen8_valid_ctx_bit = BIT(16);
>>  		} else if (GRAPHICS_VER(i915) == 12) {
>>  			perf->ops.is_valid_b_counter_reg =
>>  				gen12_is_valid_b_counter_addr;
>> @@ -4645,9 +4742,6 @@ void i915_perf_init(struct drm_i915_private *i915)
>>  			perf->ops.enable_metric_set = gen12_enable_metric_set;
>>  			perf->ops.disable_metric_set = gen12_disable_metric_set;
>>  			perf->ops.oa_hw_tail_read = gen12_oa_hw_tail_read;
>> -
>> -			perf->ctx_flexeu0_offset = 0;
>> -			perf->ctx_oactxctrl_offset = 0x144;
>>  		}
>>  	}
>>
>> diff --git a/drivers/gpu/drm/i915/i915_perf_oa_regs.h b/drivers/gpu/drm/i915/i915_perf_oa_regs.h
>> index f31c9f13a9fc..0ef3562ff4aa 100644
>> --- a/drivers/gpu/drm/i915/i915_perf_oa_regs.h
>> +++ b/drivers/gpu/drm/i915/i915_perf_oa_regs.h
>> @@ -97,7 +97,7 @@
>>  #define  GEN12_OAR_OACONTROL_COUNTER_FORMAT_SHIFT 1
>>  #define  GEN12_OAR_OACONTROL_COUNTER_ENABLE       (1 << 0)
>>
>> -#define GEN12_OACTXCONTROL _MMIO(0x2360)
>> +#define GEN12_OACTXCONTROL(base) _MMIO((base) + 0x360)
>>  #define GEN12_OAR_OASTATUS _MMIO(0x2968)
>>
>>  /* Gen12 OAG unit */
>
>-- 
>Jani Nikula, Intel Open Source Graphics Center
Jani Nikula Oct. 12, 2022, 4:12 p.m. UTC | #5
On Tue, 11 Oct 2022, Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com> wrote:
> On Tue, Oct 11, 2022 at 08:47:00PM +0300, Jani Nikula wrote:
>>On Mon, 10 Oct 2022, Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com> wrote:
>>> Some SKUs of same gen12 platform may have different oactxctrl
>>> offsets. For gen12, determine oactxctrl offsets at runtime.
>>>
>>> v2: (Lionel)
>>> - Move MI definitions to intel_gpu_commands.h
>>> - Ensure __find_reg_in_lri does read past context image size
>>>
>>> v3: (Ashutosh)
>>> - Drop unnecessary use of double underscores
>>> - fix find_reg_in_lri
>>> - Return error if oa context offset is U32_MAX
>>> - Error out if oa_ctx_ctrl_offset does not find offset
>>>
>>> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>> ---
>>>  drivers/gpu/drm/i915/gt/intel_gpu_commands.h |   4 +
>>>  drivers/gpu/drm/i915/i915_perf.c             | 154 +++++++++++++++----
>>>  drivers/gpu/drm/i915/i915_perf_oa_regs.h     |   2 +-
>>>  3 files changed, 129 insertions(+), 31 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
>>> index d4e9702d3c8e..f50ea92910d9 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
>>> +++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
>>> @@ -187,6 +187,10 @@
>>>  #define   MI_BATCH_RESOURCE_STREAMER REG_BIT(10)
>>>  #define   MI_BATCH_PREDICATE         REG_BIT(15) /* HSW+ on RCS only*/
>>>
>>> +#define MI_OPCODE(x)		(((x) >> 23) & 0x3f)
>>> +#define IS_MI_LRI_CMD(x)	(MI_OPCODE(x) == MI_OPCODE(MI_INSTR(0x22, 0)))
>>> +#define MI_LRI_LEN(x)		(((x) & 0xff) + 1)
>>> +
>>>  /*
>>>   * 3D instructions used by the kernel
>>>   */
>>> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
>>> index cd57b5836386..b292aa39633e 100644
>>> --- a/drivers/gpu/drm/i915/i915_perf.c
>>> +++ b/drivers/gpu/drm/i915/i915_perf.c
>>> @@ -1358,6 +1358,68 @@ static int gen12_get_render_context_id(struct i915_perf_stream *stream)
>>>  	return 0;
>>>  }
>>>
>>> +#define valid_oactxctrl_offset(x) ((x) && (x) != U32_MAX)
>>> +static bool oa_find_reg_in_lri(u32 *state, u32 reg, u32 *offset, u32 end)
>>> +{
>>> +	u32 idx = *offset;
>>> +	u32 len = min(MI_LRI_LEN(state[idx]) + idx, end);
>>
>>I don't understand any of this stuff, but why are the offset, index and
>>length u32 instead of just, say, int?
>
> I can use int, but the ce->engine->context_size is u32 and we are 
> parsing the context image here, so I have just used the same type for 
> offset, index, length.
>
> Any guideline/recommendation to choose int over u32?

If it's an index in general, looping, regular computation, just plain
old int will often do. Use u32 and friends when you actually need a
specific size to map to hardware or registers etc.

I guess it's not so clear cut here, and mixing types not necessarily a
bright idea here either.

BR,
Jani.


>
>>
>>> +	bool found = false;
>>> +
>>> +	idx++;
>>> +	for (; idx < len; idx += 2) {
>>
>>I find the initialization of idx and len confusing, and thereby the
>>entire loop too.
>
> Considering that the context image is a collection of MI_LRI commands 
> with each command having this format:
>
> dword 0: MI_LRI
> dword 1: reg offset
> dword 2: reg value
> dword 3: reg offset
> dword 4: reg value
> ...
>
> oa_context_image_offset() and oa_find_reg_in_lri() are parsing this 
> context image to look for a specific reg_offset. Once the offset is 
> known, the OA code programs the reg_value for the context.
>
> Thanks,
> Umesh
>
>>
>>BR,
>>Jani.
>>
>>
>>> +		if (state[idx] == reg) {
>>> +			found = true;
>>> +			break;
>>> +		}
>>> +	}
>>> +
>>> +	*offset = idx;
>>> +	return found;
>>> +}
>>> +
>>> +static u32 oa_context_image_offset(struct intel_context *ce, u32 reg)
>>> +{
>>> +	u32 offset, len = (ce->engine->context_size - PAGE_SIZE) / 4;
>>> +	u32 *state = ce->lrc_reg_state;
>>> +
>>> +	for (offset = 0; offset < len; ) {
>>> +		if (IS_MI_LRI_CMD(state[offset])) {
>>> +			if (oa_find_reg_in_lri(state, reg, &offset, len))
>>> +				break;
>>> +		} else {
>>> +			offset++;
>>> +		}
>>> +	}
>>> +
>>> +	return offset < len ? offset : U32_MAX;
>>> +}
>>> +
>>> +static int set_oa_ctx_ctrl_offset(struct intel_context *ce)
>>> +{
>>> +	i915_reg_t reg = GEN12_OACTXCONTROL(ce->engine->mmio_base);
>>> +	struct i915_perf *perf = &ce->engine->i915->perf;
>>> +	u32 offset = perf->ctx_oactxctrl_offset;
>>> +
>>> +	/* Do this only once. Failure is stored as offset of U32_MAX */
>>> +	if (offset)
>>> +		goto exit;
>>> +
>>> +	offset = oa_context_image_offset(ce, i915_mmio_reg_offset(reg));
>>> +	perf->ctx_oactxctrl_offset = offset;
>>> +
>>> +	drm_dbg(&ce->engine->i915->drm,
>>> +		"%s oa ctx control at 0x%08x dword offset\n",
>>> +		ce->engine->name, offset);
>>> +
>>> +exit:
>>> +	return valid_oactxctrl_offset(offset) ? 0 : -ENODEV;
>>> +}
>>> +
>>> +static bool engine_supports_mi_query(struct intel_engine_cs *engine)
>>> +{
>>> +	return engine->class == RENDER_CLASS;
>>> +}
>>> +
>>>  /**
>>>   * oa_get_render_ctx_id - determine and hold ctx hw id
>>>   * @stream: An i915-perf stream opened for OA metrics
>>> @@ -1377,6 +1439,21 @@ static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
>>>  	if (IS_ERR(ce))
>>>  		return PTR_ERR(ce);
>>>
>>> +	if (engine_supports_mi_query(stream->engine)) {
>>> +		/*
>>> +		 * We are enabling perf query here. If we don't find the context
>>> +		 * offset here, just return an error.
>>> +		 */
>>> +		ret = set_oa_ctx_ctrl_offset(ce);
>>> +		if (ret) {
>>> +			intel_context_unpin(ce);
>>> +			drm_err(&stream->perf->i915->drm,
>>> +				"Enabling perf query failed for %s\n",
>>> +				stream->engine->name);
>>> +			return ret;
>>> +		}
>>> +	}
>>> +
>>>  	switch (GRAPHICS_VER(ce->engine->i915)) {
>>>  	case 7: {
>>>  		/*
>>> @@ -2408,10 +2485,11 @@ static int gen12_configure_oar_context(struct i915_perf_stream *stream,
>>>  	int err;
>>>  	struct intel_context *ce = stream->pinned_ctx;
>>>  	u32 format = stream->oa_buffer.format;
>>> +	u32 offset = stream->perf->ctx_oactxctrl_offset;
>>>  	struct flex regs_context[] = {
>>>  		{
>>>  			GEN8_OACTXCONTROL,
>>> -			stream->perf->ctx_oactxctrl_offset + 1,
>>> +			offset + 1,
>>>  			active ? GEN8_OA_COUNTER_RESUME : 0,
>>>  		},
>>>  	};
>>> @@ -2436,15 +2514,18 @@ static int gen12_configure_oar_context(struct i915_perf_stream *stream,
>>>  		},
>>>  	};
>>>
>>> -	/* Modify the context image of pinned context with regs_context*/
>>> -	err = intel_context_lock_pinned(ce);
>>> -	if (err)
>>> -		return err;
>>> +	/* Modify the context image of pinned context with regs_context */
>>> +	if (valid_oactxctrl_offset(offset)) {
>>> +		err = intel_context_lock_pinned(ce);
>>> +		if (err)
>>> +			return err;
>>>
>>> -	err = gen8_modify_context(ce, regs_context, ARRAY_SIZE(regs_context));
>>> -	intel_context_unlock_pinned(ce);
>>> -	if (err)
>>> -		return err;
>>> +		err = gen8_modify_context(ce, regs_context,
>>> +					  ARRAY_SIZE(regs_context));
>>> +		intel_context_unlock_pinned(ce);
>>> +		if (err)
>>> +			return err;
>>> +	}
>>>
>>>  	/* Apply regs_lri using LRI with pinned context */
>>>  	return gen8_modify_self(ce, regs_lri, ARRAY_SIZE(regs_lri), active);
>>> @@ -2566,6 +2647,7 @@ lrc_configure_all_contexts(struct i915_perf_stream *stream,
>>>  			   const struct i915_oa_config *oa_config,
>>>  			   struct i915_active *active)
>>>  {
>>> +	u32 ctx_oactxctrl = stream->perf->ctx_oactxctrl_offset;
>>>  	/* The MMIO offsets for Flex EU registers aren't contiguous */
>>>  	const u32 ctx_flexeu0 = stream->perf->ctx_flexeu0_offset;
>>>  #define ctx_flexeuN(N) (ctx_flexeu0 + 2 * (N) + 1)
>>> @@ -2576,7 +2658,7 @@ lrc_configure_all_contexts(struct i915_perf_stream *stream,
>>>  		},
>>>  		{
>>>  			GEN8_OACTXCONTROL,
>>> -			stream->perf->ctx_oactxctrl_offset + 1,
>>> +			ctx_oactxctrl + 1,
>>>  		},
>>>  		{ EU_PERF_CNTL0, ctx_flexeuN(0) },
>>>  		{ EU_PERF_CNTL1, ctx_flexeuN(1) },
>>> @@ -4545,6 +4627,37 @@ static void oa_init_supported_formats(struct i915_perf *perf)
>>>  	}
>>>  }
>>>
>>> +static void i915_perf_init_info(struct drm_i915_private *i915)
>>> +{
>>> +	struct i915_perf *perf = &i915->perf;
>>> +
>>> +	switch (GRAPHICS_VER(i915)) {
>>> +	case 8:
>>> +		perf->ctx_oactxctrl_offset = 0x120;
>>> +		perf->ctx_flexeu0_offset = 0x2ce;
>>> +		perf->gen8_valid_ctx_bit = BIT(25);
>>> +		break;
>>> +	case 9:
>>> +		perf->ctx_oactxctrl_offset = 0x128;
>>> +		perf->ctx_flexeu0_offset = 0x3de;
>>> +		perf->gen8_valid_ctx_bit = BIT(16);
>>> +		break;
>>> +	case 11:
>>> +		perf->ctx_oactxctrl_offset = 0x124;
>>> +		perf->ctx_flexeu0_offset = 0x78e;
>>> +		perf->gen8_valid_ctx_bit = BIT(16);
>>> +		break;
>>> +	case 12:
>>> +		/*
>>> +		 * Calculate offset at runtime in oa_pin_context for gen12 and
>>> +		 * cache the value in perf->ctx_oactxctrl_offset.
>>> +		 */
>>> +		break;
>>> +	default:
>>> +		MISSING_CASE(GRAPHICS_VER(i915));
>>> +	}
>>> +}
>>> +
>>>  /**
>>>   * i915_perf_init - initialize i915-perf state on module bind
>>>   * @i915: i915 device instance
>>> @@ -4583,6 +4696,7 @@ void i915_perf_init(struct drm_i915_private *i915)
>>>  		 * execlist mode by default.
>>>  		 */
>>>  		perf->ops.read = gen8_oa_read;
>>> +		i915_perf_init_info(i915);
>>>
>>>  		if (IS_GRAPHICS_VER(i915, 8, 9)) {
>>>  			perf->ops.is_valid_b_counter_reg =
>>> @@ -4602,18 +4716,6 @@ void i915_perf_init(struct drm_i915_private *i915)
>>>  			perf->ops.enable_metric_set = gen8_enable_metric_set;
>>>  			perf->ops.disable_metric_set = gen8_disable_metric_set;
>>>  			perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
>>> -
>>> -			if (GRAPHICS_VER(i915) == 8) {
>>> -				perf->ctx_oactxctrl_offset = 0x120;
>>> -				perf->ctx_flexeu0_offset = 0x2ce;
>>> -
>>> -				perf->gen8_valid_ctx_bit = BIT(25);
>>> -			} else {
>>> -				perf->ctx_oactxctrl_offset = 0x128;
>>> -				perf->ctx_flexeu0_offset = 0x3de;
>>> -
>>> -				perf->gen8_valid_ctx_bit = BIT(16);
>>> -			}
>>>  		} else if (GRAPHICS_VER(i915) == 11) {
>>>  			perf->ops.is_valid_b_counter_reg =
>>>  				gen7_is_valid_b_counter_addr;
>>> @@ -4627,11 +4729,6 @@ void i915_perf_init(struct drm_i915_private *i915)
>>>  			perf->ops.enable_metric_set = gen8_enable_metric_set;
>>>  			perf->ops.disable_metric_set = gen11_disable_metric_set;
>>>  			perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
>>> -
>>> -			perf->ctx_oactxctrl_offset = 0x124;
>>> -			perf->ctx_flexeu0_offset = 0x78e;
>>> -
>>> -			perf->gen8_valid_ctx_bit = BIT(16);
>>>  		} else if (GRAPHICS_VER(i915) == 12) {
>>>  			perf->ops.is_valid_b_counter_reg =
>>>  				gen12_is_valid_b_counter_addr;
>>> @@ -4645,9 +4742,6 @@ void i915_perf_init(struct drm_i915_private *i915)
>>>  			perf->ops.enable_metric_set = gen12_enable_metric_set;
>>>  			perf->ops.disable_metric_set = gen12_disable_metric_set;
>>>  			perf->ops.oa_hw_tail_read = gen12_oa_hw_tail_read;
>>> -
>>> -			perf->ctx_flexeu0_offset = 0;
>>> -			perf->ctx_oactxctrl_offset = 0x144;
>>>  		}
>>>  	}
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_perf_oa_regs.h b/drivers/gpu/drm/i915/i915_perf_oa_regs.h
>>> index f31c9f13a9fc..0ef3562ff4aa 100644
>>> --- a/drivers/gpu/drm/i915/i915_perf_oa_regs.h
>>> +++ b/drivers/gpu/drm/i915/i915_perf_oa_regs.h
>>> @@ -97,7 +97,7 @@
>>>  #define  GEN12_OAR_OACONTROL_COUNTER_FORMAT_SHIFT 1
>>>  #define  GEN12_OAR_OACONTROL_COUNTER_ENABLE       (1 << 0)
>>>
>>> -#define GEN12_OACTXCONTROL _MMIO(0x2360)
>>> +#define GEN12_OACTXCONTROL(base) _MMIO((base) + 0x360)
>>>  #define GEN12_OAR_OASTATUS _MMIO(0x2968)
>>>
>>>  /* Gen12 OAG unit */
>>
>>-- 
>>Jani Nikula, Intel Open Source Graphics Center
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
index d4e9702d3c8e..f50ea92910d9 100644
--- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
@@ -187,6 +187,10 @@ 
 #define   MI_BATCH_RESOURCE_STREAMER REG_BIT(10)
 #define   MI_BATCH_PREDICATE         REG_BIT(15) /* HSW+ on RCS only*/
 
+#define MI_OPCODE(x)		(((x) >> 23) & 0x3f)
+#define IS_MI_LRI_CMD(x)	(MI_OPCODE(x) == MI_OPCODE(MI_INSTR(0x22, 0)))
+#define MI_LRI_LEN(x)		(((x) & 0xff) + 1)
+
 /*
  * 3D instructions used by the kernel
  */
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index cd57b5836386..b292aa39633e 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1358,6 +1358,68 @@  static int gen12_get_render_context_id(struct i915_perf_stream *stream)
 	return 0;
 }
 
+#define valid_oactxctrl_offset(x) ((x) && (x) != U32_MAX)
+static bool oa_find_reg_in_lri(u32 *state, u32 reg, u32 *offset, u32 end)
+{
+	u32 idx = *offset;
+	u32 len = min(MI_LRI_LEN(state[idx]) + idx, end);
+	bool found = false;
+
+	idx++;
+	for (; idx < len; idx += 2) {
+		if (state[idx] == reg) {
+			found = true;
+			break;
+		}
+	}
+
+	*offset = idx;
+	return found;
+}
+
+static u32 oa_context_image_offset(struct intel_context *ce, u32 reg)
+{
+	u32 offset, len = (ce->engine->context_size - PAGE_SIZE) / 4;
+	u32 *state = ce->lrc_reg_state;
+
+	for (offset = 0; offset < len; ) {
+		if (IS_MI_LRI_CMD(state[offset])) {
+			if (oa_find_reg_in_lri(state, reg, &offset, len))
+				break;
+		} else {
+			offset++;
+		}
+	}
+
+	return offset < len ? offset : U32_MAX;
+}
+
+static int set_oa_ctx_ctrl_offset(struct intel_context *ce)
+{
+	i915_reg_t reg = GEN12_OACTXCONTROL(ce->engine->mmio_base);
+	struct i915_perf *perf = &ce->engine->i915->perf;
+	u32 offset = perf->ctx_oactxctrl_offset;
+
+	/* Do this only once. Failure is stored as offset of U32_MAX */
+	if (offset)
+		goto exit;
+
+	offset = oa_context_image_offset(ce, i915_mmio_reg_offset(reg));
+	perf->ctx_oactxctrl_offset = offset;
+
+	drm_dbg(&ce->engine->i915->drm,
+		"%s oa ctx control at 0x%08x dword offset\n",
+		ce->engine->name, offset);
+
+exit:
+	return valid_oactxctrl_offset(offset) ? 0 : -ENODEV;
+}
+
+static bool engine_supports_mi_query(struct intel_engine_cs *engine)
+{
+	return engine->class == RENDER_CLASS;
+}
+
 /**
  * oa_get_render_ctx_id - determine and hold ctx hw id
  * @stream: An i915-perf stream opened for OA metrics
@@ -1377,6 +1439,21 @@  static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
 	if (IS_ERR(ce))
 		return PTR_ERR(ce);
 
+	if (engine_supports_mi_query(stream->engine)) {
+		/*
+		 * We are enabling perf query here. If we don't find the context
+		 * offset here, just return an error.
+		 */
+		ret = set_oa_ctx_ctrl_offset(ce);
+		if (ret) {
+			intel_context_unpin(ce);
+			drm_err(&stream->perf->i915->drm,
+				"Enabling perf query failed for %s\n",
+				stream->engine->name);
+			return ret;
+		}
+	}
+
 	switch (GRAPHICS_VER(ce->engine->i915)) {
 	case 7: {
 		/*
@@ -2408,10 +2485,11 @@  static int gen12_configure_oar_context(struct i915_perf_stream *stream,
 	int err;
 	struct intel_context *ce = stream->pinned_ctx;
 	u32 format = stream->oa_buffer.format;
+	u32 offset = stream->perf->ctx_oactxctrl_offset;
 	struct flex regs_context[] = {
 		{
 			GEN8_OACTXCONTROL,
-			stream->perf->ctx_oactxctrl_offset + 1,
+			offset + 1,
 			active ? GEN8_OA_COUNTER_RESUME : 0,
 		},
 	};
@@ -2436,15 +2514,18 @@  static int gen12_configure_oar_context(struct i915_perf_stream *stream,
 		},
 	};
 
-	/* Modify the context image of pinned context with regs_context*/
-	err = intel_context_lock_pinned(ce);
-	if (err)
-		return err;
+	/* Modify the context image of pinned context with regs_context */
+	if (valid_oactxctrl_offset(offset)) {
+		err = intel_context_lock_pinned(ce);
+		if (err)
+			return err;
 
-	err = gen8_modify_context(ce, regs_context, ARRAY_SIZE(regs_context));
-	intel_context_unlock_pinned(ce);
-	if (err)
-		return err;
+		err = gen8_modify_context(ce, regs_context,
+					  ARRAY_SIZE(regs_context));
+		intel_context_unlock_pinned(ce);
+		if (err)
+			return err;
+	}
 
 	/* Apply regs_lri using LRI with pinned context */
 	return gen8_modify_self(ce, regs_lri, ARRAY_SIZE(regs_lri), active);
@@ -2566,6 +2647,7 @@  lrc_configure_all_contexts(struct i915_perf_stream *stream,
 			   const struct i915_oa_config *oa_config,
 			   struct i915_active *active)
 {
+	u32 ctx_oactxctrl = stream->perf->ctx_oactxctrl_offset;
 	/* The MMIO offsets for Flex EU registers aren't contiguous */
 	const u32 ctx_flexeu0 = stream->perf->ctx_flexeu0_offset;
 #define ctx_flexeuN(N) (ctx_flexeu0 + 2 * (N) + 1)
@@ -2576,7 +2658,7 @@  lrc_configure_all_contexts(struct i915_perf_stream *stream,
 		},
 		{
 			GEN8_OACTXCONTROL,
-			stream->perf->ctx_oactxctrl_offset + 1,
+			ctx_oactxctrl + 1,
 		},
 		{ EU_PERF_CNTL0, ctx_flexeuN(0) },
 		{ EU_PERF_CNTL1, ctx_flexeuN(1) },
@@ -4545,6 +4627,37 @@  static void oa_init_supported_formats(struct i915_perf *perf)
 	}
 }
 
+static void i915_perf_init_info(struct drm_i915_private *i915)
+{
+	struct i915_perf *perf = &i915->perf;
+
+	switch (GRAPHICS_VER(i915)) {
+	case 8:
+		perf->ctx_oactxctrl_offset = 0x120;
+		perf->ctx_flexeu0_offset = 0x2ce;
+		perf->gen8_valid_ctx_bit = BIT(25);
+		break;
+	case 9:
+		perf->ctx_oactxctrl_offset = 0x128;
+		perf->ctx_flexeu0_offset = 0x3de;
+		perf->gen8_valid_ctx_bit = BIT(16);
+		break;
+	case 11:
+		perf->ctx_oactxctrl_offset = 0x124;
+		perf->ctx_flexeu0_offset = 0x78e;
+		perf->gen8_valid_ctx_bit = BIT(16);
+		break;
+	case 12:
+		/*
+		 * Calculate offset at runtime in oa_pin_context for gen12 and
+		 * cache the value in perf->ctx_oactxctrl_offset.
+		 */
+		break;
+	default:
+		MISSING_CASE(GRAPHICS_VER(i915));
+	}
+}
+
 /**
  * i915_perf_init - initialize i915-perf state on module bind
  * @i915: i915 device instance
@@ -4583,6 +4696,7 @@  void i915_perf_init(struct drm_i915_private *i915)
 		 * execlist mode by default.
 		 */
 		perf->ops.read = gen8_oa_read;
+		i915_perf_init_info(i915);
 
 		if (IS_GRAPHICS_VER(i915, 8, 9)) {
 			perf->ops.is_valid_b_counter_reg =
@@ -4602,18 +4716,6 @@  void i915_perf_init(struct drm_i915_private *i915)
 			perf->ops.enable_metric_set = gen8_enable_metric_set;
 			perf->ops.disable_metric_set = gen8_disable_metric_set;
 			perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
-
-			if (GRAPHICS_VER(i915) == 8) {
-				perf->ctx_oactxctrl_offset = 0x120;
-				perf->ctx_flexeu0_offset = 0x2ce;
-
-				perf->gen8_valid_ctx_bit = BIT(25);
-			} else {
-				perf->ctx_oactxctrl_offset = 0x128;
-				perf->ctx_flexeu0_offset = 0x3de;
-
-				perf->gen8_valid_ctx_bit = BIT(16);
-			}
 		} else if (GRAPHICS_VER(i915) == 11) {
 			perf->ops.is_valid_b_counter_reg =
 				gen7_is_valid_b_counter_addr;
@@ -4627,11 +4729,6 @@  void i915_perf_init(struct drm_i915_private *i915)
 			perf->ops.enable_metric_set = gen8_enable_metric_set;
 			perf->ops.disable_metric_set = gen11_disable_metric_set;
 			perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
-
-			perf->ctx_oactxctrl_offset = 0x124;
-			perf->ctx_flexeu0_offset = 0x78e;
-
-			perf->gen8_valid_ctx_bit = BIT(16);
 		} else if (GRAPHICS_VER(i915) == 12) {
 			perf->ops.is_valid_b_counter_reg =
 				gen12_is_valid_b_counter_addr;
@@ -4645,9 +4742,6 @@  void i915_perf_init(struct drm_i915_private *i915)
 			perf->ops.enable_metric_set = gen12_enable_metric_set;
 			perf->ops.disable_metric_set = gen12_disable_metric_set;
 			perf->ops.oa_hw_tail_read = gen12_oa_hw_tail_read;
-
-			perf->ctx_flexeu0_offset = 0;
-			perf->ctx_oactxctrl_offset = 0x144;
 		}
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_perf_oa_regs.h b/drivers/gpu/drm/i915/i915_perf_oa_regs.h
index f31c9f13a9fc..0ef3562ff4aa 100644
--- a/drivers/gpu/drm/i915/i915_perf_oa_regs.h
+++ b/drivers/gpu/drm/i915/i915_perf_oa_regs.h
@@ -97,7 +97,7 @@ 
 #define  GEN12_OAR_OACONTROL_COUNTER_FORMAT_SHIFT 1
 #define  GEN12_OAR_OACONTROL_COUNTER_ENABLE       (1 << 0)
 
-#define GEN12_OACTXCONTROL _MMIO(0x2360)
+#define GEN12_OACTXCONTROL(base) _MMIO((base) + 0x360)
 #define GEN12_OAR_OASTATUS _MMIO(0x2968)
 
 /* Gen12 OAG unit */