diff mbox series

[2/2] drm/i915: Consolidate TLB invalidation flow

Message ID 20221213123917.4066375-2-tvrtko.ursulin@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series [1/2] drm/i915: fix TLB invalidation for Gen12.50 video and compute engines | expand

Commit Message

Tvrtko Ursulin Dec. 13, 2022, 12:39 p.m. UTC
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

As the logic for selecting the register and corresponsing values grew, the
code become a bit unsightly. Consolidate by storing the required values at
engine init time in the engine itself, and by doing so minimise the amount
of invariant platform and engine checks during each and every TLB
invalidation.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Andrzej Hajda <andrzej.hajda@intel.com>
---
I think this looks nicer, but I don't really want to merge any flavour of
consolidation until we get IGT coverage for the issue upstreamed.
---
 drivers/gpu/drm/i915/gt/intel_engine_cs.c    |  81 ++++++++++++
 drivers/gpu/drm/i915/gt/intel_engine_types.h |  16 +++
 drivers/gpu/drm/i915/gt/intel_gt.c           | 126 ++++---------------
 3 files changed, 120 insertions(+), 103 deletions(-)

Comments

Andrzej Hajda Dec. 13, 2022, 2:52 p.m. UTC | #1
On 13.12.2022 13:39, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> As the logic for selecting the register and corresponsing values grew, the
> code become a bit unsightly. Consolidate by storing the required values at
> engine init time in the engine itself, and by doing so minimise the amount
> of invariant platform and engine checks during each and every TLB
> invalidation.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Andrzej Hajda <andrzej.hajda@intel.com>
> ---
> I think this looks nicer, but I don't really want to merge any flavour of
> consolidation until we get IGT coverage for the issue upstreamed.


Yep, the important is to have 1st patch merged.


> ---
>   drivers/gpu/drm/i915/gt/intel_engine_cs.c    |  81 ++++++++++++
>   drivers/gpu/drm/i915/gt/intel_engine_types.h |  16 +++
>   drivers/gpu/drm/i915/gt/intel_gt.c           | 126 ++++---------------
>   3 files changed, 120 insertions(+), 103 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index 99c4b866addd..97cdd9853e38 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -1143,6 +1143,84 @@ static int init_status_page(struct intel_engine_cs *engine)
>   	return ret;
>   }
>   
> +static bool intel_engine_init_tlb_invalidation(struct intel_engine_cs *engine)
> +{
> +	static const union intel_engine_tlb_inv_reg gen8_regs[] = {
> +		[RENDER_CLASS].reg		= GEN8_RTCR,
> +		[VIDEO_DECODE_CLASS].reg	= GEN8_M1TCR, /* , GEN8_M2TCR */
> +		[VIDEO_ENHANCEMENT_CLASS].reg	= GEN8_VTCR,
> +		[COPY_ENGINE_CLASS].reg		= GEN8_BTCR,
> +	};
> +	static const union intel_engine_tlb_inv_reg gen12_regs[] = {
> +		[RENDER_CLASS].reg		= GEN12_GFX_TLB_INV_CR,
> +		[VIDEO_DECODE_CLASS].reg	= GEN12_VD_TLB_INV_CR,
> +		[VIDEO_ENHANCEMENT_CLASS].reg	= GEN12_VE_TLB_INV_CR,
> +		[COPY_ENGINE_CLASS].reg		= GEN12_BLT_TLB_INV_CR,
> +		[COMPUTE_CLASS].reg		= GEN12_COMPCTX_TLB_INV_CR,
> +	};
> +	static const union intel_engine_tlb_inv_reg xehp_regs[] = {
> +		[RENDER_CLASS].mcr_reg		  = XEHP_GFX_TLB_INV_CR,
> +		[VIDEO_DECODE_CLASS].mcr_reg	  = XEHP_VD_TLB_INV_CR,
> +		[VIDEO_ENHANCEMENT_CLASS].mcr_reg = XEHP_VE_TLB_INV_CR,
> +		[COPY_ENGINE_CLASS].mcr_reg	  = XEHP_BLT_TLB_INV_CR,
> +		[COMPUTE_CLASS].mcr_reg		  = XEHP_COMPCTX_TLB_INV_CR,
> +	};
> +	struct drm_i915_private *i915 = engine->i915;
> +	const union intel_engine_tlb_inv_reg *regs;
> +	union intel_engine_tlb_inv_reg reg;
> +	unsigned int class = engine->class;
> +	unsigned int num = 0;
> +	u32 val;
> +
> +	if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) {
> +		regs = xehp_regs;
> +		num = ARRAY_SIZE(xehp_regs);
> +	} else if (GRAPHICS_VER(i915) == 12) {
> +		regs = gen12_regs;
> +		num = ARRAY_SIZE(gen12_regs);
> +	} else if (GRAPHICS_VER(i915) >= 8 && GRAPHICS_VER(i915) <= 11) {

"GRAPHICS_VER(i915) <= 11" seems redundant.

> +		regs = gen8_regs;
> +		num = ARRAY_SIZE(gen8_regs);
> +	} else if (GRAPHICS_VER(i915) < 8) {

ditto

> +		return false;
> +	}
> +
> +	if (drm_WARN_ONCE(&i915->drm, !num,
> +			  "Platform does not implement TLB invalidation!"))
> +		return false;

It never happens.

> +
> +	if (drm_WARN_ON_ONCE(&i915->drm,
> +			     class >= num ||
> +			     (!regs[class].reg.reg &&
> +			      !regs[class].mcr_reg.reg)))
> +		return false;
> +
> +	reg = regs[class];
> +
> +	if (GRAPHICS_VER(i915) == 8 && class == VIDEO_DECODE_CLASS) {
> +		reg.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */
> +		val = 0;
> +	} else {
> +		val = engine->instance;
> +	}
> +
> +	val = BIT(val);
> +
> +	engine->tlb_inv.mcr = regs == xehp_regs;
> +	engine->tlb_inv.reg = reg;
> +	engine->tlb_inv.done = val;
> +
> +	if (GRAPHICS_VER(i915) >= 12 &&
> +	    (engine->class == VIDEO_DECODE_CLASS ||
> +	     engine->class == VIDEO_ENHANCEMENT_CLASS ||
> +	     engine->class == COMPUTE_CLASS))
> +		engine->tlb_inv.request = _MASKED_BIT_ENABLE(val);
> +	else
> +		engine->tlb_inv.request = val;
> +
> +	return true;
> +}
> +
>   static int engine_setup_common(struct intel_engine_cs *engine)
>   {
>   	int err;
> @@ -1182,6 +1260,9 @@ static int engine_setup_common(struct intel_engine_cs *engine)
>   	intel_engine_init_whitelist(engine);
>   	intel_engine_init_ctx_wa(engine);
>   
> +	if (intel_engine_init_tlb_invalidation(engine))
> +		engine->flags |= I915_ENGINE_HAS_TLB_INVALIDATION;
> +
>   	if (GRAPHICS_VER(engine->i915) >= 12)
>   		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
>   
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> index 4fd54fb8810f..8df4a09a6022 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> @@ -341,6 +341,19 @@ struct intel_engine_guc_stats {
>   	u64 start_gt_clk;
>   };
>   
> +union intel_engine_tlb_inv_reg {
> +	i915_reg_t	reg;
> +	i915_mcr_reg_t	mcr_reg;
> +};
> +
> +struct intel_engine_tlb_inv
> +{
> +	bool mcr;
> +	union intel_engine_tlb_inv_reg reg;
> +	u32 request;
> +	u32 done;
> +};
> +
>   struct intel_engine_cs {
>   	struct drm_i915_private *i915;
>   	struct intel_gt *gt;
> @@ -372,6 +385,8 @@ struct intel_engine_cs {
>   	u32 context_size;
>   	u32 mmio_base;
>   
> +	struct intel_engine_tlb_inv tlb_inv;
> +
>   	/*
>   	 * Some w/a require forcewake to be held (which prevents RC6) while
>   	 * a particular engine is active. If so, we set fw_domain to which
> @@ -556,6 +571,7 @@ struct intel_engine_cs {
>   #define I915_ENGINE_HAS_EU_PRIORITY    BIT(10)
>   #define I915_ENGINE_FIRST_RENDER_COMPUTE BIT(11)
>   #define I915_ENGINE_USES_WA_HOLD_CCS_SWITCHOUT BIT(12)
> +#define I915_ENGINE_HAS_TLB_INVALIDATION BIT(13)
>   	unsigned int flags;
>   
>   	/*
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
> index 7eeee5a7cb33..df7afff16fd6 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt.c
> @@ -983,36 +983,6 @@ void intel_gt_info_print(const struct intel_gt_info *info,
>   	intel_sseu_dump(&info->sseu, p);
>   }
>   
> -struct reg_and_bit {
> -	union {
> -		i915_reg_t reg;
> -		i915_mcr_reg_t mcr_reg;
> -	};
> -	u32 bit;
> -};
> -
> -static struct reg_and_bit
> -get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8,
> -		const i915_reg_t *regs, const unsigned int num)
> -{
> -	const unsigned int class = engine->class;
> -	struct reg_and_bit rb = { };
> -
> -	if (drm_WARN_ON_ONCE(&engine->i915->drm,
> -			     class >= num || !regs[class].reg))
> -		return rb;
> -
> -	rb.reg = regs[class];
> -	if (gen8 && class == VIDEO_DECODE_CLASS)
> -		rb.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */
> -	else
> -		rb.bit = engine->instance;
> -
> -	rb.bit = BIT(rb.bit);
> -
> -	return rb;
> -}
> -
>   /*
>    * HW architecture suggest typical invalidation time at 40us,
>    * with pessimistic cases up to 100us and a recommendation to
> @@ -1026,14 +996,20 @@ get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8,
>    * but are now considered MCR registers.  Since they exist within a GAM range,
>    * the primary instance of the register rolls up the status from each unit.
>    */
> -static int wait_for_invalidate(struct intel_gt *gt, struct reg_and_bit rb)
> +static int wait_for_invalidate(struct intel_engine_cs *engine)
>   {
> -	if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 50))
> -		return intel_gt_mcr_wait_for_reg(gt, rb.mcr_reg, rb.bit, 0,
> +	if (engine->tlb_inv.mcr)
> +		return intel_gt_mcr_wait_for_reg(engine->gt,
> +						 engine->tlb_inv.reg.mcr_reg,
> +						 engine->tlb_inv.done,
> +						 0,
>   						 TLB_INVAL_TIMEOUT_US,
>   						 TLB_INVAL_TIMEOUT_MS);
>   	else
> -		return __intel_wait_for_register_fw(gt->uncore, rb.reg, rb.bit, 0,
> +		return __intel_wait_for_register_fw(engine->gt->uncore,
> +						    engine->tlb_inv.reg.reg,
> +						    engine->tlb_inv.done,
> +						    0,
>   						    TLB_INVAL_TIMEOUT_US,
>   						    TLB_INVAL_TIMEOUT_MS,
>   						    NULL);
> @@ -1041,50 +1017,14 @@ static int wait_for_invalidate(struct intel_gt *gt, struct reg_and_bit rb)
>   
>   static void mmio_invalidate_full(struct intel_gt *gt)
>   {
> -	static const i915_reg_t gen8_regs[] = {
> -		[RENDER_CLASS]			= GEN8_RTCR,
> -		[VIDEO_DECODE_CLASS]		= GEN8_M1TCR, /* , GEN8_M2TCR */
> -		[VIDEO_ENHANCEMENT_CLASS]	= GEN8_VTCR,
> -		[COPY_ENGINE_CLASS]		= GEN8_BTCR,
> -	};
> -	static const i915_reg_t gen12_regs[] = {
> -		[RENDER_CLASS]			= GEN12_GFX_TLB_INV_CR,
> -		[VIDEO_DECODE_CLASS]		= GEN12_VD_TLB_INV_CR,
> -		[VIDEO_ENHANCEMENT_CLASS]	= GEN12_VE_TLB_INV_CR,
> -		[COPY_ENGINE_CLASS]		= GEN12_BLT_TLB_INV_CR,
> -		[COMPUTE_CLASS]			= GEN12_COMPCTX_TLB_INV_CR,
> -	};
> -	static const i915_mcr_reg_t xehp_regs[] = {
> -		[RENDER_CLASS]			= XEHP_GFX_TLB_INV_CR,
> -		[VIDEO_DECODE_CLASS]		= XEHP_VD_TLB_INV_CR,
> -		[VIDEO_ENHANCEMENT_CLASS]	= XEHP_VE_TLB_INV_CR,
> -		[COPY_ENGINE_CLASS]		= XEHP_BLT_TLB_INV_CR,
> -		[COMPUTE_CLASS]			= XEHP_COMPCTX_TLB_INV_CR,
> -	};
>   	struct drm_i915_private *i915 = gt->i915;
>   	struct intel_uncore *uncore = gt->uncore;
>   	struct intel_engine_cs *engine;
>   	intel_engine_mask_t awake, tmp;
>   	enum intel_engine_id id;
> -	const i915_reg_t *regs;
> -	unsigned int num = 0;
>   	unsigned long flags;
>   
> -	if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) {
> -		regs = NULL;
> -		num = ARRAY_SIZE(xehp_regs);
> -	} else if (GRAPHICS_VER(i915) == 12) {
> -		regs = gen12_regs;
> -		num = ARRAY_SIZE(gen12_regs);
> -	} else if (GRAPHICS_VER(i915) >= 8 && GRAPHICS_VER(i915) <= 11) {
> -		regs = gen8_regs;
> -		num = ARRAY_SIZE(gen8_regs);
> -	} else if (GRAPHICS_VER(i915) < 8) {
> -		return;
> -	}
> -
> -	if (drm_WARN_ONCE(&i915->drm, !num,
> -			  "Platform does not implement TLB invalidation!"))
> +	if (GRAPHICS_VER(i915) < 8)
>   		return;
>   
>   	intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL);
> @@ -1094,33 +1034,22 @@ static void mmio_invalidate_full(struct intel_gt *gt)
>   
>   	awake = 0;
>   	for_each_engine(engine, gt, id) {
> -		struct reg_and_bit rb;
> -
>   		if (!intel_engine_pm_is_awake(engine))
>   			continue;
>   
> -		if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) {
> -			u32 val = BIT(engine->instance);
> +		if (drm_WARN_ON_ONCE(&i915->drm,
> +				     !(engine->flags & I915_ENGINE_HAS_TLB_INVALIDATION)))
> +			continue;

Hmm, can this flag change dynamically? If not why not put it in 
initialization phase.

>   
> -			if (engine->class == VIDEO_DECODE_CLASS ||
> -			    engine->class == VIDEO_ENHANCEMENT_CLASS ||
> -			    engine->class == COMPUTE_CLASS)
> -				val = _MASKED_BIT_ENABLE(val);
> +		if (engine->tlb_inv.mcr)
>   			intel_gt_mcr_multicast_write_fw(gt,
> -							xehp_regs[engine->class],
> -							val);
> -		} else {
> -			rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num);
> -			if (!i915_mmio_reg_offset(rb.reg))
> -				continue;
> -
> -			if (GRAPHICS_VER(i915) == 12 && (engine->class == VIDEO_DECODE_CLASS ||
> -			    engine->class == VIDEO_ENHANCEMENT_CLASS ||
> -			    engine->class == COMPUTE_CLASS))
> -				rb.bit = _MASKED_BIT_ENABLE(rb.bit);
> -
> -			intel_uncore_write_fw(uncore, rb.reg, rb.bit);
> -		}
> +							engine->tlb_inv.reg.mcr_reg,
> +							engine->tlb_inv.request);
> +		else
> +			intel_uncore_write_fw(uncore,
> +					      engine->tlb_inv.reg.reg,
> +					      engine->tlb_inv.request);
> +
>   		awake |= engine->mask;
>   	}
>   
> @@ -1139,16 +1068,7 @@ static void mmio_invalidate_full(struct intel_gt *gt)
>   	intel_gt_mcr_unlock(gt, flags);
>   
>   	for_each_engine_masked(engine, gt, awake, tmp) {
> -		struct reg_and_bit rb;
> -
> -		if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) {
> -			rb.mcr_reg = xehp_regs[engine->class];
> -			rb.bit = BIT(engine->instance);
> -		} else {
> -			rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num);
> -		}
> -
> -		if (wait_for_invalidate(gt, rb))
> +		if (wait_for_invalidate(engine))
>   			drm_err_ratelimited(&gt->i915->drm,
>   					    "%s TLB invalidation did not complete in %ums!\n",
>   					    engine->name, TLB_INVAL_TIMEOUT_MS);

With minor comments addressed:

Reviewed-by: Andrzej Hajda <andrzej.hajda@intel.com>

Regards
Andrzej
Tvrtko Ursulin Dec. 13, 2022, 3:22 p.m. UTC | #2
On 13/12/2022 14:52, Andrzej Hajda wrote:
> On 13.12.2022 13:39, Tvrtko Ursulin wrote:
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> As the logic for selecting the register and corresponsing values grew, 
>> the
>> code become a bit unsightly. Consolidate by storing the required 
>> values at
>> engine init time in the engine itself, and by doing so minimise the 
>> amount
>> of invariant platform and engine checks during each and every TLB
>> invalidation.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> Cc: Andrzej Hajda <andrzej.hajda@intel.com>
>> ---
>> I think this looks nicer, but I don't really want to merge any flavour of
>> consolidation until we get IGT coverage for the issue upstreamed.
> 
> 
> Yep, the important is to have 1st patch merged.

Agreed.

1)
Can you send it standalone so it gets CI results and can get merged?

2)
Could you check internally where did gem_exec_tlb get stuck?

>> ---
>>   drivers/gpu/drm/i915/gt/intel_engine_cs.c    |  81 ++++++++++++
>>   drivers/gpu/drm/i915/gt/intel_engine_types.h |  16 +++
>>   drivers/gpu/drm/i915/gt/intel_gt.c           | 126 ++++---------------
>>   3 files changed, 120 insertions(+), 103 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
>> b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>> index 99c4b866addd..97cdd9853e38 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>> @@ -1143,6 +1143,84 @@ static int init_status_page(struct 
>> intel_engine_cs *engine)
>>       return ret;
>>   }
>> +static bool intel_engine_init_tlb_invalidation(struct intel_engine_cs 
>> *engine)
>> +{
>> +    static const union intel_engine_tlb_inv_reg gen8_regs[] = {
>> +        [RENDER_CLASS].reg        = GEN8_RTCR,
>> +        [VIDEO_DECODE_CLASS].reg    = GEN8_M1TCR, /* , GEN8_M2TCR */
>> +        [VIDEO_ENHANCEMENT_CLASS].reg    = GEN8_VTCR,
>> +        [COPY_ENGINE_CLASS].reg        = GEN8_BTCR,
>> +    };
>> +    static const union intel_engine_tlb_inv_reg gen12_regs[] = {
>> +        [RENDER_CLASS].reg        = GEN12_GFX_TLB_INV_CR,
>> +        [VIDEO_DECODE_CLASS].reg    = GEN12_VD_TLB_INV_CR,
>> +        [VIDEO_ENHANCEMENT_CLASS].reg    = GEN12_VE_TLB_INV_CR,
>> +        [COPY_ENGINE_CLASS].reg        = GEN12_BLT_TLB_INV_CR,
>> +        [COMPUTE_CLASS].reg        = GEN12_COMPCTX_TLB_INV_CR,
>> +    };
>> +    static const union intel_engine_tlb_inv_reg xehp_regs[] = {
>> +        [RENDER_CLASS].mcr_reg          = XEHP_GFX_TLB_INV_CR,
>> +        [VIDEO_DECODE_CLASS].mcr_reg      = XEHP_VD_TLB_INV_CR,
>> +        [VIDEO_ENHANCEMENT_CLASS].mcr_reg = XEHP_VE_TLB_INV_CR,
>> +        [COPY_ENGINE_CLASS].mcr_reg      = XEHP_BLT_TLB_INV_CR,
>> +        [COMPUTE_CLASS].mcr_reg          = XEHP_COMPCTX_TLB_INV_CR,
>> +    };
>> +    struct drm_i915_private *i915 = engine->i915;
>> +    const union intel_engine_tlb_inv_reg *regs;
>> +    union intel_engine_tlb_inv_reg reg;
>> +    unsigned int class = engine->class;
>> +    unsigned int num = 0;
>> +    u32 val;
>> +
>> +    if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) {
>> +        regs = xehp_regs;
>> +        num = ARRAY_SIZE(xehp_regs);
>> +    } else if (GRAPHICS_VER(i915) == 12) {
>> +        regs = gen12_regs;
>> +        num = ARRAY_SIZE(gen12_regs);
>> +    } else if (GRAPHICS_VER(i915) >= 8 && GRAPHICS_VER(i915) <= 11) {
> 
> "GRAPHICS_VER(i915) <= 11" seems redundant.

Code movement... Fixes: "patch which added >= 12,50" to clean just that 
part up. Before it was deliberately made to hit the below warn on so 
that any new platform added requires a human checking if the registers 
are still same and compatible. (Instead of assuming future platforms 
remain compatible - which is IMO too big risk to take.)

> 
>> +        regs = gen8_regs;
>> +        num = ARRAY_SIZE(gen8_regs);
>> +    } else if (GRAPHICS_VER(i915) < 8) {
> 
> ditto

How? We want to do nothing before gen 8.

>> +        return false;
>> +    }
>> +
>> +    if (drm_WARN_ONCE(&i915->drm, !num,
>> +              "Platform does not implement TLB invalidation!"))
>> +        return false;
> 
> It never happens.

As above a) it's just code movement and b) it would be safer if it could 
happen.

>> +
>> +    if (drm_WARN_ON_ONCE(&i915->drm,
>> +                 class >= num ||
>> +                 (!regs[class].reg.reg &&
>> +                  !regs[class].mcr_reg.reg)))
>> +        return false;
>> +
>> +    reg = regs[class];
>> +
>> +    if (GRAPHICS_VER(i915) == 8 && class == VIDEO_DECODE_CLASS) {
>> +        reg.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */
>> +        val = 0;
>> +    } else {
>> +        val = engine->instance;
>> +    }
>> +
>> +    val = BIT(val);
>> +
>> +    engine->tlb_inv.mcr = regs == xehp_regs;
>> +    engine->tlb_inv.reg = reg;
>> +    engine->tlb_inv.done = val;
>> +
>> +    if (GRAPHICS_VER(i915) >= 12 &&
>> +        (engine->class == VIDEO_DECODE_CLASS ||
>> +         engine->class == VIDEO_ENHANCEMENT_CLASS ||
>> +         engine->class == COMPUTE_CLASS))
>> +        engine->tlb_inv.request = _MASKED_BIT_ENABLE(val);
>> +    else
>> +        engine->tlb_inv.request = val;
>> +
>> +    return true;
>> +}
>> +
>>   static int engine_setup_common(struct intel_engine_cs *engine)
>>   {
>>       int err;
>> @@ -1182,6 +1260,9 @@ static int engine_setup_common(struct 
>> intel_engine_cs *engine)
>>       intel_engine_init_whitelist(engine);
>>       intel_engine_init_ctx_wa(engine);
>> +    if (intel_engine_init_tlb_invalidation(engine))
>> +        engine->flags |= I915_ENGINE_HAS_TLB_INVALIDATION;
>> +
>>       if (GRAPHICS_VER(engine->i915) >= 12)
>>           engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h 
>> b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>> index 4fd54fb8810f..8df4a09a6022 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>> @@ -341,6 +341,19 @@ struct intel_engine_guc_stats {
>>       u64 start_gt_clk;
>>   };
>> +union intel_engine_tlb_inv_reg {
>> +    i915_reg_t    reg;
>> +    i915_mcr_reg_t    mcr_reg;
>> +};
>> +
>> +struct intel_engine_tlb_inv
>> +{
>> +    bool mcr;
>> +    union intel_engine_tlb_inv_reg reg;
>> +    u32 request;
>> +    u32 done;
>> +};
>> +
>>   struct intel_engine_cs {
>>       struct drm_i915_private *i915;
>>       struct intel_gt *gt;
>> @@ -372,6 +385,8 @@ struct intel_engine_cs {
>>       u32 context_size;
>>       u32 mmio_base;
>> +    struct intel_engine_tlb_inv tlb_inv;
>> +
>>       /*
>>        * Some w/a require forcewake to be held (which prevents RC6) while
>>        * a particular engine is active. If so, we set fw_domain to which
>> @@ -556,6 +571,7 @@ struct intel_engine_cs {
>>   #define I915_ENGINE_HAS_EU_PRIORITY    BIT(10)
>>   #define I915_ENGINE_FIRST_RENDER_COMPUTE BIT(11)
>>   #define I915_ENGINE_USES_WA_HOLD_CCS_SWITCHOUT BIT(12)
>> +#define I915_ENGINE_HAS_TLB_INVALIDATION BIT(13)
>>       unsigned int flags;
>>       /*
>> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c 
>> b/drivers/gpu/drm/i915/gt/intel_gt.c
>> index 7eeee5a7cb33..df7afff16fd6 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_gt.c
>> +++ b/drivers/gpu/drm/i915/gt/intel_gt.c
>> @@ -983,36 +983,6 @@ void intel_gt_info_print(const struct 
>> intel_gt_info *info,
>>       intel_sseu_dump(&info->sseu, p);
>>   }
>> -struct reg_and_bit {
>> -    union {
>> -        i915_reg_t reg;
>> -        i915_mcr_reg_t mcr_reg;
>> -    };
>> -    u32 bit;
>> -};
>> -
>> -static struct reg_and_bit
>> -get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8,
>> -        const i915_reg_t *regs, const unsigned int num)
>> -{
>> -    const unsigned int class = engine->class;
>> -    struct reg_and_bit rb = { };
>> -
>> -    if (drm_WARN_ON_ONCE(&engine->i915->drm,
>> -                 class >= num || !regs[class].reg))
>> -        return rb;
>> -
>> -    rb.reg = regs[class];
>> -    if (gen8 && class == VIDEO_DECODE_CLASS)
>> -        rb.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */
>> -    else
>> -        rb.bit = engine->instance;
>> -
>> -    rb.bit = BIT(rb.bit);
>> -
>> -    return rb;
>> -}
>> -
>>   /*
>>    * HW architecture suggest typical invalidation time at 40us,
>>    * with pessimistic cases up to 100us and a recommendation to
>> @@ -1026,14 +996,20 @@ get_reg_and_bit(const struct intel_engine_cs 
>> *engine, const bool gen8,
>>    * but are now considered MCR registers.  Since they exist within a 
>> GAM range,
>>    * the primary instance of the register rolls up the status from 
>> each unit.
>>    */
>> -static int wait_for_invalidate(struct intel_gt *gt, struct 
>> reg_and_bit rb)
>> +static int wait_for_invalidate(struct intel_engine_cs *engine)
>>   {
>> -    if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 50))
>> -        return intel_gt_mcr_wait_for_reg(gt, rb.mcr_reg, rb.bit, 0,
>> +    if (engine->tlb_inv.mcr)
>> +        return intel_gt_mcr_wait_for_reg(engine->gt,
>> +                         engine->tlb_inv.reg.mcr_reg,
>> +                         engine->tlb_inv.done,
>> +                         0,
>>                            TLB_INVAL_TIMEOUT_US,
>>                            TLB_INVAL_TIMEOUT_MS);
>>       else
>> -        return __intel_wait_for_register_fw(gt->uncore, rb.reg, 
>> rb.bit, 0,
>> +        return __intel_wait_for_register_fw(engine->gt->uncore,
>> +                            engine->tlb_inv.reg.reg,
>> +                            engine->tlb_inv.done,
>> +                            0,
>>                               TLB_INVAL_TIMEOUT_US,
>>                               TLB_INVAL_TIMEOUT_MS,
>>                               NULL);
>> @@ -1041,50 +1017,14 @@ static int wait_for_invalidate(struct intel_gt 
>> *gt, struct reg_and_bit rb)
>>   static void mmio_invalidate_full(struct intel_gt *gt)
>>   {
>> -    static const i915_reg_t gen8_regs[] = {
>> -        [RENDER_CLASS]            = GEN8_RTCR,
>> -        [VIDEO_DECODE_CLASS]        = GEN8_M1TCR, /* , GEN8_M2TCR */
>> -        [VIDEO_ENHANCEMENT_CLASS]    = GEN8_VTCR,
>> -        [COPY_ENGINE_CLASS]        = GEN8_BTCR,
>> -    };
>> -    static const i915_reg_t gen12_regs[] = {
>> -        [RENDER_CLASS]            = GEN12_GFX_TLB_INV_CR,
>> -        [VIDEO_DECODE_CLASS]        = GEN12_VD_TLB_INV_CR,
>> -        [VIDEO_ENHANCEMENT_CLASS]    = GEN12_VE_TLB_INV_CR,
>> -        [COPY_ENGINE_CLASS]        = GEN12_BLT_TLB_INV_CR,
>> -        [COMPUTE_CLASS]            = GEN12_COMPCTX_TLB_INV_CR,
>> -    };
>> -    static const i915_mcr_reg_t xehp_regs[] = {
>> -        [RENDER_CLASS]            = XEHP_GFX_TLB_INV_CR,
>> -        [VIDEO_DECODE_CLASS]        = XEHP_VD_TLB_INV_CR,
>> -        [VIDEO_ENHANCEMENT_CLASS]    = XEHP_VE_TLB_INV_CR,
>> -        [COPY_ENGINE_CLASS]        = XEHP_BLT_TLB_INV_CR,
>> -        [COMPUTE_CLASS]            = XEHP_COMPCTX_TLB_INV_CR,
>> -    };
>>       struct drm_i915_private *i915 = gt->i915;
>>       struct intel_uncore *uncore = gt->uncore;
>>       struct intel_engine_cs *engine;
>>       intel_engine_mask_t awake, tmp;
>>       enum intel_engine_id id;
>> -    const i915_reg_t *regs;
>> -    unsigned int num = 0;
>>       unsigned long flags;
>> -    if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) {
>> -        regs = NULL;
>> -        num = ARRAY_SIZE(xehp_regs);
>> -    } else if (GRAPHICS_VER(i915) == 12) {
>> -        regs = gen12_regs;
>> -        num = ARRAY_SIZE(gen12_regs);
>> -    } else if (GRAPHICS_VER(i915) >= 8 && GRAPHICS_VER(i915) <= 11) {
>> -        regs = gen8_regs;
>> -        num = ARRAY_SIZE(gen8_regs);
>> -    } else if (GRAPHICS_VER(i915) < 8) {
>> -        return;
>> -    }
>> -
>> -    if (drm_WARN_ONCE(&i915->drm, !num,
>> -              "Platform does not implement TLB invalidation!"))
>> +    if (GRAPHICS_VER(i915) < 8)
>>           return;
>>       intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL);
>> @@ -1094,33 +1034,22 @@ static void mmio_invalidate_full(struct 
>> intel_gt *gt)
>>       awake = 0;
>>       for_each_engine(engine, gt, id) {
>> -        struct reg_and_bit rb;
>> -
>>           if (!intel_engine_pm_is_awake(engine))
>>               continue;
>> -        if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) {
>> -            u32 val = BIT(engine->instance);
>> +        if (drm_WARN_ON_ONCE(&i915->drm,
>> +                     !(engine->flags & 
>> I915_ENGINE_HAS_TLB_INVALIDATION)))
>> +            continue;
> 
> Hmm, can this flag change dynamically? If not why not put it in 
> initialization phase.

Could do. In that case I couldn't have any asserts that engine->tlb_inv 
is valid but perhaps that is okay. Could just fail engine probe from 
there if register table is incomplete. Sounds better indeed, I'll change it.

> 
>> -            if (engine->class == VIDEO_DECODE_CLASS ||
>> -                engine->class == VIDEO_ENHANCEMENT_CLASS ||
>> -                engine->class == COMPUTE_CLASS)
>> -                val = _MASKED_BIT_ENABLE(val);
>> +        if (engine->tlb_inv.mcr)
>>               intel_gt_mcr_multicast_write_fw(gt,
>> -                            xehp_regs[engine->class],
>> -                            val);
>> -        } else {
>> -            rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num);
>> -            if (!i915_mmio_reg_offset(rb.reg))
>> -                continue;
>> -
>> -            if (GRAPHICS_VER(i915) == 12 && (engine->class == 
>> VIDEO_DECODE_CLASS ||
>> -                engine->class == VIDEO_ENHANCEMENT_CLASS ||
>> -                engine->class == COMPUTE_CLASS))
>> -                rb.bit = _MASKED_BIT_ENABLE(rb.bit);
>> -
>> -            intel_uncore_write_fw(uncore, rb.reg, rb.bit);
>> -        }
>> +                            engine->tlb_inv.reg.mcr_reg,
>> +                            engine->tlb_inv.request);
>> +        else
>> +            intel_uncore_write_fw(uncore,
>> +                          engine->tlb_inv.reg.reg,
>> +                          engine->tlb_inv.request);
>> +
>>           awake |= engine->mask;
>>       }
>> @@ -1139,16 +1068,7 @@ static void mmio_invalidate_full(struct 
>> intel_gt *gt)
>>       intel_gt_mcr_unlock(gt, flags);
>>       for_each_engine_masked(engine, gt, awake, tmp) {
>> -        struct reg_and_bit rb;
>> -
>> -        if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) {
>> -            rb.mcr_reg = xehp_regs[engine->class];
>> -            rb.bit = BIT(engine->instance);
>> -        } else {
>> -            rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num);
>> -        }
>> -
>> -        if (wait_for_invalidate(gt, rb))
>> +        if (wait_for_invalidate(engine))
>>               drm_err_ratelimited(&gt->i915->drm,
>>                           "%s TLB invalidation did not complete in 
>> %ums!\n",
>>                           engine->name, TLB_INVAL_TIMEOUT_MS);
> 
> With minor comments addressed:
> 
> Reviewed-by: Andrzej Hajda <andrzej.hajda@intel.com>

I'll send v2 at some point, please stand by.

Regards,

Tvrtko
Andrzej Hajda Dec. 14, 2022, 9:02 a.m. UTC | #3
On 13.12.2022 16:22, Tvrtko Ursulin wrote:
> 
> On 13/12/2022 14:52, Andrzej Hajda wrote:
>> On 13.12.2022 13:39, Tvrtko Ursulin wrote:
>>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>
>>> As the logic for selecting the register and corresponsing values 
>>> grew, the
>>> code become a bit unsightly. Consolidate by storing the required 
>>> values at
>>> engine init time in the engine itself, and by doing so minimise the 
>>> amount
>>> of invariant platform and engine checks during each and every TLB
>>> invalidation.
>>>
>>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>> Cc: Andrzej Hajda <andrzej.hajda@intel.com>
>>> ---
>>> I think this looks nicer, but I don't really want to merge any 
>>> flavour of
>>> consolidation until we get IGT coverage for the issue upstreamed.
>>
>>
>> Yep, the important is to have 1st patch merged.
> 
> Agreed.
> 
> 1)
> Can you send it standalone so it gets CI results and can get merged?

Done.

> 
> 2)
> Could you check internally where did gem_exec_tlb get stuck? >
>>> ---
>>>   drivers/gpu/drm/i915/gt/intel_engine_cs.c    |  81 ++++++++++++
>>>   drivers/gpu/drm/i915/gt/intel_engine_types.h |  16 +++
>>>   drivers/gpu/drm/i915/gt/intel_gt.c           | 126 ++++---------------
>>>   3 files changed, 120 insertions(+), 103 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
>>> b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>>> index 99c4b866addd..97cdd9853e38 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>>> @@ -1143,6 +1143,84 @@ static int init_status_page(struct 
>>> intel_engine_cs *engine)
>>>       return ret;
>>>   }
>>> +static bool intel_engine_init_tlb_invalidation(struct 
>>> intel_engine_cs *engine)
>>> +{
>>> +    static const union intel_engine_tlb_inv_reg gen8_regs[] = {
>>> +        [RENDER_CLASS].reg        = GEN8_RTCR,
>>> +        [VIDEO_DECODE_CLASS].reg    = GEN8_M1TCR, /* , GEN8_M2TCR */
>>> +        [VIDEO_ENHANCEMENT_CLASS].reg    = GEN8_VTCR,
>>> +        [COPY_ENGINE_CLASS].reg        = GEN8_BTCR,
>>> +    };
>>> +    static const union intel_engine_tlb_inv_reg gen12_regs[] = {
>>> +        [RENDER_CLASS].reg        = GEN12_GFX_TLB_INV_CR,
>>> +        [VIDEO_DECODE_CLASS].reg    = GEN12_VD_TLB_INV_CR,
>>> +        [VIDEO_ENHANCEMENT_CLASS].reg    = GEN12_VE_TLB_INV_CR,
>>> +        [COPY_ENGINE_CLASS].reg        = GEN12_BLT_TLB_INV_CR,
>>> +        [COMPUTE_CLASS].reg        = GEN12_COMPCTX_TLB_INV_CR,
>>> +    };
>>> +    static const union intel_engine_tlb_inv_reg xehp_regs[] = {
>>> +        [RENDER_CLASS].mcr_reg          = XEHP_GFX_TLB_INV_CR,
>>> +        [VIDEO_DECODE_CLASS].mcr_reg      = XEHP_VD_TLB_INV_CR,
>>> +        [VIDEO_ENHANCEMENT_CLASS].mcr_reg = XEHP_VE_TLB_INV_CR,
>>> +        [COPY_ENGINE_CLASS].mcr_reg      = XEHP_BLT_TLB_INV_CR,
>>> +        [COMPUTE_CLASS].mcr_reg          = XEHP_COMPCTX_TLB_INV_CR,
>>> +    };
>>> +    struct drm_i915_private *i915 = engine->i915;
>>> +    const union intel_engine_tlb_inv_reg *regs;
>>> +    union intel_engine_tlb_inv_reg reg;
>>> +    unsigned int class = engine->class;
>>> +    unsigned int num = 0;
>>> +    u32 val;
>>> +
>>> +    if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) {
>>> +        regs = xehp_regs;
>>> +        num = ARRAY_SIZE(xehp_regs);
>>> +    } else if (GRAPHICS_VER(i915) == 12) {
>>> +        regs = gen12_regs;
>>> +        num = ARRAY_SIZE(gen12_regs);
>>> +    } else if (GRAPHICS_VER(i915) >= 8 && GRAPHICS_VER(i915) <= 11) {
>>
>> "GRAPHICS_VER(i915) <= 11" seems redundant.
> 
> Code movement... Fixes: "patch which added >= 12,50" to clean just that 
> part up. Before it was deliberately made to hit the below warn on so 
> that any new platform added requires a human checking if the registers 
> are still same and compatible. (Instead of assuming future platforms 
> remain compatible - which is IMO too big risk to take.)
> 
>>
>>> +        regs = gen8_regs;
>>> +        num = ARRAY_SIZE(gen8_regs);
>>> +    } else if (GRAPHICS_VER(i915) < 8) {
>>
>> ditto
> 
> How? We want to do nothing before gen 8.

} else {
	return false;
}

?

As I understand there are three 'rules' we want to follow:
1. start from the newest hw, end at the oldest.
2. warn for not yet supported platforms, lets assume it is ver >= 13.
3. cover all cases

Why not use >= consistently, till exhaustion:

if (ver >= 13) {
	warn
	return false;
} else if (ver >= 12.50) {
	...
} else if (ver >= 12) {
	...
} else if (ver >= 8) {
	...
} else {
	return false;
}

It looks cleaner, ordered, no redundant checks, no doubts, hopefuly 
future-proof.

Regards
Andrzej

> 
>>> +        return false;
>>> +    }
>>> +
>>> +    if (drm_WARN_ONCE(&i915->drm, !num,
>>> +              "Platform does not implement TLB invalidation!"))
>>> +        return false;
>>
>> It never happens.
> 
> As above a) it's just code movement and b) it would be safer if it could 
> happen.
> 
>>> +
>>> +    if (drm_WARN_ON_ONCE(&i915->drm,
>>> +                 class >= num ||
>>> +                 (!regs[class].reg.reg &&
>>> +                  !regs[class].mcr_reg.reg)))
>>> +        return false;
>>> +
>>> +    reg = regs[class];
>>> +
>>> +    if (GRAPHICS_VER(i915) == 8 && class == VIDEO_DECODE_CLASS) {
>>> +        reg.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */
>>> +        val = 0;
>>> +    } else {
>>> +        val = engine->instance;
>>> +    }
>>> +
>>> +    val = BIT(val);
>>> +
>>> +    engine->tlb_inv.mcr = regs == xehp_regs;
>>> +    engine->tlb_inv.reg = reg;
>>> +    engine->tlb_inv.done = val;
>>> +
>>> +    if (GRAPHICS_VER(i915) >= 12 &&
>>> +        (engine->class == VIDEO_DECODE_CLASS ||
>>> +         engine->class == VIDEO_ENHANCEMENT_CLASS ||
>>> +         engine->class == COMPUTE_CLASS))
>>> +        engine->tlb_inv.request = _MASKED_BIT_ENABLE(val);
>>> +    else
>>> +        engine->tlb_inv.request = val;
>>> +
>>> +    return true;
>>> +}
>>> +
>>>   static int engine_setup_common(struct intel_engine_cs *engine)
>>>   {
>>>       int err;
>>> @@ -1182,6 +1260,9 @@ static int engine_setup_common(struct 
>>> intel_engine_cs *engine)
>>>       intel_engine_init_whitelist(engine);
>>>       intel_engine_init_ctx_wa(engine);
>>> +    if (intel_engine_init_tlb_invalidation(engine))
>>> +        engine->flags |= I915_ENGINE_HAS_TLB_INVALIDATION;
>>> +
>>>       if (GRAPHICS_VER(engine->i915) >= 12)
>>>           engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h 
>>> b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>> index 4fd54fb8810f..8df4a09a6022 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>> @@ -341,6 +341,19 @@ struct intel_engine_guc_stats {
>>>       u64 start_gt_clk;
>>>   };
>>> +union intel_engine_tlb_inv_reg {
>>> +    i915_reg_t    reg;
>>> +    i915_mcr_reg_t    mcr_reg;
>>> +};
>>> +
>>> +struct intel_engine_tlb_inv
>>> +{
>>> +    bool mcr;
>>> +    union intel_engine_tlb_inv_reg reg;
>>> +    u32 request;
>>> +    u32 done;
>>> +};
>>> +
>>>   struct intel_engine_cs {
>>>       struct drm_i915_private *i915;
>>>       struct intel_gt *gt;
>>> @@ -372,6 +385,8 @@ struct intel_engine_cs {
>>>       u32 context_size;
>>>       u32 mmio_base;
>>> +    struct intel_engine_tlb_inv tlb_inv;
>>> +
>>>       /*
>>>        * Some w/a require forcewake to be held (which prevents RC6) 
>>> while
>>>        * a particular engine is active. If so, we set fw_domain to which
>>> @@ -556,6 +571,7 @@ struct intel_engine_cs {
>>>   #define I915_ENGINE_HAS_EU_PRIORITY    BIT(10)
>>>   #define I915_ENGINE_FIRST_RENDER_COMPUTE BIT(11)
>>>   #define I915_ENGINE_USES_WA_HOLD_CCS_SWITCHOUT BIT(12)
>>> +#define I915_ENGINE_HAS_TLB_INVALIDATION BIT(13)
>>>       unsigned int flags;
>>>       /*
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c 
>>> b/drivers/gpu/drm/i915/gt/intel_gt.c
>>> index 7eeee5a7cb33..df7afff16fd6 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_gt.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_gt.c
>>> @@ -983,36 +983,6 @@ void intel_gt_info_print(const struct 
>>> intel_gt_info *info,
>>>       intel_sseu_dump(&info->sseu, p);
>>>   }
>>> -struct reg_and_bit {
>>> -    union {
>>> -        i915_reg_t reg;
>>> -        i915_mcr_reg_t mcr_reg;
>>> -    };
>>> -    u32 bit;
>>> -};
>>> -
>>> -static struct reg_and_bit
>>> -get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8,
>>> -        const i915_reg_t *regs, const unsigned int num)
>>> -{
>>> -    const unsigned int class = engine->class;
>>> -    struct reg_and_bit rb = { };
>>> -
>>> -    if (drm_WARN_ON_ONCE(&engine->i915->drm,
>>> -                 class >= num || !regs[class].reg))
>>> -        return rb;
>>> -
>>> -    rb.reg = regs[class];
>>> -    if (gen8 && class == VIDEO_DECODE_CLASS)
>>> -        rb.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */
>>> -    else
>>> -        rb.bit = engine->instance;
>>> -
>>> -    rb.bit = BIT(rb.bit);
>>> -
>>> -    return rb;
>>> -}
>>> -
>>>   /*
>>>    * HW architecture suggest typical invalidation time at 40us,
>>>    * with pessimistic cases up to 100us and a recommendation to
>>> @@ -1026,14 +996,20 @@ get_reg_and_bit(const struct intel_engine_cs 
>>> *engine, const bool gen8,
>>>    * but are now considered MCR registers.  Since they exist within a 
>>> GAM range,
>>>    * the primary instance of the register rolls up the status from 
>>> each unit.
>>>    */
>>> -static int wait_for_invalidate(struct intel_gt *gt, struct 
>>> reg_and_bit rb)
>>> +static int wait_for_invalidate(struct intel_engine_cs *engine)
>>>   {
>>> -    if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 50))
>>> -        return intel_gt_mcr_wait_for_reg(gt, rb.mcr_reg, rb.bit, 0,
>>> +    if (engine->tlb_inv.mcr)
>>> +        return intel_gt_mcr_wait_for_reg(engine->gt,
>>> +                         engine->tlb_inv.reg.mcr_reg,
>>> +                         engine->tlb_inv.done,
>>> +                         0,
>>>                            TLB_INVAL_TIMEOUT_US,
>>>                            TLB_INVAL_TIMEOUT_MS);
>>>       else
>>> -        return __intel_wait_for_register_fw(gt->uncore, rb.reg, 
>>> rb.bit, 0,
>>> +        return __intel_wait_for_register_fw(engine->gt->uncore,
>>> +                            engine->tlb_inv.reg.reg,
>>> +                            engine->tlb_inv.done,
>>> +                            0,
>>>                               TLB_INVAL_TIMEOUT_US,
>>>                               TLB_INVAL_TIMEOUT_MS,
>>>                               NULL);
>>> @@ -1041,50 +1017,14 @@ static int wait_for_invalidate(struct 
>>> intel_gt *gt, struct reg_and_bit rb)
>>>   static void mmio_invalidate_full(struct intel_gt *gt)
>>>   {
>>> -    static const i915_reg_t gen8_regs[] = {
>>> -        [RENDER_CLASS]            = GEN8_RTCR,
>>> -        [VIDEO_DECODE_CLASS]        = GEN8_M1TCR, /* , GEN8_M2TCR */
>>> -        [VIDEO_ENHANCEMENT_CLASS]    = GEN8_VTCR,
>>> -        [COPY_ENGINE_CLASS]        = GEN8_BTCR,
>>> -    };
>>> -    static const i915_reg_t gen12_regs[] = {
>>> -        [RENDER_CLASS]            = GEN12_GFX_TLB_INV_CR,
>>> -        [VIDEO_DECODE_CLASS]        = GEN12_VD_TLB_INV_CR,
>>> -        [VIDEO_ENHANCEMENT_CLASS]    = GEN12_VE_TLB_INV_CR,
>>> -        [COPY_ENGINE_CLASS]        = GEN12_BLT_TLB_INV_CR,
>>> -        [COMPUTE_CLASS]            = GEN12_COMPCTX_TLB_INV_CR,
>>> -    };
>>> -    static const i915_mcr_reg_t xehp_regs[] = {
>>> -        [RENDER_CLASS]            = XEHP_GFX_TLB_INV_CR,
>>> -        [VIDEO_DECODE_CLASS]        = XEHP_VD_TLB_INV_CR,
>>> -        [VIDEO_ENHANCEMENT_CLASS]    = XEHP_VE_TLB_INV_CR,
>>> -        [COPY_ENGINE_CLASS]        = XEHP_BLT_TLB_INV_CR,
>>> -        [COMPUTE_CLASS]            = XEHP_COMPCTX_TLB_INV_CR,
>>> -    };
>>>       struct drm_i915_private *i915 = gt->i915;
>>>       struct intel_uncore *uncore = gt->uncore;
>>>       struct intel_engine_cs *engine;
>>>       intel_engine_mask_t awake, tmp;
>>>       enum intel_engine_id id;
>>> -    const i915_reg_t *regs;
>>> -    unsigned int num = 0;
>>>       unsigned long flags;
>>> -    if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) {
>>> -        regs = NULL;
>>> -        num = ARRAY_SIZE(xehp_regs);
>>> -    } else if (GRAPHICS_VER(i915) == 12) {
>>> -        regs = gen12_regs;
>>> -        num = ARRAY_SIZE(gen12_regs);
>>> -    } else if (GRAPHICS_VER(i915) >= 8 && GRAPHICS_VER(i915) <= 11) {
>>> -        regs = gen8_regs;
>>> -        num = ARRAY_SIZE(gen8_regs);
>>> -    } else if (GRAPHICS_VER(i915) < 8) {
>>> -        return;
>>> -    }
>>> -
>>> -    if (drm_WARN_ONCE(&i915->drm, !num,
>>> -              "Platform does not implement TLB invalidation!"))
>>> +    if (GRAPHICS_VER(i915) < 8)
>>>           return;
>>>       intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL);
>>> @@ -1094,33 +1034,22 @@ static void mmio_invalidate_full(struct 
>>> intel_gt *gt)
>>>       awake = 0;
>>>       for_each_engine(engine, gt, id) {
>>> -        struct reg_and_bit rb;
>>> -
>>>           if (!intel_engine_pm_is_awake(engine))
>>>               continue;
>>> -        if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) {
>>> -            u32 val = BIT(engine->instance);
>>> +        if (drm_WARN_ON_ONCE(&i915->drm,
>>> +                     !(engine->flags & 
>>> I915_ENGINE_HAS_TLB_INVALIDATION)))
>>> +            continue;
>>
>> Hmm, can this flag change dynamically? If not why not put it in 
>> initialization phase.
> 
> Could do. In that case I couldn't have any asserts that engine->tlb_inv 
> is valid but perhaps that is okay. Could just fail engine probe from 
> there if register table is incomplete. Sounds better indeed, I'll change 
> it.
> 
>>
>>> -            if (engine->class == VIDEO_DECODE_CLASS ||
>>> -                engine->class == VIDEO_ENHANCEMENT_CLASS ||
>>> -                engine->class == COMPUTE_CLASS)
>>> -                val = _MASKED_BIT_ENABLE(val);
>>> +        if (engine->tlb_inv.mcr)
>>>               intel_gt_mcr_multicast_write_fw(gt,
>>> -                            xehp_regs[engine->class],
>>> -                            val);
>>> -        } else {
>>> -            rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num);
>>> -            if (!i915_mmio_reg_offset(rb.reg))
>>> -                continue;
>>> -
>>> -            if (GRAPHICS_VER(i915) == 12 && (engine->class == 
>>> VIDEO_DECODE_CLASS ||
>>> -                engine->class == VIDEO_ENHANCEMENT_CLASS ||
>>> -                engine->class == COMPUTE_CLASS))
>>> -                rb.bit = _MASKED_BIT_ENABLE(rb.bit);
>>> -
>>> -            intel_uncore_write_fw(uncore, rb.reg, rb.bit);
>>> -        }
>>> +                            engine->tlb_inv.reg.mcr_reg,
>>> +                            engine->tlb_inv.request);
>>> +        else
>>> +            intel_uncore_write_fw(uncore,
>>> +                          engine->tlb_inv.reg.reg,
>>> +                          engine->tlb_inv.request);
>>> +
>>>           awake |= engine->mask;
>>>       }
>>> @@ -1139,16 +1068,7 @@ static void mmio_invalidate_full(struct 
>>> intel_gt *gt)
>>>       intel_gt_mcr_unlock(gt, flags);
>>>       for_each_engine_masked(engine, gt, awake, tmp) {
>>> -        struct reg_and_bit rb;
>>> -
>>> -        if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) {
>>> -            rb.mcr_reg = xehp_regs[engine->class];
>>> -            rb.bit = BIT(engine->instance);
>>> -        } else {
>>> -            rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num);
>>> -        }
>>> -
>>> -        if (wait_for_invalidate(gt, rb))
>>> +        if (wait_for_invalidate(engine))
>>>               drm_err_ratelimited(&gt->i915->drm,
>>>                           "%s TLB invalidation did not complete in 
>>> %ums!\n",
>>>                           engine->name, TLB_INVAL_TIMEOUT_MS);
>>
>> With minor comments addressed:
>>
>> Reviewed-by: Andrzej Hajda <andrzej.hajda@intel.com>
> 
> I'll send v2 at some point, please stand by.
> 
> Regards,
> 
> Tvrtko
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 99c4b866addd..97cdd9853e38 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -1143,6 +1143,84 @@  static int init_status_page(struct intel_engine_cs *engine)
 	return ret;
 }
 
+static bool intel_engine_init_tlb_invalidation(struct intel_engine_cs *engine)
+{
+	static const union intel_engine_tlb_inv_reg gen8_regs[] = {
+		[RENDER_CLASS].reg		= GEN8_RTCR,
+		[VIDEO_DECODE_CLASS].reg	= GEN8_M1TCR, /* , GEN8_M2TCR */
+		[VIDEO_ENHANCEMENT_CLASS].reg	= GEN8_VTCR,
+		[COPY_ENGINE_CLASS].reg		= GEN8_BTCR,
+	};
+	static const union intel_engine_tlb_inv_reg gen12_regs[] = {
+		[RENDER_CLASS].reg		= GEN12_GFX_TLB_INV_CR,
+		[VIDEO_DECODE_CLASS].reg	= GEN12_VD_TLB_INV_CR,
+		[VIDEO_ENHANCEMENT_CLASS].reg	= GEN12_VE_TLB_INV_CR,
+		[COPY_ENGINE_CLASS].reg		= GEN12_BLT_TLB_INV_CR,
+		[COMPUTE_CLASS].reg		= GEN12_COMPCTX_TLB_INV_CR,
+	};
+	static const union intel_engine_tlb_inv_reg xehp_regs[] = {
+		[RENDER_CLASS].mcr_reg		  = XEHP_GFX_TLB_INV_CR,
+		[VIDEO_DECODE_CLASS].mcr_reg	  = XEHP_VD_TLB_INV_CR,
+		[VIDEO_ENHANCEMENT_CLASS].mcr_reg = XEHP_VE_TLB_INV_CR,
+		[COPY_ENGINE_CLASS].mcr_reg	  = XEHP_BLT_TLB_INV_CR,
+		[COMPUTE_CLASS].mcr_reg		  = XEHP_COMPCTX_TLB_INV_CR,
+	};
+	struct drm_i915_private *i915 = engine->i915;
+	const union intel_engine_tlb_inv_reg *regs;
+	union intel_engine_tlb_inv_reg reg;
+	unsigned int class = engine->class;
+	unsigned int num = 0;
+	u32 val;
+
+	if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) {
+		regs = xehp_regs;
+		num = ARRAY_SIZE(xehp_regs);
+	} else if (GRAPHICS_VER(i915) == 12) {
+		regs = gen12_regs;
+		num = ARRAY_SIZE(gen12_regs);
+	} else if (GRAPHICS_VER(i915) >= 8 && GRAPHICS_VER(i915) <= 11) {
+		regs = gen8_regs;
+		num = ARRAY_SIZE(gen8_regs);
+	} else if (GRAPHICS_VER(i915) < 8) {
+		return false;
+	}
+
+	if (drm_WARN_ONCE(&i915->drm, !num,
+			  "Platform does not implement TLB invalidation!"))
+		return false;
+
+	if (drm_WARN_ON_ONCE(&i915->drm,
+			     class >= num ||
+			     (!regs[class].reg.reg &&
+			      !regs[class].mcr_reg.reg)))
+		return false;
+
+	reg = regs[class];
+
+	if (GRAPHICS_VER(i915) == 8 && class == VIDEO_DECODE_CLASS) {
+		reg.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */
+		val = 0;
+	} else {
+		val = engine->instance;
+	}
+
+	val = BIT(val);
+
+	engine->tlb_inv.mcr = regs == xehp_regs;
+	engine->tlb_inv.reg = reg;
+	engine->tlb_inv.done = val;
+
+	if (GRAPHICS_VER(i915) >= 12 &&
+	    (engine->class == VIDEO_DECODE_CLASS ||
+	     engine->class == VIDEO_ENHANCEMENT_CLASS ||
+	     engine->class == COMPUTE_CLASS))
+		engine->tlb_inv.request = _MASKED_BIT_ENABLE(val);
+	else
+		engine->tlb_inv.request = val;
+
+	return true;
+}
+
 static int engine_setup_common(struct intel_engine_cs *engine)
 {
 	int err;
@@ -1182,6 +1260,9 @@  static int engine_setup_common(struct intel_engine_cs *engine)
 	intel_engine_init_whitelist(engine);
 	intel_engine_init_ctx_wa(engine);
 
+	if (intel_engine_init_tlb_invalidation(engine))
+		engine->flags |= I915_ENGINE_HAS_TLB_INVALIDATION;
+
 	if (GRAPHICS_VER(engine->i915) >= 12)
 		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
 
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index 4fd54fb8810f..8df4a09a6022 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -341,6 +341,19 @@  struct intel_engine_guc_stats {
 	u64 start_gt_clk;
 };
 
+union intel_engine_tlb_inv_reg {
+	i915_reg_t	reg;
+	i915_mcr_reg_t	mcr_reg;
+};
+
+struct intel_engine_tlb_inv
+{
+	bool mcr;
+	union intel_engine_tlb_inv_reg reg;
+	u32 request;
+	u32 done;
+};
+
 struct intel_engine_cs {
 	struct drm_i915_private *i915;
 	struct intel_gt *gt;
@@ -372,6 +385,8 @@  struct intel_engine_cs {
 	u32 context_size;
 	u32 mmio_base;
 
+	struct intel_engine_tlb_inv tlb_inv;
+
 	/*
 	 * Some w/a require forcewake to be held (which prevents RC6) while
 	 * a particular engine is active. If so, we set fw_domain to which
@@ -556,6 +571,7 @@  struct intel_engine_cs {
 #define I915_ENGINE_HAS_EU_PRIORITY    BIT(10)
 #define I915_ENGINE_FIRST_RENDER_COMPUTE BIT(11)
 #define I915_ENGINE_USES_WA_HOLD_CCS_SWITCHOUT BIT(12)
+#define I915_ENGINE_HAS_TLB_INVALIDATION BIT(13)
 	unsigned int flags;
 
 	/*
diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
index 7eeee5a7cb33..df7afff16fd6 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt.c
@@ -983,36 +983,6 @@  void intel_gt_info_print(const struct intel_gt_info *info,
 	intel_sseu_dump(&info->sseu, p);
 }
 
-struct reg_and_bit {
-	union {
-		i915_reg_t reg;
-		i915_mcr_reg_t mcr_reg;
-	};
-	u32 bit;
-};
-
-static struct reg_and_bit
-get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8,
-		const i915_reg_t *regs, const unsigned int num)
-{
-	const unsigned int class = engine->class;
-	struct reg_and_bit rb = { };
-
-	if (drm_WARN_ON_ONCE(&engine->i915->drm,
-			     class >= num || !regs[class].reg))
-		return rb;
-
-	rb.reg = regs[class];
-	if (gen8 && class == VIDEO_DECODE_CLASS)
-		rb.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */
-	else
-		rb.bit = engine->instance;
-
-	rb.bit = BIT(rb.bit);
-
-	return rb;
-}
-
 /*
  * HW architecture suggest typical invalidation time at 40us,
  * with pessimistic cases up to 100us and a recommendation to
@@ -1026,14 +996,20 @@  get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8,
  * but are now considered MCR registers.  Since they exist within a GAM range,
  * the primary instance of the register rolls up the status from each unit.
  */
-static int wait_for_invalidate(struct intel_gt *gt, struct reg_and_bit rb)
+static int wait_for_invalidate(struct intel_engine_cs *engine)
 {
-	if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 50))
-		return intel_gt_mcr_wait_for_reg(gt, rb.mcr_reg, rb.bit, 0,
+	if (engine->tlb_inv.mcr)
+		return intel_gt_mcr_wait_for_reg(engine->gt,
+						 engine->tlb_inv.reg.mcr_reg,
+						 engine->tlb_inv.done,
+						 0,
 						 TLB_INVAL_TIMEOUT_US,
 						 TLB_INVAL_TIMEOUT_MS);
 	else
-		return __intel_wait_for_register_fw(gt->uncore, rb.reg, rb.bit, 0,
+		return __intel_wait_for_register_fw(engine->gt->uncore,
+						    engine->tlb_inv.reg.reg,
+						    engine->tlb_inv.done,
+						    0,
 						    TLB_INVAL_TIMEOUT_US,
 						    TLB_INVAL_TIMEOUT_MS,
 						    NULL);
@@ -1041,50 +1017,14 @@  static int wait_for_invalidate(struct intel_gt *gt, struct reg_and_bit rb)
 
 static void mmio_invalidate_full(struct intel_gt *gt)
 {
-	static const i915_reg_t gen8_regs[] = {
-		[RENDER_CLASS]			= GEN8_RTCR,
-		[VIDEO_DECODE_CLASS]		= GEN8_M1TCR, /* , GEN8_M2TCR */
-		[VIDEO_ENHANCEMENT_CLASS]	= GEN8_VTCR,
-		[COPY_ENGINE_CLASS]		= GEN8_BTCR,
-	};
-	static const i915_reg_t gen12_regs[] = {
-		[RENDER_CLASS]			= GEN12_GFX_TLB_INV_CR,
-		[VIDEO_DECODE_CLASS]		= GEN12_VD_TLB_INV_CR,
-		[VIDEO_ENHANCEMENT_CLASS]	= GEN12_VE_TLB_INV_CR,
-		[COPY_ENGINE_CLASS]		= GEN12_BLT_TLB_INV_CR,
-		[COMPUTE_CLASS]			= GEN12_COMPCTX_TLB_INV_CR,
-	};
-	static const i915_mcr_reg_t xehp_regs[] = {
-		[RENDER_CLASS]			= XEHP_GFX_TLB_INV_CR,
-		[VIDEO_DECODE_CLASS]		= XEHP_VD_TLB_INV_CR,
-		[VIDEO_ENHANCEMENT_CLASS]	= XEHP_VE_TLB_INV_CR,
-		[COPY_ENGINE_CLASS]		= XEHP_BLT_TLB_INV_CR,
-		[COMPUTE_CLASS]			= XEHP_COMPCTX_TLB_INV_CR,
-	};
 	struct drm_i915_private *i915 = gt->i915;
 	struct intel_uncore *uncore = gt->uncore;
 	struct intel_engine_cs *engine;
 	intel_engine_mask_t awake, tmp;
 	enum intel_engine_id id;
-	const i915_reg_t *regs;
-	unsigned int num = 0;
 	unsigned long flags;
 
-	if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) {
-		regs = NULL;
-		num = ARRAY_SIZE(xehp_regs);
-	} else if (GRAPHICS_VER(i915) == 12) {
-		regs = gen12_regs;
-		num = ARRAY_SIZE(gen12_regs);
-	} else if (GRAPHICS_VER(i915) >= 8 && GRAPHICS_VER(i915) <= 11) {
-		regs = gen8_regs;
-		num = ARRAY_SIZE(gen8_regs);
-	} else if (GRAPHICS_VER(i915) < 8) {
-		return;
-	}
-
-	if (drm_WARN_ONCE(&i915->drm, !num,
-			  "Platform does not implement TLB invalidation!"))
+	if (GRAPHICS_VER(i915) < 8)
 		return;
 
 	intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL);
@@ -1094,33 +1034,22 @@  static void mmio_invalidate_full(struct intel_gt *gt)
 
 	awake = 0;
 	for_each_engine(engine, gt, id) {
-		struct reg_and_bit rb;
-
 		if (!intel_engine_pm_is_awake(engine))
 			continue;
 
-		if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) {
-			u32 val = BIT(engine->instance);
+		if (drm_WARN_ON_ONCE(&i915->drm,
+				     !(engine->flags & I915_ENGINE_HAS_TLB_INVALIDATION)))
+			continue;
 
-			if (engine->class == VIDEO_DECODE_CLASS ||
-			    engine->class == VIDEO_ENHANCEMENT_CLASS ||
-			    engine->class == COMPUTE_CLASS)
-				val = _MASKED_BIT_ENABLE(val);
+		if (engine->tlb_inv.mcr)
 			intel_gt_mcr_multicast_write_fw(gt,
-							xehp_regs[engine->class],
-							val);
-		} else {
-			rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num);
-			if (!i915_mmio_reg_offset(rb.reg))
-				continue;
-
-			if (GRAPHICS_VER(i915) == 12 && (engine->class == VIDEO_DECODE_CLASS ||
-			    engine->class == VIDEO_ENHANCEMENT_CLASS ||
-			    engine->class == COMPUTE_CLASS))
-				rb.bit = _MASKED_BIT_ENABLE(rb.bit);
-
-			intel_uncore_write_fw(uncore, rb.reg, rb.bit);
-		}
+							engine->tlb_inv.reg.mcr_reg,
+							engine->tlb_inv.request);
+		else
+			intel_uncore_write_fw(uncore,
+					      engine->tlb_inv.reg.reg,
+					      engine->tlb_inv.request);
+
 		awake |= engine->mask;
 	}
 
@@ -1139,16 +1068,7 @@  static void mmio_invalidate_full(struct intel_gt *gt)
 	intel_gt_mcr_unlock(gt, flags);
 
 	for_each_engine_masked(engine, gt, awake, tmp) {
-		struct reg_and_bit rb;
-
-		if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) {
-			rb.mcr_reg = xehp_regs[engine->class];
-			rb.bit = BIT(engine->instance);
-		} else {
-			rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num);
-		}
-
-		if (wait_for_invalidate(gt, rb))
+		if (wait_for_invalidate(engine))
 			drm_err_ratelimited(&gt->i915->drm,
 					    "%s TLB invalidation did not complete in %ums!\n",
 					    engine->name, TLB_INVAL_TIMEOUT_MS);