Message ID | 20221207173630.973662-2-andrzej.hajda@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [1/2] drm/i915: fix TLB invalidation for Gen12.50 video and compute engines | expand |
On 07/12/2022 17:36, Andrzej Hajda wrote: > Whole register/bit selection logic has been moved to separate helper. Why is missing. > Signed-off-by: Andrzej Hajda <andrzej.hajda@intel.com> > --- > drivers/gpu/drm/i915/gt/intel_gt.c | 136 +++++++++++------------------ > 1 file changed, 51 insertions(+), 85 deletions(-) Diffstat suggests because more streamlined code. Any other reason? > diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c > index f0224b607aa4a7..05520ec3264db8 100644 > --- a/drivers/gpu/drm/i915/gt/intel_gt.c > +++ b/drivers/gpu/drm/i915/gt/intel_gt.c > @@ -1003,32 +1003,59 @@ void intel_gt_info_print(const struct intel_gt_info *info, > intel_sseu_dump(&info->sseu, p); > } > > -struct reg_and_bit { > +struct reg_and_bits { > union { > i915_reg_t reg; > i915_mcr_reg_t mcr_reg; > }; > - u32 bit; > + u32 bits; > }; > > -static struct reg_and_bit > -get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8, > - const i915_reg_t *regs, const unsigned int num) > +static struct reg_and_bits > +get_tlb_inv_reg_and_bits(const struct intel_engine_cs *engine, bool write) > { > + static const i915_reg_t gen8_regs[MAX_ENGINE_CLASS + 1] = { > + [RENDER_CLASS] = GEN8_RTCR, > + [VIDEO_DECODE_CLASS] = GEN8_M1TCR, > + [VIDEO_ENHANCEMENT_CLASS] = GEN8_VTCR, > + [COPY_ENGINE_CLASS] = GEN8_BTCR, > + }; > + static const i915_reg_t gen12_regs[MAX_ENGINE_CLASS + 1] = { > + [RENDER_CLASS] = GEN12_GFX_TLB_INV_CR, > + [VIDEO_DECODE_CLASS] = GEN12_VD_TLB_INV_CR, > + [VIDEO_ENHANCEMENT_CLASS] = GEN12_VE_TLB_INV_CR, > + [COPY_ENGINE_CLASS] = GEN12_BLT_TLB_INV_CR, > + [COMPUTE_CLASS] = GEN12_COMPCTX_TLB_INV_CR, > + }; > + static const i915_mcr_reg_t xehp_regs[MAX_ENGINE_CLASS + 1] = { > + [RENDER_CLASS] = XEHP_GFX_TLB_INV_CR, > + [VIDEO_DECODE_CLASS] = XEHP_VD_TLB_INV_CR, > + [VIDEO_ENHANCEMENT_CLASS] = XEHP_VE_TLB_INV_CR, > + [COPY_ENGINE_CLASS] = XEHP_BLT_TLB_INV_CR, > + [COMPUTE_CLASS] = XEHP_COMPCTX_TLB_INV_CR, > + }; > const unsigned int class = engine->class; > - struct reg_and_bit rb = { }; > + struct reg_and_bits rb = { .bits = BIT(engine->instance) }; > > - if (drm_WARN_ON_ONCE(&engine->i915->drm, > - class >= num || !regs[class].reg)) > + if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50)) > + rb.mcr_reg = xehp_regs[class]; > + else if (GRAPHICS_VER(engine->i915) >= 12) > + rb.reg = gen12_regs[class]; > + else if (GRAPHICS_VER(engine->i915) >= 8) > + rb.reg = gen8_regs[class]; > + > + if (drm_WARN_ON_ONCE(&engine->i915->drm, !i915_mmio_reg_offset(rb.reg))) I'd prefer user readable message was kept but not a blocker. > return rb; > > - rb.reg = regs[class]; > - if (gen8 && class == VIDEO_DECODE_CLASS) > - rb.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */ > - else > - rb.bit = engine->instance; > + if (GRAPHICS_VER(engine->i915) < 12 && class == VIDEO_DECODE_CLASS) { > + rb.bits = 1; > + rb.reg.reg += 4 * engine->instance; No reason to drop the comment IMO. It explains things somewhat, or at least provides a hint. > + } > > - rb.bit = BIT(rb.bit); > + if (write && GRAPHICS_VER(engine->i915) >= 12 && > + (class == VIDEO_DECODE_CLASS || class == VIDEO_ENHANCEMENT_CLASS || > + class == COMPUTE_CLASS)) > + rb.bits = _MASKED_BIT_ENABLE(rb.bits); This could be else if to not have < 12 followed by explicit >= 12, but perhaps it is clearer like this, to signify it's two completely separate quirks. Also, I would perhaps consider having a local i915 since there's a good number of engine->i915, but it's up to you what looks nicer. > > return rb; > } > @@ -1046,14 +1073,14 @@ get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8, > * but are now considered MCR registers. Since they exist within a GAM range, > * the primary instance of the register rolls up the status from each unit. > */ > -static int wait_for_invalidate(struct intel_gt *gt, struct reg_and_bit rb) > +static int wait_for_invalidate(struct intel_gt *gt, struct reg_and_bits rb) > { > if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 50)) > - return intel_gt_mcr_wait_for_reg(gt, rb.mcr_reg, rb.bit, 0, > + return intel_gt_mcr_wait_for_reg(gt, rb.mcr_reg, rb.bits, 0, > TLB_INVAL_TIMEOUT_US, > TLB_INVAL_TIMEOUT_MS); > else > - return __intel_wait_for_register_fw(gt->uncore, rb.reg, rb.bit, 0, > + return __intel_wait_for_register_fw(gt->uncore, rb.reg, rb.bits, 0, > TLB_INVAL_TIMEOUT_US, > TLB_INVAL_TIMEOUT_MS, > NULL); > @@ -1061,50 +1088,14 @@ static int wait_for_invalidate(struct intel_gt *gt, struct reg_and_bit rb) > > static void mmio_invalidate_full(struct intel_gt *gt) > { > - static const i915_reg_t gen8_regs[] = { > - [RENDER_CLASS] = GEN8_RTCR, > - [VIDEO_DECODE_CLASS] = GEN8_M1TCR, /* , GEN8_M2TCR */ > - [VIDEO_ENHANCEMENT_CLASS] = GEN8_VTCR, > - [COPY_ENGINE_CLASS] = GEN8_BTCR, > - }; > - static const i915_reg_t gen12_regs[] = { > - [RENDER_CLASS] = GEN12_GFX_TLB_INV_CR, > - [VIDEO_DECODE_CLASS] = GEN12_VD_TLB_INV_CR, > - [VIDEO_ENHANCEMENT_CLASS] = GEN12_VE_TLB_INV_CR, > - [COPY_ENGINE_CLASS] = GEN12_BLT_TLB_INV_CR, > - [COMPUTE_CLASS] = GEN12_COMPCTX_TLB_INV_CR, > - }; > - static const i915_mcr_reg_t xehp_regs[] = { > - [RENDER_CLASS] = XEHP_GFX_TLB_INV_CR, > - [VIDEO_DECODE_CLASS] = XEHP_VD_TLB_INV_CR, > - [VIDEO_ENHANCEMENT_CLASS] = XEHP_VE_TLB_INV_CR, > - [COPY_ENGINE_CLASS] = XEHP_BLT_TLB_INV_CR, > - [COMPUTE_CLASS] = XEHP_COMPCTX_TLB_INV_CR, > - }; > struct drm_i915_private *i915 = gt->i915; > struct intel_uncore *uncore = gt->uncore; > struct intel_engine_cs *engine; > intel_engine_mask_t awake, tmp; > enum intel_engine_id id; > - const i915_reg_t *regs; > - unsigned int num = 0; > unsigned long flags; > > - if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) { > - regs = NULL; > - num = ARRAY_SIZE(xehp_regs); > - } else if (GRAPHICS_VER(i915) == 12) { > - regs = gen12_regs; > - num = ARRAY_SIZE(gen12_regs); > - } else if (GRAPHICS_VER(i915) >= 8 && GRAPHICS_VER(i915) <= 11) { > - regs = gen8_regs; > - num = ARRAY_SIZE(gen8_regs); > - } else if (GRAPHICS_VER(i915) < 8) { > - return; > - } > - > - if (drm_WARN_ONCE(&i915->drm, !num, > - "Platform does not implement TLB invalidation!")) > + if (GRAPHICS_VER(i915) < 8) > return; > > intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL); > @@ -1114,33 +1105,15 @@ static void mmio_invalidate_full(struct intel_gt *gt) > > awake = 0; > for_each_engine(engine, gt, id) { > - struct reg_and_bit rb; > + struct reg_and_bits rb = get_tlb_inv_reg_and_bits(engine, true); Ugh so actually what was a once per invalidation lookup is now repeated per engine, times two. I wonder if we can do this better. Lets think about it a bit. Regards, Tvrtko > > if (!intel_engine_pm_is_awake(engine)) > continue; > > - if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) { > - u32 val = BIT(engine->instance); > - > - if (engine->class == VIDEO_DECODE_CLASS || > - engine->class == VIDEO_ENHANCEMENT_CLASS || > - engine->class == COMPUTE_CLASS) > - val = _MASKED_BIT_ENABLE(val); > - intel_gt_mcr_multicast_write_fw(gt, > - xehp_regs[engine->class], > - val); > - } else { > - rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num); > - if (!i915_mmio_reg_offset(rb.reg)) > - continue; > - > - if (GRAPHICS_VER(i915) == 12 && (engine->class == VIDEO_DECODE_CLASS || > - engine->class == VIDEO_ENHANCEMENT_CLASS || > - engine->class == COMPUTE_CLASS)) > - rb.bit = _MASKED_BIT_ENABLE(rb.bit); > - > - intel_uncore_write_fw(uncore, rb.reg, rb.bit); > - } > + if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) > + intel_gt_mcr_multicast_write_fw(gt, rb.mcr_reg, rb.bits); > + else > + intel_uncore_write_fw(uncore, rb.reg, rb.bits); > awake |= engine->mask; > } > > @@ -1159,14 +1132,7 @@ static void mmio_invalidate_full(struct intel_gt *gt) > intel_gt_mcr_unlock(gt, flags); > > for_each_engine_masked(engine, gt, awake, tmp) { > - struct reg_and_bit rb; > - > - if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) { > - rb.mcr_reg = xehp_regs[engine->class]; > - rb.bit = BIT(engine->instance); > - } else { > - rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num); > - } > + struct reg_and_bits rb = get_tlb_inv_reg_and_bits(engine, false); > > if (wait_for_invalidate(gt, rb)) > drm_err_ratelimited(>->i915->drm,
On 09.12.2022 11:16, Tvrtko Ursulin wrote: > > On 07/12/2022 17:36, Andrzej Hajda wrote: >> Whole register/bit selection logic has been moved to separate helper. > > Why is missing. ...to clean up mmio_invalidate_full function. Will add. > >> Signed-off-by: Andrzej Hajda <andrzej.hajda@intel.com> >> --- >> drivers/gpu/drm/i915/gt/intel_gt.c | 136 +++++++++++------------------ >> 1 file changed, 51 insertions(+), 85 deletions(-) > > Diffstat suggests because more streamlined code. Any other reason? > >> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c >> b/drivers/gpu/drm/i915/gt/intel_gt.c >> index f0224b607aa4a7..05520ec3264db8 100644 >> --- a/drivers/gpu/drm/i915/gt/intel_gt.c >> +++ b/drivers/gpu/drm/i915/gt/intel_gt.c >> @@ -1003,32 +1003,59 @@ void intel_gt_info_print(const struct >> intel_gt_info *info, >> intel_sseu_dump(&info->sseu, p); >> } >> -struct reg_and_bit { >> +struct reg_and_bits { >> union { >> i915_reg_t reg; >> i915_mcr_reg_t mcr_reg; >> }; >> - u32 bit; >> + u32 bits; >> }; >> -static struct reg_and_bit >> -get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8, >> - const i915_reg_t *regs, const unsigned int num) >> +static struct reg_and_bits >> +get_tlb_inv_reg_and_bits(const struct intel_engine_cs *engine, bool >> write) >> { >> + static const i915_reg_t gen8_regs[MAX_ENGINE_CLASS + 1] = { >> + [RENDER_CLASS] = GEN8_RTCR, >> + [VIDEO_DECODE_CLASS] = GEN8_M1TCR, >> + [VIDEO_ENHANCEMENT_CLASS] = GEN8_VTCR, >> + [COPY_ENGINE_CLASS] = GEN8_BTCR, >> + }; >> + static const i915_reg_t gen12_regs[MAX_ENGINE_CLASS + 1] = { >> + [RENDER_CLASS] = GEN12_GFX_TLB_INV_CR, >> + [VIDEO_DECODE_CLASS] = GEN12_VD_TLB_INV_CR, >> + [VIDEO_ENHANCEMENT_CLASS] = GEN12_VE_TLB_INV_CR, >> + [COPY_ENGINE_CLASS] = GEN12_BLT_TLB_INV_CR, >> + [COMPUTE_CLASS] = GEN12_COMPCTX_TLB_INV_CR, >> + }; >> + static const i915_mcr_reg_t xehp_regs[MAX_ENGINE_CLASS + 1] = { >> + [RENDER_CLASS] = XEHP_GFX_TLB_INV_CR, >> + [VIDEO_DECODE_CLASS] = XEHP_VD_TLB_INV_CR, >> + [VIDEO_ENHANCEMENT_CLASS] = XEHP_VE_TLB_INV_CR, >> + [COPY_ENGINE_CLASS] = XEHP_BLT_TLB_INV_CR, >> + [COMPUTE_CLASS] = XEHP_COMPCTX_TLB_INV_CR, >> + }; >> const unsigned int class = engine->class; >> - struct reg_and_bit rb = { }; >> + struct reg_and_bits rb = { .bits = BIT(engine->instance) }; >> - if (drm_WARN_ON_ONCE(&engine->i915->drm, >> - class >= num || !regs[class].reg)) >> + if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50)) >> + rb.mcr_reg = xehp_regs[class]; >> + else if (GRAPHICS_VER(engine->i915) >= 12) >> + rb.reg = gen12_regs[class]; >> + else if (GRAPHICS_VER(engine->i915) >= 8) >> + rb.reg = gen8_regs[class]; >> + >> + if (drm_WARN_ON_ONCE(&engine->i915->drm, >> !i915_mmio_reg_offset(rb.reg))) > > I'd prefer user readable message was kept but not a blocker. Tried to avoid changes in refactoring, will change. > >> return rb; >> - rb.reg = regs[class]; >> - if (gen8 && class == VIDEO_DECODE_CLASS) >> - rb.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */ >> - else >> - rb.bit = engine->instance; >> + if (GRAPHICS_VER(engine->i915) < 12 && class == >> VIDEO_DECODE_CLASS) { >> + rb.bits = 1; >> + rb.reg.reg += 4 * engine->instance; > > No reason to drop the comment IMO. It explains things somewhat, or at > least provides a hint. OK > >> + } >> - rb.bit = BIT(rb.bit); >> + if (write && GRAPHICS_VER(engine->i915) >= 12 && >> + (class == VIDEO_DECODE_CLASS || class == >> VIDEO_ENHANCEMENT_CLASS || >> + class == COMPUTE_CLASS)) >> + rb.bits = _MASKED_BIT_ENABLE(rb.bits); > > This could be else if to not have < 12 followed by explicit >= 12, but > perhaps it is clearer like this, to signify it's two completely > separate quirks. > > Also, I would perhaps consider having a local i915 since there's a > good number of engine->i915, but it's up to you what looks nicer. OK > >> return rb; >> } >> @@ -1046,14 +1073,14 @@ get_reg_and_bit(const struct intel_engine_cs >> *engine, const bool gen8, >> * but are now considered MCR registers. Since they exist within a >> GAM range, >> * the primary instance of the register rolls up the status from >> each unit. >> */ >> -static int wait_for_invalidate(struct intel_gt *gt, struct >> reg_and_bit rb) >> +static int wait_for_invalidate(struct intel_gt *gt, struct >> reg_and_bits rb) >> { >> if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 50)) >> - return intel_gt_mcr_wait_for_reg(gt, rb.mcr_reg, rb.bit, 0, >> + return intel_gt_mcr_wait_for_reg(gt, rb.mcr_reg, rb.bits, 0, >> TLB_INVAL_TIMEOUT_US, >> TLB_INVAL_TIMEOUT_MS); >> else >> - return __intel_wait_for_register_fw(gt->uncore, rb.reg, >> rb.bit, 0, >> + return __intel_wait_for_register_fw(gt->uncore, rb.reg, >> rb.bits, 0, >> TLB_INVAL_TIMEOUT_US, >> TLB_INVAL_TIMEOUT_MS, >> NULL); >> @@ -1061,50 +1088,14 @@ static int wait_for_invalidate(struct >> intel_gt *gt, struct reg_and_bit rb) >> static void mmio_invalidate_full(struct intel_gt *gt) >> { >> - static const i915_reg_t gen8_regs[] = { >> - [RENDER_CLASS] = GEN8_RTCR, >> - [VIDEO_DECODE_CLASS] = GEN8_M1TCR, /* , GEN8_M2TCR */ >> - [VIDEO_ENHANCEMENT_CLASS] = GEN8_VTCR, >> - [COPY_ENGINE_CLASS] = GEN8_BTCR, >> - }; >> - static const i915_reg_t gen12_regs[] = { >> - [RENDER_CLASS] = GEN12_GFX_TLB_INV_CR, >> - [VIDEO_DECODE_CLASS] = GEN12_VD_TLB_INV_CR, >> - [VIDEO_ENHANCEMENT_CLASS] = GEN12_VE_TLB_INV_CR, >> - [COPY_ENGINE_CLASS] = GEN12_BLT_TLB_INV_CR, >> - [COMPUTE_CLASS] = GEN12_COMPCTX_TLB_INV_CR, >> - }; >> - static const i915_mcr_reg_t xehp_regs[] = { >> - [RENDER_CLASS] = XEHP_GFX_TLB_INV_CR, >> - [VIDEO_DECODE_CLASS] = XEHP_VD_TLB_INV_CR, >> - [VIDEO_ENHANCEMENT_CLASS] = XEHP_VE_TLB_INV_CR, >> - [COPY_ENGINE_CLASS] = XEHP_BLT_TLB_INV_CR, >> - [COMPUTE_CLASS] = XEHP_COMPCTX_TLB_INV_CR, >> - }; >> struct drm_i915_private *i915 = gt->i915; >> struct intel_uncore *uncore = gt->uncore; >> struct intel_engine_cs *engine; >> intel_engine_mask_t awake, tmp; >> enum intel_engine_id id; >> - const i915_reg_t *regs; >> - unsigned int num = 0; >> unsigned long flags; >> - if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) { >> - regs = NULL; >> - num = ARRAY_SIZE(xehp_regs); >> - } else if (GRAPHICS_VER(i915) == 12) { >> - regs = gen12_regs; >> - num = ARRAY_SIZE(gen12_regs); >> - } else if (GRAPHICS_VER(i915) >= 8 && GRAPHICS_VER(i915) <= 11) { >> - regs = gen8_regs; >> - num = ARRAY_SIZE(gen8_regs); >> - } else if (GRAPHICS_VER(i915) < 8) { >> - return; >> - } >> - >> - if (drm_WARN_ONCE(&i915->drm, !num, >> - "Platform does not implement TLB invalidation!")) >> + if (GRAPHICS_VER(i915) < 8) >> return; >> intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL); >> @@ -1114,33 +1105,15 @@ static void mmio_invalidate_full(struct >> intel_gt *gt) >> awake = 0; >> for_each_engine(engine, gt, id) { >> - struct reg_and_bit rb; >> + struct reg_and_bits rb = get_tlb_inv_reg_and_bits(engine, >> true); > > Ugh so actually what was a once per invalidation lookup is now > repeated per engine, times two. I wonder if we can do this better. > Lets think about it a bit. It was always twice, see below. > > Regards, > > Tvrtko > >> if (!intel_engine_pm_is_awake(engine)) >> continue; >> - if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) { >> - u32 val = BIT(engine->instance); >> - >> - if (engine->class == VIDEO_DECODE_CLASS || >> - engine->class == VIDEO_ENHANCEMENT_CLASS || >> - engine->class == COMPUTE_CLASS) >> - val = _MASKED_BIT_ENABLE(val); >> - intel_gt_mcr_multicast_write_fw(gt, >> - xehp_regs[engine->class], >> - val); >> - } else { >> - rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num); Here is the 2nd call, from old code. Since there are two separate loops there are two calls, caching call results would be overkill IMO. Or I can put back whole logic to mmio_invalidate_full, GEN12 quirk is needed only in 1st loop (write), the only redundancy will be with GEN8 quirk, which could be handled with some helper. Is it worth trying? I guess it is no big gain. Regards Andrzej >> - if (!i915_mmio_reg_offset(rb.reg)) >> - continue; >> - >> - if (GRAPHICS_VER(i915) == 12 && (engine->class == >> VIDEO_DECODE_CLASS || >> - engine->class == VIDEO_ENHANCEMENT_CLASS || >> - engine->class == COMPUTE_CLASS)) >> - rb.bit = _MASKED_BIT_ENABLE(rb.bit); >> - >> - intel_uncore_write_fw(uncore, rb.reg, rb.bit); >> - } >> + if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) >> + intel_gt_mcr_multicast_write_fw(gt, rb.mcr_reg, rb.bits); >> + else >> + intel_uncore_write_fw(uncore, rb.reg, rb.bits); >> awake |= engine->mask; >> } >> @@ -1159,14 +1132,7 @@ static void mmio_invalidate_full(struct >> intel_gt *gt) >> intel_gt_mcr_unlock(gt, flags); >> for_each_engine_masked(engine, gt, awake, tmp) { >> - struct reg_and_bit rb; >> - >> - if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) { >> - rb.mcr_reg = xehp_regs[engine->class]; >> - rb.bit = BIT(engine->instance); >> - } else { >> - rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num); >> - } >> + struct reg_and_bits rb = get_tlb_inv_reg_and_bits(engine, >> false); >> if (wait_for_invalidate(gt, rb)) >> drm_err_ratelimited(>->i915->drm,
On 09/12/2022 11:33, Andrzej Hajda wrote: > > > On 09.12.2022 11:16, Tvrtko Ursulin wrote: >> >> On 07/12/2022 17:36, Andrzej Hajda wrote: >>> Whole register/bit selection logic has been moved to separate helper. >> >> Why is missing. > > ...to clean up mmio_invalidate_full function. > > Will add. > >> >>> Signed-off-by: Andrzej Hajda <andrzej.hajda@intel.com> >>> --- >>> drivers/gpu/drm/i915/gt/intel_gt.c | 136 +++++++++++------------------ >>> 1 file changed, 51 insertions(+), 85 deletions(-) >> >> Diffstat suggests because more streamlined code. Any other reason? >> >>> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c >>> b/drivers/gpu/drm/i915/gt/intel_gt.c >>> index f0224b607aa4a7..05520ec3264db8 100644 >>> --- a/drivers/gpu/drm/i915/gt/intel_gt.c >>> +++ b/drivers/gpu/drm/i915/gt/intel_gt.c >>> @@ -1003,32 +1003,59 @@ void intel_gt_info_print(const struct >>> intel_gt_info *info, >>> intel_sseu_dump(&info->sseu, p); >>> } >>> -struct reg_and_bit { >>> +struct reg_and_bits { >>> union { >>> i915_reg_t reg; >>> i915_mcr_reg_t mcr_reg; >>> }; >>> - u32 bit; >>> + u32 bits; >>> }; >>> -static struct reg_and_bit >>> -get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8, >>> - const i915_reg_t *regs, const unsigned int num) >>> +static struct reg_and_bits >>> +get_tlb_inv_reg_and_bits(const struct intel_engine_cs *engine, bool >>> write) >>> { >>> + static const i915_reg_t gen8_regs[MAX_ENGINE_CLASS + 1] = { >>> + [RENDER_CLASS] = GEN8_RTCR, >>> + [VIDEO_DECODE_CLASS] = GEN8_M1TCR, >>> + [VIDEO_ENHANCEMENT_CLASS] = GEN8_VTCR, >>> + [COPY_ENGINE_CLASS] = GEN8_BTCR, >>> + }; >>> + static const i915_reg_t gen12_regs[MAX_ENGINE_CLASS + 1] = { >>> + [RENDER_CLASS] = GEN12_GFX_TLB_INV_CR, >>> + [VIDEO_DECODE_CLASS] = GEN12_VD_TLB_INV_CR, >>> + [VIDEO_ENHANCEMENT_CLASS] = GEN12_VE_TLB_INV_CR, >>> + [COPY_ENGINE_CLASS] = GEN12_BLT_TLB_INV_CR, >>> + [COMPUTE_CLASS] = GEN12_COMPCTX_TLB_INV_CR, >>> + }; >>> + static const i915_mcr_reg_t xehp_regs[MAX_ENGINE_CLASS + 1] = { >>> + [RENDER_CLASS] = XEHP_GFX_TLB_INV_CR, >>> + [VIDEO_DECODE_CLASS] = XEHP_VD_TLB_INV_CR, >>> + [VIDEO_ENHANCEMENT_CLASS] = XEHP_VE_TLB_INV_CR, >>> + [COPY_ENGINE_CLASS] = XEHP_BLT_TLB_INV_CR, >>> + [COMPUTE_CLASS] = XEHP_COMPCTX_TLB_INV_CR, >>> + }; >>> const unsigned int class = engine->class; >>> - struct reg_and_bit rb = { }; >>> + struct reg_and_bits rb = { .bits = BIT(engine->instance) }; >>> - if (drm_WARN_ON_ONCE(&engine->i915->drm, >>> - class >= num || !regs[class].reg)) >>> + if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50)) >>> + rb.mcr_reg = xehp_regs[class]; >>> + else if (GRAPHICS_VER(engine->i915) >= 12) >>> + rb.reg = gen12_regs[class]; >>> + else if (GRAPHICS_VER(engine->i915) >= 8) >>> + rb.reg = gen8_regs[class]; >>> + >>> + if (drm_WARN_ON_ONCE(&engine->i915->drm, >>> !i915_mmio_reg_offset(rb.reg))) >> >> I'd prefer user readable message was kept but not a blocker. > > Tried to avoid changes in refactoring, will change. > >> >>> return rb; >>> - rb.reg = regs[class]; >>> - if (gen8 && class == VIDEO_DECODE_CLASS) >>> - rb.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */ >>> - else >>> - rb.bit = engine->instance; >>> + if (GRAPHICS_VER(engine->i915) < 12 && class == >>> VIDEO_DECODE_CLASS) { >>> + rb.bits = 1; >>> + rb.reg.reg += 4 * engine->instance; >> >> No reason to drop the comment IMO. It explains things somewhat, or at >> least provides a hint. > > OK > >> >>> + } >>> - rb.bit = BIT(rb.bit); >>> + if (write && GRAPHICS_VER(engine->i915) >= 12 && >>> + (class == VIDEO_DECODE_CLASS || class == >>> VIDEO_ENHANCEMENT_CLASS || >>> + class == COMPUTE_CLASS)) >>> + rb.bits = _MASKED_BIT_ENABLE(rb.bits); >> >> This could be else if to not have < 12 followed by explicit >= 12, but >> perhaps it is clearer like this, to signify it's two completely >> separate quirks. >> >> Also, I would perhaps consider having a local i915 since there's a >> good number of engine->i915, but it's up to you what looks nicer. > > OK > >> >>> return rb; >>> } >>> @@ -1046,14 +1073,14 @@ get_reg_and_bit(const struct intel_engine_cs >>> *engine, const bool gen8, >>> * but are now considered MCR registers. Since they exist within a >>> GAM range, >>> * the primary instance of the register rolls up the status from >>> each unit. >>> */ >>> -static int wait_for_invalidate(struct intel_gt *gt, struct >>> reg_and_bit rb) >>> +static int wait_for_invalidate(struct intel_gt *gt, struct >>> reg_and_bits rb) >>> { >>> if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 50)) >>> - return intel_gt_mcr_wait_for_reg(gt, rb.mcr_reg, rb.bit, 0, >>> + return intel_gt_mcr_wait_for_reg(gt, rb.mcr_reg, rb.bits, 0, >>> TLB_INVAL_TIMEOUT_US, >>> TLB_INVAL_TIMEOUT_MS); >>> else >>> - return __intel_wait_for_register_fw(gt->uncore, rb.reg, >>> rb.bit, 0, >>> + return __intel_wait_for_register_fw(gt->uncore, rb.reg, >>> rb.bits, 0, >>> TLB_INVAL_TIMEOUT_US, >>> TLB_INVAL_TIMEOUT_MS, >>> NULL); >>> @@ -1061,50 +1088,14 @@ static int wait_for_invalidate(struct >>> intel_gt *gt, struct reg_and_bit rb) >>> static void mmio_invalidate_full(struct intel_gt *gt) >>> { >>> - static const i915_reg_t gen8_regs[] = { >>> - [RENDER_CLASS] = GEN8_RTCR, >>> - [VIDEO_DECODE_CLASS] = GEN8_M1TCR, /* , GEN8_M2TCR */ >>> - [VIDEO_ENHANCEMENT_CLASS] = GEN8_VTCR, >>> - [COPY_ENGINE_CLASS] = GEN8_BTCR, >>> - }; >>> - static const i915_reg_t gen12_regs[] = { >>> - [RENDER_CLASS] = GEN12_GFX_TLB_INV_CR, >>> - [VIDEO_DECODE_CLASS] = GEN12_VD_TLB_INV_CR, >>> - [VIDEO_ENHANCEMENT_CLASS] = GEN12_VE_TLB_INV_CR, >>> - [COPY_ENGINE_CLASS] = GEN12_BLT_TLB_INV_CR, >>> - [COMPUTE_CLASS] = GEN12_COMPCTX_TLB_INV_CR, >>> - }; >>> - static const i915_mcr_reg_t xehp_regs[] = { >>> - [RENDER_CLASS] = XEHP_GFX_TLB_INV_CR, >>> - [VIDEO_DECODE_CLASS] = XEHP_VD_TLB_INV_CR, >>> - [VIDEO_ENHANCEMENT_CLASS] = XEHP_VE_TLB_INV_CR, >>> - [COPY_ENGINE_CLASS] = XEHP_BLT_TLB_INV_CR, >>> - [COMPUTE_CLASS] = XEHP_COMPCTX_TLB_INV_CR, >>> - }; >>> struct drm_i915_private *i915 = gt->i915; >>> struct intel_uncore *uncore = gt->uncore; >>> struct intel_engine_cs *engine; >>> intel_engine_mask_t awake, tmp; >>> enum intel_engine_id id; >>> - const i915_reg_t *regs; >>> - unsigned int num = 0; >>> unsigned long flags; >>> - if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) { >>> - regs = NULL; >>> - num = ARRAY_SIZE(xehp_regs); >>> - } else if (GRAPHICS_VER(i915) == 12) { >>> - regs = gen12_regs; >>> - num = ARRAY_SIZE(gen12_regs); >>> - } else if (GRAPHICS_VER(i915) >= 8 && GRAPHICS_VER(i915) <= 11) { >>> - regs = gen8_regs; >>> - num = ARRAY_SIZE(gen8_regs); >>> - } else if (GRAPHICS_VER(i915) < 8) { >>> - return; >>> - } >>> - >>> - if (drm_WARN_ONCE(&i915->drm, !num, >>> - "Platform does not implement TLB invalidation!")) >>> + if (GRAPHICS_VER(i915) < 8) >>> return; >>> intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL); >>> @@ -1114,33 +1105,15 @@ static void mmio_invalidate_full(struct >>> intel_gt *gt) >>> awake = 0; >>> for_each_engine(engine, gt, id) { >>> - struct reg_and_bit rb; >>> + struct reg_and_bits rb = get_tlb_inv_reg_and_bits(engine, >>> true); >> >> Ugh so actually what was a once per invalidation lookup is now >> repeated per engine, times two. I wonder if we can do this better. >> Lets think about it a bit. > > It was always twice, see below. > >> >> Regards, >> >> Tvrtko >> >>> if (!intel_engine_pm_is_awake(engine)) >>> continue; >>> - if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) { >>> - u32 val = BIT(engine->instance); >>> - >>> - if (engine->class == VIDEO_DECODE_CLASS || >>> - engine->class == VIDEO_ENHANCEMENT_CLASS || >>> - engine->class == COMPUTE_CLASS) >>> - val = _MASKED_BIT_ENABLE(val); >>> - intel_gt_mcr_multicast_write_fw(gt, >>> - xehp_regs[engine->class], >>> - val); >>> - } else { >>> - rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num); > > Here is the 2nd call, from old code. > Since there are two separate loops there are two calls, caching call > results would be overkill IMO. > Or I can put back whole logic to mmio_invalidate_full, GEN12 quirk is > needed only in 1st loop (write), the only redundancy will be with GEN8 > quirk, which could be handled with some helper. > Is it worth trying? I guess it is no big gain. Yes it was always twice in get_reg_and_bit but not the whole register table selection. We have some checkes which are per platform, and some which are platform and engine. I propose to keep them split. I made a stab at it like this: diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c index 0377e1b25be9..d907b9005dd6 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.c +++ b/drivers/gpu/drm/i915/gt/intel_gt.c @@ -988,33 +988,50 @@ void intel_gt_info_print(const struct intel_gt_info *info, intel_sseu_dump(&info->sseu, p); } -struct reg_and_bit { +struct inv_reg { union { i915_reg_t reg; i915_mcr_reg_t mcr_reg; }; +}; + +struct reg_and_bit { + struct inv_reg reg; u32 bit; }; static struct reg_and_bit -get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8, - const i915_reg_t *regs, const unsigned int num) +get_reg_and_bit(const struct intel_engine_cs *engine, + const i915_reg_t *regs, const unsigned int num, + bool write) { + struct drm_i915_private *i915 = engine->i915; const unsigned int class = engine->class; struct reg_and_bit rb = { }; + BUILD_BUG_ON(sizeof(rb.reg.reg) != sizeof(rb.reg.mcr_reg)); + BUILD_BUG_ON(!__builtin_types_compatible_p(typeof(rb.reg.reg.reg), + typeof(rb.reg.mcr_reg.reg))); + if (drm_WARN_ON_ONCE(&engine->i915->drm, class >= num || !regs[class].reg)) return rb; - rb.reg = regs[class]; - if (gen8 && class == VIDEO_DECODE_CLASS) - rb.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */ + rb.reg.reg = regs[class]; + + if (GRAPHICS_VER(i915) < 12 && class == VIDEO_DECODE_CLASS) + rb.reg.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */ else rb.bit = engine->instance; rb.bit = BIT(rb.bit); + if (write && GRAPHICS_VER(i915) >= 12 && + (engine->class == VIDEO_DECODE_CLASS || + engine->class == VIDEO_ENHANCEMENT_CLASS || + engine->class == COMPUTE_CLASS)) + rb.bit = _MASKED_BIT_ENABLE(rb.bit); + return rb; } @@ -1031,14 +1048,16 @@ get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8, * but are now considered MCR registers. Since they exist within a GAM range, * the primary instance of the register rolls up the status from each unit. */ -static int wait_for_invalidate(struct intel_gt *gt, struct reg_and_bit rb) +static int +wait_for_invalidate(struct intel_gt *gt, struct reg_and_bit rb, bool mcr) { - if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 50)) - return intel_gt_mcr_wait_for_reg(gt, rb.mcr_reg, rb.bit, 0, + if (mcr) + return intel_gt_mcr_wait_for_reg(gt, rb.reg.mcr_reg, rb.bit, 0, TLB_INVAL_TIMEOUT_US, TLB_INVAL_TIMEOUT_MS); else - return __intel_wait_for_register_fw(gt->uncore, rb.reg, rb.bit, 0, + return __intel_wait_for_register_fw(gt->uncore, + rb.reg.reg, rb.bit, 0, TLB_INVAL_TIMEOUT_US, TLB_INVAL_TIMEOUT_MS, NULL); @@ -1068,6 +1087,7 @@ static void mmio_invalidate_full(struct intel_gt *gt) }; struct drm_i915_private *i915 = gt->i915; struct intel_uncore *uncore = gt->uncore; + const bool mcr = GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50); struct intel_engine_cs *engine; intel_engine_mask_t awake, tmp; enum intel_engine_id id; @@ -1076,7 +1096,7 @@ static void mmio_invalidate_full(struct intel_gt *gt) unsigned long flags; if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) { - regs = NULL; + regs = (i915_reg_t *)xehp_regs; num = ARRAY_SIZE(xehp_regs); } else if (GRAPHICS_VER(i915) == 12) { regs = gen12_regs; @@ -1104,28 +1124,15 @@ static void mmio_invalidate_full(struct intel_gt *gt) if (!intel_engine_pm_is_awake(engine)) continue; - if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) { - u32 val = BIT(engine->instance); - - if (engine->class == VIDEO_DECODE_CLASS || - engine->class == VIDEO_ENHANCEMENT_CLASS || - engine->class == COMPUTE_CLASS) - val = _MASKED_BIT_ENABLE(val); - intel_gt_mcr_multicast_write_fw(gt, - xehp_regs[engine->class], - val); - } else { - rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num); - if (!i915_mmio_reg_offset(rb.reg)) - continue; - - if (GRAPHICS_VER(i915) == 12 && (engine->class == VIDEO_DECODE_CLASS || - engine->class == VIDEO_ENHANCEMENT_CLASS || - engine->class == COMPUTE_CLASS)) - rb.bit = _MASKED_BIT_ENABLE(rb.bit); - - intel_uncore_write_fw(uncore, rb.reg, rb.bit); - } + rb = get_reg_and_bit(engine, regs, num, true); + if (!i915_mmio_reg_offset(rb.reg.reg)) + continue; + + if (mcr) + intel_gt_mcr_multicast_write_fw(gt, rb.reg.mcr_reg, + rb.bit); + else + intel_uncore_write_fw(uncore, rb.reg.reg, rb.bit); awake |= engine->mask; } @@ -1144,16 +1151,10 @@ static void mmio_invalidate_full(struct intel_gt *gt) intel_gt_mcr_unlock(gt, flags); for_each_engine_masked(engine, gt, awake, tmp) { - struct reg_and_bit rb; - - if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) { - rb.mcr_reg = xehp_regs[engine->class]; - rb.bit = BIT(engine->instance); - } else { - rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num); - } + struct reg_and_bit rb = + get_reg_and_bit(engine, regs, num, false); - if (wait_for_invalidate(gt, rb)) + if (wait_for_invalidate(gt, rb, mcr)) drm_err_ratelimited(>->i915->drm, "%s TLB invalidation did not complete in %ums!\n", engine->name, TLB_INVAL_TIMEOUT_MS); So only questions which vary per engine are asked in the engine loops. A bit hacky with asserts i915_reg_t and i915_mcr_reg_t are the same underlying type really but may be passable. See what you think. Regards, Tvrtko
On 09/12/2022 12:04, Tvrtko Ursulin wrote: > > On 09/12/2022 11:33, Andrzej Hajda wrote: >> >> >> On 09.12.2022 11:16, Tvrtko Ursulin wrote: >>> >>> On 07/12/2022 17:36, Andrzej Hajda wrote: >>>> Whole register/bit selection logic has been moved to separate helper. >>> >>> Why is missing. >> >> ...to clean up mmio_invalidate_full function. >> >> Will add. >> >>> >>>> Signed-off-by: Andrzej Hajda <andrzej.hajda@intel.com> >>>> --- >>>> drivers/gpu/drm/i915/gt/intel_gt.c | 136 >>>> +++++++++++------------------ >>>> 1 file changed, 51 insertions(+), 85 deletions(-) >>> >>> Diffstat suggests because more streamlined code. Any other reason? >>> >>>> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c >>>> b/drivers/gpu/drm/i915/gt/intel_gt.c >>>> index f0224b607aa4a7..05520ec3264db8 100644 >>>> --- a/drivers/gpu/drm/i915/gt/intel_gt.c >>>> +++ b/drivers/gpu/drm/i915/gt/intel_gt.c >>>> @@ -1003,32 +1003,59 @@ void intel_gt_info_print(const struct >>>> intel_gt_info *info, >>>> intel_sseu_dump(&info->sseu, p); >>>> } >>>> -struct reg_and_bit { >>>> +struct reg_and_bits { >>>> union { >>>> i915_reg_t reg; >>>> i915_mcr_reg_t mcr_reg; >>>> }; >>>> - u32 bit; >>>> + u32 bits; >>>> }; >>>> -static struct reg_and_bit >>>> -get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8, >>>> - const i915_reg_t *regs, const unsigned int num) >>>> +static struct reg_and_bits >>>> +get_tlb_inv_reg_and_bits(const struct intel_engine_cs *engine, bool >>>> write) >>>> { >>>> + static const i915_reg_t gen8_regs[MAX_ENGINE_CLASS + 1] = { >>>> + [RENDER_CLASS] = GEN8_RTCR, >>>> + [VIDEO_DECODE_CLASS] = GEN8_M1TCR, >>>> + [VIDEO_ENHANCEMENT_CLASS] = GEN8_VTCR, >>>> + [COPY_ENGINE_CLASS] = GEN8_BTCR, >>>> + }; >>>> + static const i915_reg_t gen12_regs[MAX_ENGINE_CLASS + 1] = { >>>> + [RENDER_CLASS] = GEN12_GFX_TLB_INV_CR, >>>> + [VIDEO_DECODE_CLASS] = GEN12_VD_TLB_INV_CR, >>>> + [VIDEO_ENHANCEMENT_CLASS] = GEN12_VE_TLB_INV_CR, >>>> + [COPY_ENGINE_CLASS] = GEN12_BLT_TLB_INV_CR, >>>> + [COMPUTE_CLASS] = GEN12_COMPCTX_TLB_INV_CR, >>>> + }; >>>> + static const i915_mcr_reg_t xehp_regs[MAX_ENGINE_CLASS + 1] = { >>>> + [RENDER_CLASS] = XEHP_GFX_TLB_INV_CR, >>>> + [VIDEO_DECODE_CLASS] = XEHP_VD_TLB_INV_CR, >>>> + [VIDEO_ENHANCEMENT_CLASS] = XEHP_VE_TLB_INV_CR, >>>> + [COPY_ENGINE_CLASS] = XEHP_BLT_TLB_INV_CR, >>>> + [COMPUTE_CLASS] = XEHP_COMPCTX_TLB_INV_CR, >>>> + }; >>>> const unsigned int class = engine->class; >>>> - struct reg_and_bit rb = { }; >>>> + struct reg_and_bits rb = { .bits = BIT(engine->instance) }; >>>> - if (drm_WARN_ON_ONCE(&engine->i915->drm, >>>> - class >= num || !regs[class].reg)) >>>> + if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50)) >>>> + rb.mcr_reg = xehp_regs[class]; >>>> + else if (GRAPHICS_VER(engine->i915) >= 12) >>>> + rb.reg = gen12_regs[class]; >>>> + else if (GRAPHICS_VER(engine->i915) >= 8) >>>> + rb.reg = gen8_regs[class]; >>>> + >>>> + if (drm_WARN_ON_ONCE(&engine->i915->drm, >>>> !i915_mmio_reg_offset(rb.reg))) >>> >>> I'd prefer user readable message was kept but not a blocker. >> >> Tried to avoid changes in refactoring, will change. >> >>> >>>> return rb; >>>> - rb.reg = regs[class]; >>>> - if (gen8 && class == VIDEO_DECODE_CLASS) >>>> - rb.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */ >>>> - else >>>> - rb.bit = engine->instance; >>>> + if (GRAPHICS_VER(engine->i915) < 12 && class == >>>> VIDEO_DECODE_CLASS) { >>>> + rb.bits = 1; >>>> + rb.reg.reg += 4 * engine->instance; >>> >>> No reason to drop the comment IMO. It explains things somewhat, or at >>> least provides a hint. >> >> OK >> >>> >>>> + } >>>> - rb.bit = BIT(rb.bit); >>>> + if (write && GRAPHICS_VER(engine->i915) >= 12 && >>>> + (class == VIDEO_DECODE_CLASS || class == >>>> VIDEO_ENHANCEMENT_CLASS || >>>> + class == COMPUTE_CLASS)) >>>> + rb.bits = _MASKED_BIT_ENABLE(rb.bits); >>> >>> This could be else if to not have < 12 followed by explicit >= 12, >>> but perhaps it is clearer like this, to signify it's two completely >>> separate quirks. >>> >>> Also, I would perhaps consider having a local i915 since there's a >>> good number of engine->i915, but it's up to you what looks nicer. >> >> OK >> >>> >>>> return rb; >>>> } >>>> @@ -1046,14 +1073,14 @@ get_reg_and_bit(const struct intel_engine_cs >>>> *engine, const bool gen8, >>>> * but are now considered MCR registers. Since they exist within >>>> a GAM range, >>>> * the primary instance of the register rolls up the status from >>>> each unit. >>>> */ >>>> -static int wait_for_invalidate(struct intel_gt *gt, struct >>>> reg_and_bit rb) >>>> +static int wait_for_invalidate(struct intel_gt *gt, struct >>>> reg_and_bits rb) >>>> { >>>> if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 50)) >>>> - return intel_gt_mcr_wait_for_reg(gt, rb.mcr_reg, rb.bit, 0, >>>> + return intel_gt_mcr_wait_for_reg(gt, rb.mcr_reg, rb.bits, 0, >>>> TLB_INVAL_TIMEOUT_US, >>>> TLB_INVAL_TIMEOUT_MS); >>>> else >>>> - return __intel_wait_for_register_fw(gt->uncore, rb.reg, >>>> rb.bit, 0, >>>> + return __intel_wait_for_register_fw(gt->uncore, rb.reg, >>>> rb.bits, 0, >>>> TLB_INVAL_TIMEOUT_US, >>>> TLB_INVAL_TIMEOUT_MS, >>>> NULL); >>>> @@ -1061,50 +1088,14 @@ static int wait_for_invalidate(struct >>>> intel_gt *gt, struct reg_and_bit rb) >>>> static void mmio_invalidate_full(struct intel_gt *gt) >>>> { >>>> - static const i915_reg_t gen8_regs[] = { >>>> - [RENDER_CLASS] = GEN8_RTCR, >>>> - [VIDEO_DECODE_CLASS] = GEN8_M1TCR, /* , GEN8_M2TCR */ >>>> - [VIDEO_ENHANCEMENT_CLASS] = GEN8_VTCR, >>>> - [COPY_ENGINE_CLASS] = GEN8_BTCR, >>>> - }; >>>> - static const i915_reg_t gen12_regs[] = { >>>> - [RENDER_CLASS] = GEN12_GFX_TLB_INV_CR, >>>> - [VIDEO_DECODE_CLASS] = GEN12_VD_TLB_INV_CR, >>>> - [VIDEO_ENHANCEMENT_CLASS] = GEN12_VE_TLB_INV_CR, >>>> - [COPY_ENGINE_CLASS] = GEN12_BLT_TLB_INV_CR, >>>> - [COMPUTE_CLASS] = GEN12_COMPCTX_TLB_INV_CR, >>>> - }; >>>> - static const i915_mcr_reg_t xehp_regs[] = { >>>> - [RENDER_CLASS] = XEHP_GFX_TLB_INV_CR, >>>> - [VIDEO_DECODE_CLASS] = XEHP_VD_TLB_INV_CR, >>>> - [VIDEO_ENHANCEMENT_CLASS] = XEHP_VE_TLB_INV_CR, >>>> - [COPY_ENGINE_CLASS] = XEHP_BLT_TLB_INV_CR, >>>> - [COMPUTE_CLASS] = XEHP_COMPCTX_TLB_INV_CR, >>>> - }; >>>> struct drm_i915_private *i915 = gt->i915; >>>> struct intel_uncore *uncore = gt->uncore; >>>> struct intel_engine_cs *engine; >>>> intel_engine_mask_t awake, tmp; >>>> enum intel_engine_id id; >>>> - const i915_reg_t *regs; >>>> - unsigned int num = 0; >>>> unsigned long flags; >>>> - if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) { >>>> - regs = NULL; >>>> - num = ARRAY_SIZE(xehp_regs); >>>> - } else if (GRAPHICS_VER(i915) == 12) { >>>> - regs = gen12_regs; >>>> - num = ARRAY_SIZE(gen12_regs); >>>> - } else if (GRAPHICS_VER(i915) >= 8 && GRAPHICS_VER(i915) <= 11) { >>>> - regs = gen8_regs; >>>> - num = ARRAY_SIZE(gen8_regs); >>>> - } else if (GRAPHICS_VER(i915) < 8) { >>>> - return; >>>> - } >>>> - >>>> - if (drm_WARN_ONCE(&i915->drm, !num, >>>> - "Platform does not implement TLB invalidation!")) >>>> + if (GRAPHICS_VER(i915) < 8) >>>> return; >>>> intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL); >>>> @@ -1114,33 +1105,15 @@ static void mmio_invalidate_full(struct >>>> intel_gt *gt) >>>> awake = 0; >>>> for_each_engine(engine, gt, id) { >>>> - struct reg_and_bit rb; >>>> + struct reg_and_bits rb = get_tlb_inv_reg_and_bits(engine, >>>> true); >>> >>> Ugh so actually what was a once per invalidation lookup is now >>> repeated per engine, times two. I wonder if we can do this better. >>> Lets think about it a bit. >> >> It was always twice, see below. >> >>> >>> Regards, >>> >>> Tvrtko >>> >>>> if (!intel_engine_pm_is_awake(engine)) >>>> continue; >>>> - if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) { >>>> - u32 val = BIT(engine->instance); >>>> - >>>> - if (engine->class == VIDEO_DECODE_CLASS || >>>> - engine->class == VIDEO_ENHANCEMENT_CLASS || >>>> - engine->class == COMPUTE_CLASS) >>>> - val = _MASKED_BIT_ENABLE(val); >>>> - intel_gt_mcr_multicast_write_fw(gt, >>>> - xehp_regs[engine->class], >>>> - val); >>>> - } else { >>>> - rb = get_reg_and_bit(engine, regs == gen8_regs, regs, >>>> num); >> >> Here is the 2nd call, from old code. >> Since there are two separate loops there are two calls, caching call >> results would be overkill IMO. >> Or I can put back whole logic to mmio_invalidate_full, GEN12 quirk is >> needed only in 1st loop (write), the only redundancy will be with GEN8 >> quirk, which could be handled with some helper. >> Is it worth trying? I guess it is no big gain. > > Yes it was always twice in get_reg_and_bit but not the whole register > table selection. > > We have some checkes which are per platform, and some which are platform > and engine. I propose to keep them split. I made a stab at it like this: > > diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c > b/drivers/gpu/drm/i915/gt/intel_gt.c > index 0377e1b25be9..d907b9005dd6 100644 > --- a/drivers/gpu/drm/i915/gt/intel_gt.c > +++ b/drivers/gpu/drm/i915/gt/intel_gt.c > @@ -988,33 +988,50 @@ void intel_gt_info_print(const struct > intel_gt_info *info, > intel_sseu_dump(&info->sseu, p); > } > > -struct reg_and_bit { > +struct inv_reg { > union { > i915_reg_t reg; > i915_mcr_reg_t mcr_reg; > }; > +}; > + > +struct reg_and_bit { > + struct inv_reg reg; > u32 bit; > }; > > static struct reg_and_bit > -get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8, > - const i915_reg_t *regs, const unsigned int num) > +get_reg_and_bit(const struct intel_engine_cs *engine, > + const i915_reg_t *regs, const unsigned int num, > + bool write) > { > + struct drm_i915_private *i915 = engine->i915; > const unsigned int class = engine->class; > struct reg_and_bit rb = { }; > > + BUILD_BUG_ON(sizeof(rb.reg.reg) != sizeof(rb.reg.mcr_reg)); > + BUILD_BUG_ON(!__builtin_types_compatible_p(typeof(rb.reg.reg.reg), > + > typeof(rb.reg.mcr_reg.reg))); > + > if (drm_WARN_ON_ONCE(&engine->i915->drm, > class >= num || !regs[class].reg)) > return rb; > > - rb.reg = regs[class]; > - if (gen8 && class == VIDEO_DECODE_CLASS) > - rb.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */ > + rb.reg.reg = regs[class]; > + > + if (GRAPHICS_VER(i915) < 12 && class == VIDEO_DECODE_CLASS) > + rb.reg.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */ > else > rb.bit = engine->instance; > > rb.bit = BIT(rb.bit); > > + if (write && GRAPHICS_VER(i915) >= 12 && > + (engine->class == VIDEO_DECODE_CLASS || > + engine->class == VIDEO_ENHANCEMENT_CLASS || > + engine->class == COMPUTE_CLASS)) > + rb.bit = _MASKED_BIT_ENABLE(rb.bit); > + > return rb; > } > > @@ -1031,14 +1048,16 @@ get_reg_and_bit(const struct intel_engine_cs > *engine, const bool gen8, > * but are now considered MCR registers. Since they exist within a > GAM range, > * the primary instance of the register rolls up the status from each > unit. > */ > -static int wait_for_invalidate(struct intel_gt *gt, struct reg_and_bit rb) > +static int > +wait_for_invalidate(struct intel_gt *gt, struct reg_and_bit rb, bool mcr) > { > - if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 50)) > - return intel_gt_mcr_wait_for_reg(gt, rb.mcr_reg, rb.bit, 0, > + if (mcr) > + return intel_gt_mcr_wait_for_reg(gt, rb.reg.mcr_reg, > rb.bit, 0, > TLB_INVAL_TIMEOUT_US, > TLB_INVAL_TIMEOUT_MS); > else > - return __intel_wait_for_register_fw(gt->uncore, rb.reg, > rb.bit, 0, > + return __intel_wait_for_register_fw(gt->uncore, > + rb.reg.reg, rb.bit, 0, > TLB_INVAL_TIMEOUT_US, > TLB_INVAL_TIMEOUT_MS, > NULL); > @@ -1068,6 +1087,7 @@ static void mmio_invalidate_full(struct intel_gt *gt) > }; > struct drm_i915_private *i915 = gt->i915; > struct intel_uncore *uncore = gt->uncore; > + const bool mcr = GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50); > struct intel_engine_cs *engine; > intel_engine_mask_t awake, tmp; > enum intel_engine_id id; > @@ -1076,7 +1096,7 @@ static void mmio_invalidate_full(struct intel_gt *gt) > unsigned long flags; > > if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) { > - regs = NULL; > + regs = (i915_reg_t *)xehp_regs; > num = ARRAY_SIZE(xehp_regs); > } else if (GRAPHICS_VER(i915) == 12) { > regs = gen12_regs; > @@ -1104,28 +1124,15 @@ static void mmio_invalidate_full(struct intel_gt > *gt) > if (!intel_engine_pm_is_awake(engine)) > continue; > > - if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) { > - u32 val = BIT(engine->instance); > - > - if (engine->class == VIDEO_DECODE_CLASS || > - engine->class == VIDEO_ENHANCEMENT_CLASS || > - engine->class == COMPUTE_CLASS) > - val = _MASKED_BIT_ENABLE(val); > - intel_gt_mcr_multicast_write_fw(gt, > - > xehp_regs[engine->class], > - val); > - } else { > - rb = get_reg_and_bit(engine, regs == gen8_regs, > regs, num); > - if (!i915_mmio_reg_offset(rb.reg)) > - continue; > - > - if (GRAPHICS_VER(i915) == 12 && (engine->class > == VIDEO_DECODE_CLASS || > - engine->class == VIDEO_ENHANCEMENT_CLASS || > - engine->class == COMPUTE_CLASS)) > - rb.bit = _MASKED_BIT_ENABLE(rb.bit); > - > - intel_uncore_write_fw(uncore, rb.reg, rb.bit); > - } > + rb = get_reg_and_bit(engine, regs, num, true); > + if (!i915_mmio_reg_offset(rb.reg.reg)) > + continue; > + > + if (mcr) > + intel_gt_mcr_multicast_write_fw(gt, rb.reg.mcr_reg, > + rb.bit); > + else > + intel_uncore_write_fw(uncore, rb.reg.reg, rb.bit); > awake |= engine->mask; > } > > @@ -1144,16 +1151,10 @@ static void mmio_invalidate_full(struct intel_gt > *gt) > intel_gt_mcr_unlock(gt, flags); > > for_each_engine_masked(engine, gt, awake, tmp) { > - struct reg_and_bit rb; > - > - if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) { > - rb.mcr_reg = xehp_regs[engine->class]; > - rb.bit = BIT(engine->instance); > - } else { > - rb = get_reg_and_bit(engine, regs == gen8_regs, > regs, num); > - } > + struct reg_and_bit rb = > + get_reg_and_bit(engine, regs, num, false); > > - if (wait_for_invalidate(gt, rb)) > + if (wait_for_invalidate(gt, rb, mcr)) > drm_err_ratelimited(>->i915->drm, > "%s TLB invalidation did > not complete in %ums!\n", > engine->name, > TLB_INVAL_TIMEOUT_MS); > > So only questions which vary per engine are asked in the engine loops. > > A bit hacky with asserts i915_reg_t and i915_mcr_reg_t are the same > underlying type really but may be passable. See what you think. Or even store register and values (write/read) in struct intel_engine_cs at engine init time? Regards, Tvrtko
diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c index f0224b607aa4a7..05520ec3264db8 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.c +++ b/drivers/gpu/drm/i915/gt/intel_gt.c @@ -1003,32 +1003,59 @@ void intel_gt_info_print(const struct intel_gt_info *info, intel_sseu_dump(&info->sseu, p); } -struct reg_and_bit { +struct reg_and_bits { union { i915_reg_t reg; i915_mcr_reg_t mcr_reg; }; - u32 bit; + u32 bits; }; -static struct reg_and_bit -get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8, - const i915_reg_t *regs, const unsigned int num) +static struct reg_and_bits +get_tlb_inv_reg_and_bits(const struct intel_engine_cs *engine, bool write) { + static const i915_reg_t gen8_regs[MAX_ENGINE_CLASS + 1] = { + [RENDER_CLASS] = GEN8_RTCR, + [VIDEO_DECODE_CLASS] = GEN8_M1TCR, + [VIDEO_ENHANCEMENT_CLASS] = GEN8_VTCR, + [COPY_ENGINE_CLASS] = GEN8_BTCR, + }; + static const i915_reg_t gen12_regs[MAX_ENGINE_CLASS + 1] = { + [RENDER_CLASS] = GEN12_GFX_TLB_INV_CR, + [VIDEO_DECODE_CLASS] = GEN12_VD_TLB_INV_CR, + [VIDEO_ENHANCEMENT_CLASS] = GEN12_VE_TLB_INV_CR, + [COPY_ENGINE_CLASS] = GEN12_BLT_TLB_INV_CR, + [COMPUTE_CLASS] = GEN12_COMPCTX_TLB_INV_CR, + }; + static const i915_mcr_reg_t xehp_regs[MAX_ENGINE_CLASS + 1] = { + [RENDER_CLASS] = XEHP_GFX_TLB_INV_CR, + [VIDEO_DECODE_CLASS] = XEHP_VD_TLB_INV_CR, + [VIDEO_ENHANCEMENT_CLASS] = XEHP_VE_TLB_INV_CR, + [COPY_ENGINE_CLASS] = XEHP_BLT_TLB_INV_CR, + [COMPUTE_CLASS] = XEHP_COMPCTX_TLB_INV_CR, + }; const unsigned int class = engine->class; - struct reg_and_bit rb = { }; + struct reg_and_bits rb = { .bits = BIT(engine->instance) }; - if (drm_WARN_ON_ONCE(&engine->i915->drm, - class >= num || !regs[class].reg)) + if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50)) + rb.mcr_reg = xehp_regs[class]; + else if (GRAPHICS_VER(engine->i915) >= 12) + rb.reg = gen12_regs[class]; + else if (GRAPHICS_VER(engine->i915) >= 8) + rb.reg = gen8_regs[class]; + + if (drm_WARN_ON_ONCE(&engine->i915->drm, !i915_mmio_reg_offset(rb.reg))) return rb; - rb.reg = regs[class]; - if (gen8 && class == VIDEO_DECODE_CLASS) - rb.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */ - else - rb.bit = engine->instance; + if (GRAPHICS_VER(engine->i915) < 12 && class == VIDEO_DECODE_CLASS) { + rb.bits = 1; + rb.reg.reg += 4 * engine->instance; + } - rb.bit = BIT(rb.bit); + if (write && GRAPHICS_VER(engine->i915) >= 12 && + (class == VIDEO_DECODE_CLASS || class == VIDEO_ENHANCEMENT_CLASS || + class == COMPUTE_CLASS)) + rb.bits = _MASKED_BIT_ENABLE(rb.bits); return rb; } @@ -1046,14 +1073,14 @@ get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8, * but are now considered MCR registers. Since they exist within a GAM range, * the primary instance of the register rolls up the status from each unit. */ -static int wait_for_invalidate(struct intel_gt *gt, struct reg_and_bit rb) +static int wait_for_invalidate(struct intel_gt *gt, struct reg_and_bits rb) { if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 50)) - return intel_gt_mcr_wait_for_reg(gt, rb.mcr_reg, rb.bit, 0, + return intel_gt_mcr_wait_for_reg(gt, rb.mcr_reg, rb.bits, 0, TLB_INVAL_TIMEOUT_US, TLB_INVAL_TIMEOUT_MS); else - return __intel_wait_for_register_fw(gt->uncore, rb.reg, rb.bit, 0, + return __intel_wait_for_register_fw(gt->uncore, rb.reg, rb.bits, 0, TLB_INVAL_TIMEOUT_US, TLB_INVAL_TIMEOUT_MS, NULL); @@ -1061,50 +1088,14 @@ static int wait_for_invalidate(struct intel_gt *gt, struct reg_and_bit rb) static void mmio_invalidate_full(struct intel_gt *gt) { - static const i915_reg_t gen8_regs[] = { - [RENDER_CLASS] = GEN8_RTCR, - [VIDEO_DECODE_CLASS] = GEN8_M1TCR, /* , GEN8_M2TCR */ - [VIDEO_ENHANCEMENT_CLASS] = GEN8_VTCR, - [COPY_ENGINE_CLASS] = GEN8_BTCR, - }; - static const i915_reg_t gen12_regs[] = { - [RENDER_CLASS] = GEN12_GFX_TLB_INV_CR, - [VIDEO_DECODE_CLASS] = GEN12_VD_TLB_INV_CR, - [VIDEO_ENHANCEMENT_CLASS] = GEN12_VE_TLB_INV_CR, - [COPY_ENGINE_CLASS] = GEN12_BLT_TLB_INV_CR, - [COMPUTE_CLASS] = GEN12_COMPCTX_TLB_INV_CR, - }; - static const i915_mcr_reg_t xehp_regs[] = { - [RENDER_CLASS] = XEHP_GFX_TLB_INV_CR, - [VIDEO_DECODE_CLASS] = XEHP_VD_TLB_INV_CR, - [VIDEO_ENHANCEMENT_CLASS] = XEHP_VE_TLB_INV_CR, - [COPY_ENGINE_CLASS] = XEHP_BLT_TLB_INV_CR, - [COMPUTE_CLASS] = XEHP_COMPCTX_TLB_INV_CR, - }; struct drm_i915_private *i915 = gt->i915; struct intel_uncore *uncore = gt->uncore; struct intel_engine_cs *engine; intel_engine_mask_t awake, tmp; enum intel_engine_id id; - const i915_reg_t *regs; - unsigned int num = 0; unsigned long flags; - if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) { - regs = NULL; - num = ARRAY_SIZE(xehp_regs); - } else if (GRAPHICS_VER(i915) == 12) { - regs = gen12_regs; - num = ARRAY_SIZE(gen12_regs); - } else if (GRAPHICS_VER(i915) >= 8 && GRAPHICS_VER(i915) <= 11) { - regs = gen8_regs; - num = ARRAY_SIZE(gen8_regs); - } else if (GRAPHICS_VER(i915) < 8) { - return; - } - - if (drm_WARN_ONCE(&i915->drm, !num, - "Platform does not implement TLB invalidation!")) + if (GRAPHICS_VER(i915) < 8) return; intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL); @@ -1114,33 +1105,15 @@ static void mmio_invalidate_full(struct intel_gt *gt) awake = 0; for_each_engine(engine, gt, id) { - struct reg_and_bit rb; + struct reg_and_bits rb = get_tlb_inv_reg_and_bits(engine, true); if (!intel_engine_pm_is_awake(engine)) continue; - if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) { - u32 val = BIT(engine->instance); - - if (engine->class == VIDEO_DECODE_CLASS || - engine->class == VIDEO_ENHANCEMENT_CLASS || - engine->class == COMPUTE_CLASS) - val = _MASKED_BIT_ENABLE(val); - intel_gt_mcr_multicast_write_fw(gt, - xehp_regs[engine->class], - val); - } else { - rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num); - if (!i915_mmio_reg_offset(rb.reg)) - continue; - - if (GRAPHICS_VER(i915) == 12 && (engine->class == VIDEO_DECODE_CLASS || - engine->class == VIDEO_ENHANCEMENT_CLASS || - engine->class == COMPUTE_CLASS)) - rb.bit = _MASKED_BIT_ENABLE(rb.bit); - - intel_uncore_write_fw(uncore, rb.reg, rb.bit); - } + if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) + intel_gt_mcr_multicast_write_fw(gt, rb.mcr_reg, rb.bits); + else + intel_uncore_write_fw(uncore, rb.reg, rb.bits); awake |= engine->mask; } @@ -1159,14 +1132,7 @@ static void mmio_invalidate_full(struct intel_gt *gt) intel_gt_mcr_unlock(gt, flags); for_each_engine_masked(engine, gt, awake, tmp) { - struct reg_and_bit rb; - - if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) { - rb.mcr_reg = xehp_regs[engine->class]; - rb.bit = BIT(engine->instance); - } else { - rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num); - } + struct reg_and_bits rb = get_tlb_inv_reg_and_bits(engine, false); if (wait_for_invalidate(gt, rb)) drm_err_ratelimited(>->i915->drm,
Whole register/bit selection logic has been moved to separate helper. Signed-off-by: Andrzej Hajda <andrzej.hajda@intel.com> --- drivers/gpu/drm/i915/gt/intel_gt.c | 136 +++++++++++------------------ 1 file changed, 51 insertions(+), 85 deletions(-)