diff mbox series

[v2] drm/i915: Make IRQ reset and postinstall multi-gt aware

Message ID 20230413092006.931861-1-andi.shyti@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series [v2] drm/i915: Make IRQ reset and postinstall multi-gt aware | expand

Commit Message

Andi Shyti April 13, 2023, 9:20 a.m. UTC
From: Paulo Zanoni <paulo.r.zanoni@intel.com>

In multitile systems, IRQs need to be reset and enabled per GT.

Although in MTL the GUnit misc interrupt register set is
available only in GT-0, we need to loop through all the GTs
in order to initialize the media engine, which lies on a different
GT.

Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
---
Hi,

proposing this patch again; apparently GuC needs it to
initialize the media GT.

Andi

Changelog
=========
v1 -> v2
 - improve description in the commit log.

 drivers/gpu/drm/i915/i915_irq.c | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

Comments

Tvrtko Ursulin April 13, 2023, 10:41 a.m. UTC | #1
On 13/04/2023 10:20, Andi Shyti wrote:
> From: Paulo Zanoni <paulo.r.zanoni@intel.com>
> 
> In multitile systems IRQ need to be reset and enabled per GT.
> 
> Although in MTL the GUnit misc interrupts register set are
> available only in GT-0, we need to loop through all the GT's
> in order to initialize the media engine which lies on a different
> GT.
> 
> Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
> ---
> Hi,
> 
> proposing again this patch, apparently GuC needs this patch to
> initialize the media GT.

What is the resolution for Matt's concern that this is wrong for MTL?

Regards,

Tvrtko

> Changelog
> =========
> v1 -> v2
>   - improve description in the commit log.
> 
>   drivers/gpu/drm/i915/i915_irq.c | 28 ++++++++++++++++++----------
>   1 file changed, 18 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index d24bdea65a3dc..524d64bf5d186 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2764,14 +2764,19 @@ static void dg1_irq_reset(struct drm_i915_private *dev_priv)
>   {
>   	struct intel_gt *gt = to_gt(dev_priv);
>   	struct intel_uncore *uncore = gt->uncore;
> +	unsigned int i;
>   
>   	dg1_master_intr_disable(dev_priv->uncore.regs);
>   
> -	gen11_gt_irq_reset(gt);
> -	gen11_display_irq_reset(dev_priv);
> +	for_each_gt(gt, dev_priv, i) {
> +		gen11_gt_irq_reset(gt);
>   
> -	GEN3_IRQ_RESET(uncore, GEN11_GU_MISC_);
> -	GEN3_IRQ_RESET(uncore, GEN8_PCU_);
> +		uncore = gt->uncore;
> +		GEN3_IRQ_RESET(uncore, GEN11_GU_MISC_);
> +		GEN3_IRQ_RESET(uncore, GEN8_PCU_);
> +	}
> +
> +	gen11_display_irq_reset(dev_priv);
>   }
>   
>   void gen8_irq_power_well_post_enable(struct drm_i915_private *dev_priv,
> @@ -3425,13 +3430,16 @@ static void gen11_irq_postinstall(struct drm_i915_private *dev_priv)
>   
>   static void dg1_irq_postinstall(struct drm_i915_private *dev_priv)
>   {
> -	struct intel_gt *gt = to_gt(dev_priv);
> -	struct intel_uncore *uncore = gt->uncore;
>   	u32 gu_misc_masked = GEN11_GU_MISC_GSE;
> +	struct intel_gt *gt;
> +	unsigned int i;
>   
> -	gen11_gt_irq_postinstall(gt);
> +	for_each_gt(gt, dev_priv, i) {
> +		gen11_gt_irq_postinstall(gt);
>   
> -	GEN3_IRQ_INIT(uncore, GEN11_GU_MISC_, ~gu_misc_masked, gu_misc_masked);
> +		GEN3_IRQ_INIT(gt->uncore, GEN11_GU_MISC_, ~gu_misc_masked,
> +			      gu_misc_masked);
> +	}
>   
>   	if (HAS_DISPLAY(dev_priv)) {
>   		icp_irq_postinstall(dev_priv);
> @@ -3440,8 +3448,8 @@ static void dg1_irq_postinstall(struct drm_i915_private *dev_priv)
>   				   GEN11_DISPLAY_IRQ_ENABLE);
>   	}
>   
> -	dg1_master_intr_enable(uncore->regs);
> -	intel_uncore_posting_read(uncore, DG1_MSTR_TILE_INTR);
> +	dg1_master_intr_enable(to_gt(dev_priv)->uncore->regs);
> +	intel_uncore_posting_read(to_gt(dev_priv)->uncore, DG1_MSTR_TILE_INTR);
>   }
>   
>   static void cherryview_irq_postinstall(struct drm_i915_private *dev_priv)
Andi Shyti April 13, 2023, 1:56 p.m. UTC | #2
Hi Tvrtko,

(I forgot to CC Daniele)

On Thu, Apr 13, 2023 at 11:41:28AM +0100, Tvrtko Ursulin wrote:
> 
> On 13/04/2023 10:20, Andi Shyti wrote:
> > From: Paulo Zanoni <paulo.r.zanoni@intel.com>
> > 
> > In multitile systems IRQ need to be reset and enabled per GT.
> > 
> > Although in MTL the GUnit misc interrupts register set are
> > available only in GT-0, we need to loop through all the GT's
> > in order to initialize the media engine which lies on a different
> > GT.
> > 
> > Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
> > ---
> > Hi,
> > 
> > proposing again this patch, apparently GuC needs this patch to
> > initialize the media GT.
> 
> What is the resolution for Matt's concern that this is wrong for MTL?

There are two explanations, one easy and one less easy.

The easy one: without this patch i915 doesn't boot on MTL!(*)

The second explanation is that in MTL the media engine has its
own set of misc IRQ registers and those are on a different GT
(Daniele pointed this out).

I sent this patch not to bypass any review, but to restart the
discussion as this patch was just dropped.

Thanks,
Andi


(*)
[drm] *ERROR* GT1: GUC: CT: No response for request 0x550a (fence 7)
[drm] *ERROR* GT1: GUC: CT: Sending action 0x550a failed (-ETIMEDOUT) status=0X0
[drm] *ERROR* GT1: GUC: Failed to enable usage stats: -ETIMEDOUT
[drm] *ERROR* GT1: GuC initialization failed -ETIMEDOUT
[drm] *ERROR* GT1: Enabling uc failed (-5)
[drm] *ERROR* GT1: Failed to initialize GPU, declaring it wedged!
Tvrtko Ursulin April 13, 2023, 2:16 p.m. UTC | #3
On 13/04/2023 14:56, Andi Shyti wrote:
> Hi Tvrtko,
> 
> (I forgot to CC Daniele)
> 
> On Thu, Apr 13, 2023 at 11:41:28AM +0100, Tvrtko Ursulin wrote:
>>
>> On 13/04/2023 10:20, Andi Shyti wrote:
>>> From: Paulo Zanoni <paulo.r.zanoni@intel.com>
>>>
>>> In multitile systems IRQ need to be reset and enabled per GT.
>>>
>>> Although in MTL the GUnit misc interrupts register set are
>>> available only in GT-0, we need to loop through all the GT's
>>> in order to initialize the media engine which lies on a different
>>> GT.
>>>
>>> Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>> Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
>>> ---
>>> Hi,
>>>
>>> proposing again this patch, apparently GuC needs this patch to
>>> initialize the media GT.
>>
>> What is the resolution for Matt's concern that this is wrong for MTL?
> 
> There are two explanations, one easy and one less easy.
> 
> The easy one: without this patch i915 doesn't boot on MTL!(*)
> 
> The second explanation is that in MTL the media engine has it's
> own set of misc irq's registers and those are on a different GT
> (Daniele pointed this out).
> 
> I sent this patch not to bypass any review, but to restart the
> discussion as this patch was just dropped.

I see. It does not sound too challenging to handle with a little bit of
refactoring. Move the engine register writes to a helper and add an
MTL-specific reset/postinstall? Then MTL can do the engine ones outside the
for_each_gt loop and the replicated ones under it. Give or take, I did
not look into the details.

Regards,

Tvrtko
Matt Roper April 13, 2023, 3:52 p.m. UTC | #4
On Thu, Apr 13, 2023 at 03:56:21PM +0200, Andi Shyti wrote:
> Hi Tvrtko,
> 
> (I forgot to CC Daniele)
> 
> On Thu, Apr 13, 2023 at 11:41:28AM +0100, Tvrtko Ursulin wrote:
> > 
> > On 13/04/2023 10:20, Andi Shyti wrote:
> > > From: Paulo Zanoni <paulo.r.zanoni@intel.com>
> > > 
> > > In multitile systems IRQ need to be reset and enabled per GT.
> > > 
> > > Although in MTL the GUnit misc interrupts register set are
> > > available only in GT-0, we need to loop through all the GT's
> > > in order to initialize the media engine which lies on a different
> > > GT.
> > > 
> > > Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
> > > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > > Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
> > > ---
> > > Hi,
> > > 
> > > proposing again this patch, apparently GuC needs this patch to
> > > initialize the media GT.
> > 
> > What is the resolution for Matt's concern that this is wrong for MTL?
> 
> There are two explanations, one easy and one less easy.
> 
> The easy one: without this patch i915 doesn't boot on MTL!(*)
> 
> The second explanation is that in MTL the media engine has it's
> own set of misc irq's registers and those are on a different GT
> (Daniele pointed this out).

Assuming you're talking about MTL_GUC_MGUC_INTR_MASK, that's not true;
it's just a single sgunit register (0x1900e8) that has different
bitfields for the primary GuC and the media GuC.  So I still think we
should avoid looping over GTs; it's actually much simpler to handle
things in a single pass since we can just determine the single register
value once (all fields) and write it directly, instead of doing two
separate RMW updates to the same register to try to avoid clobbering
the other GuC's settings.

For pre-MTL platforms, it's the same register, except that the bitfield
now devoted to the media GuC was previously used for something else
(scatter/gather).


Matt

> 
> I sent this patch not to bypass any review, but to restart the
> discussion as this patch was just dropped.
> 
> Thanks,
> Andi
> 
> 
> (*)
> [drm] *ERROR* GT1: GUC: CT: No response for request 0x550a (fence 7)
> [drm] *ERROR* GT1: GUC: CT: Sending action 0x550a failed (-ETIMEDOUT) status=0X0
> [drm] *ERROR* GT1: GUC: Failed to enable usage stats: -ETIMEDOUT
> [drm] *ERROR* GT1: GuC initialization failed -ETIMEDOUT
> [drm] *ERROR* GT1: Enabling uc failed (-5)
> [drm] *ERROR* GT1: Failed to initialize GPU, declaring it wedged!
Zanoni, Paulo R April 13, 2023, 3:58 p.m. UTC | #5
On Thu, 2023-04-13 at 11:20 +0200, Andi Shyti wrote:
> From: Paulo Zanoni <paulo.r.zanoni@intel.com>
Hi

https://en.wikipedia.org/wiki/Ship_of_Theseus

My original patch was written in 2018. Since then, the implementation
has been rebased and changed multiple times, the commit message has
been changed, the subject line has been changed, yet none of that is
documented in the patch's revision history: it was all removed and it
now looks like I'm the author of the version that was submitted this
month. I never liked this "erase the internal patch's changelog before
submitting it upstream for the first time" process, I think it erases
crucial information and misleads people.

I know I said something different earlier in private, but after further
reflection, I concluded I do not feel comfortable having my name as the
Author or as the Signed-off-by in this patch. Please remove it. You can
add a "Based-on-patch-by: Paulo Zanoni <paulo.r.zanoni@intel.com>" if
you want, but that's not necessary.

This should also help in case some bug is bisected to this patch, then
I won't need to spend time researching who I should forward the emails
to.

Thanks,
Paulo

> 
> In multitile systems IRQ need to be reset and enabled per GT.
> 
> Although in MTL the GUnit misc interrupts register set are
> available only in GT-0, we need to loop through all the GT's
> in order to initialize the media engine which lies on a different
> GT.
> 
> Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
> ---
> Hi,
> 
> proposing again this patch, apparently GuC needs this patch to
> initialize the media GT.
> 
> Andi
> 
> Changelog
> =========
> v1 -> v2
>  - improve description in the commit log.
> 
>  drivers/gpu/drm/i915/i915_irq.c | 28 ++++++++++++++++++----------
>  1 file changed, 18 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index d24bdea65a3dc..524d64bf5d186 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2764,14 +2764,19 @@ static void dg1_irq_reset(struct drm_i915_private *dev_priv)
>  {
>  	struct intel_gt *gt = to_gt(dev_priv);
>  	struct intel_uncore *uncore = gt->uncore;
> +	unsigned int i;
>  
> 
> 
> 
>  	dg1_master_intr_disable(dev_priv->uncore.regs);
>  
> 
> 
> 
> -	gen11_gt_irq_reset(gt);
> -	gen11_display_irq_reset(dev_priv);
> +	for_each_gt(gt, dev_priv, i) {
> +		gen11_gt_irq_reset(gt);
>  
> 
> 
> 
> -	GEN3_IRQ_RESET(uncore, GEN11_GU_MISC_);
> -	GEN3_IRQ_RESET(uncore, GEN8_PCU_);
> +		uncore = gt->uncore;
> +		GEN3_IRQ_RESET(uncore, GEN11_GU_MISC_);
> +		GEN3_IRQ_RESET(uncore, GEN8_PCU_);
> +	}
> +
> +	gen11_display_irq_reset(dev_priv);
>  }
>  
> 
> 
> 
>  void gen8_irq_power_well_post_enable(struct drm_i915_private *dev_priv,
> @@ -3425,13 +3430,16 @@ static void gen11_irq_postinstall(struct drm_i915_private *dev_priv)
>  
> 
> 
> 
>  static void dg1_irq_postinstall(struct drm_i915_private *dev_priv)
>  {
> -	struct intel_gt *gt = to_gt(dev_priv);
> -	struct intel_uncore *uncore = gt->uncore;
>  	u32 gu_misc_masked = GEN11_GU_MISC_GSE;
> +	struct intel_gt *gt;
> +	unsigned int i;
>  
> 
> 
> 
> -	gen11_gt_irq_postinstall(gt);
> +	for_each_gt(gt, dev_priv, i) {
> +		gen11_gt_irq_postinstall(gt);
>  
> 
> 
> 
> -	GEN3_IRQ_INIT(uncore, GEN11_GU_MISC_, ~gu_misc_masked, gu_misc_masked);
> +		GEN3_IRQ_INIT(gt->uncore, GEN11_GU_MISC_, ~gu_misc_masked,
> +			      gu_misc_masked);
> +	}
>  
> 
> 
> 
>  	if (HAS_DISPLAY(dev_priv)) {
>  		icp_irq_postinstall(dev_priv);
> @@ -3440,8 +3448,8 @@ static void dg1_irq_postinstall(struct drm_i915_private *dev_priv)
>  				   GEN11_DISPLAY_IRQ_ENABLE);
>  	}
>  
> 
> 
> 
> -	dg1_master_intr_enable(uncore->regs);
> -	intel_uncore_posting_read(uncore, DG1_MSTR_TILE_INTR);
> +	dg1_master_intr_enable(to_gt(dev_priv)->uncore->regs);
> +	intel_uncore_posting_read(to_gt(dev_priv)->uncore, DG1_MSTR_TILE_INTR);
>  }
>  
> 
> 
> 
>  static void cherryview_irq_postinstall(struct drm_i915_private *dev_priv)
Daniele Ceraolo Spurio April 13, 2023, 4:03 p.m. UTC | #6
On 4/13/2023 8:52 AM, Matt Roper wrote:
> On Thu, Apr 13, 2023 at 03:56:21PM +0200, Andi Shyti wrote:
>> Hi Tvrtko,
>>
>> (I forgot to CC Daniele)
>>
>> On Thu, Apr 13, 2023 at 11:41:28AM +0100, Tvrtko Ursulin wrote:
>>> On 13/04/2023 10:20, Andi Shyti wrote:
>>>> From: Paulo Zanoni <paulo.r.zanoni@intel.com>
>>>>
>>>> In multitile systems IRQ need to be reset and enabled per GT.
>>>>
>>>> Although in MTL the GUnit misc interrupts register set are
>>>> available only in GT-0, we need to loop through all the GT's
>>>> in order to initialize the media engine which lies on a different
>>>> GT.
>>>>
>>>> Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
>>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>> Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
>>>> ---
>>>> Hi,
>>>>
>>>> proposing again this patch, apparently GuC needs this patch to
>>>> initialize the media GT.
>>> What is the resolution for Matt's concern that this is wrong for MTL?
>> There are two explanations, one easy and one less easy.
>>
>> The easy one: without this patch i915 doesn't boot on MTL!(*)
>>
>> The second explanation is that in MTL the media engine has it's
>> own set of misc irq's registers and those are on a different GT
>> (Daniele pointed this out).
> Assuming you're talking about MTL_GUC_MGUC_INTR_MASK, that's not true;
> it's just a single sgunit register (0x1900e8) that has different
> bitfields for the primary GuC and the media GuC.  So I still think we
> should avoid looping over GTs; it's actually much simpler to handle
> things in a single pass since we can just determine the single register
> value once (all fields) and write it directly, instead of doing two
> separate RMW updates to the same register to try to avoid clobbering
> the other GuC's settings.
>
> For pre-MTL platforms, it's the same register, except that the bitfield
> now devoted to the media GuC was previously used for something else
> (scatter/gather).

It's not just the GuC, the VCS/VECS engine programming is also tied to 
the media GT (via the HAS_ENGINE checks). It looks like we 
unconditionally program VCS 0 and 2, so it'll still work for MTL, but if 
we get a device with more VCS engines it'll break. Maybe we can add a 
MTL version of the function that just programs everything 
unconditionally? Going forward it should be ok to program things for 
engines that don't exist, but I'm not sure we can do that for older 
platforms that came before the extra engines were ever defined in HW.

Daniele

>
>
> Matt
>
>> I sent this patch not to bypass any review, but to restart the
>> discussion as this patch was just dropped.
>>
>> Thanks,
>> Andi
>>
>>
>> (*)
>> [drm] *ERROR* GT1: GUC: CT: No response for request 0x550a (fence 7)
>> [drm] *ERROR* GT1: GUC: CT: Sending action 0x550a failed (-ETIMEDOUT) status=0X0
>> [drm] *ERROR* GT1: GUC: Failed to enable usage stats: -ETIMEDOUT
>> [drm] *ERROR* GT1: GuC initialization failed -ETIMEDOUT
>> [drm] *ERROR* GT1: Enabling uc failed (-5)
>> [drm] *ERROR* GT1: Failed to initialize GPU, declaring it wedged!
Andi Shyti April 13, 2023, 4:19 p.m. UTC | #7
On Thu, Apr 13, 2023 at 09:03:29AM -0700, Ceraolo Spurio, Daniele wrote:
> 
> 
> On 4/13/2023 8:52 AM, Matt Roper wrote:
> > On Thu, Apr 13, 2023 at 03:56:21PM +0200, Andi Shyti wrote:
> > > Hi Tvrtko,
> > > 
> > > (I forgot to CC Daniele)
> > > 
> > > On Thu, Apr 13, 2023 at 11:41:28AM +0100, Tvrtko Ursulin wrote:
> > > > On 13/04/2023 10:20, Andi Shyti wrote:
> > > > > From: Paulo Zanoni <paulo.r.zanoni@intel.com>
> > > > > 
> > > > > In multitile systems IRQ need to be reset and enabled per GT.
> > > > > 
> > > > > Although in MTL the GUnit misc interrupts register set are
> > > > > available only in GT-0, we need to loop through all the GT's
> > > > > in order to initialize the media engine which lies on a different
> > > > > GT.
> > > > > 
> > > > > Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
> > > > > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > > > > Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
> > > > > ---
> > > > > Hi,
> > > > > 
> > > > > proposing again this patch, apparently GuC needs this patch to
> > > > > initialize the media GT.
> > > > What is the resolution for Matt's concern that this is wrong for MTL?
> > > There are two explanations, one easy and one less easy.
> > > 
> > > The easy one: without this patch i915 doesn't boot on MTL!(*)
> > > 
> > > The second explanation is that in MTL the media engine has it's
> > > own set of misc irq's registers and those are on a different GT
> > > (Daniele pointed this out).
> > Assuming you're talking about MTL_GUC_MGUC_INTR_MASK, that's not true;
> > it's just a single sgunit register (0x1900e8) that has different
> > bitfields for the primary GuC and the media GuC.  So I still think we
> > should avoid looping over GTs; it's actually much simpler to handle
> > things in a single pass since we can just determine the single register
> > value once (all fields) and write it directly, instead of doing two
> > separate RMW updates to the same register to try to avoid clobbering
> > the other GuC's settings.

if we handle exceptions in a single pass wouldn't we have many
exceptions to handle in the long run?

> > For pre-MTL platforms, it's the same register, except that the bitfield
> > now devoted to the media GuC was previously used for something else
> > (scatter/gather).
> 
> It's not just the GuC, the VCS/VECS engine programming is also tied to the
> media GT (via the HAS_ENGINE checks). It looks like we unconditionally
> program VCS 0 and 2, so it'll still work for MTL, but if we get a device
> with more VCS engines it'll break. Maybe we can add a MTL version of the
> function that just programs everything unconditionally? Going forward it
> should be ok to program things for engines that don't exist, but I'm not
> sure we can do that for older platforms that came before the extra engines
> were ever defined in HW.

This is more or less what Tvrtko has suggested, as well. It looks to
me like replicating some code... anyway, I will try it and see what
it looks like.

Andi

PS Thanks Matt, Daniele and Tvrtko for the feedback.

> Daniele
> 
> > 
> > 
> > Matt
> > 
> > > I sent this patch not to bypass any review, but to restart the
> > > discussion as this patch was just dropped.
> > > 
> > > Thanks,
> > > Andi
> > > 
> > > 
> > > (*)
> > > [drm] *ERROR* GT1: GUC: CT: No response for request 0x550a (fence 7)
> > > [drm] *ERROR* GT1: GUC: CT: Sending action 0x550a failed (-ETIMEDOUT) status=0X0
> > > [drm] *ERROR* GT1: GUC: Failed to enable usage stats: -ETIMEDOUT
> > > [drm] *ERROR* GT1: GuC initialization failed -ETIMEDOUT
> > > [drm] *ERROR* GT1: Enabling uc failed (-5)
> > > [drm] *ERROR* GT1: Failed to initialize GPU, declaring it wedged!
Andi Shyti April 13, 2023, 4:24 p.m. UTC | #8
Hi Paulo,

> https://en.wikipedia.org/wiki/Ship_of_Theseus
> 
> My original patch was written in 2018. Since then, the implementation
> has been rebased and changed multiple times, the commit message has
> been changed, the subject line has been changed, yet none of that is
> documented in the patch's revision history: it was all removed and it
> now looks like I'm the author of the version that was submitted this
> month. I never liked this "erase the internal patch's changelog before
> submitting it upstream for the first time" process, I think it erases
> crucial information and misleads people.
> 
> I know I said something different earlier in private, but after further
> reflection, I concluded I do not feel comfortable having my name as the
> Author or as the Signed-off-by in this patch. Please remove it. You can
> add a "Based-on-patch-by: Paulo Zanoni <paulo.r.zanoni@intel.com>" if
> you want, but that's not necessary.
> 
> This should also help in case some bug is bisected to this patch, then
> I won't need to spend time researching who I should forward the emails
> to.

Sure! When porting and backporting patches, I try to preserve the
original authorship as much as possible.

But, if you feel more comfortable, I can take the authorship myself.

Andi

> Thanks,
> Paulo
> 
> > 
> > In multitile systems IRQ need to be reset and enabled per GT.
> > 
> > Although in MTL the GUnit misc interrupts register set are
> > available only in GT-0, we need to loop through all the GT's
> > in order to initialize the media engine which lies on a different
> > GT.
> > 
> > Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
Matt Roper April 13, 2023, 4:29 p.m. UTC | #9
On Thu, Apr 13, 2023 at 09:03:29AM -0700, Ceraolo Spurio, Daniele wrote:
> 
> 
> On 4/13/2023 8:52 AM, Matt Roper wrote:
> > On Thu, Apr 13, 2023 at 03:56:21PM +0200, Andi Shyti wrote:
> > > Hi Tvrtko,
> > > 
> > > (I forgot to CC Daniele)
> > > 
> > > On Thu, Apr 13, 2023 at 11:41:28AM +0100, Tvrtko Ursulin wrote:
> > > > On 13/04/2023 10:20, Andi Shyti wrote:
> > > > > From: Paulo Zanoni <paulo.r.zanoni@intel.com>
> > > > > 
> > > > > In multitile systems IRQ need to be reset and enabled per GT.
> > > > > 
> > > > > Although in MTL the GUnit misc interrupts register set are
> > > > > available only in GT-0, we need to loop through all the GT's
> > > > > in order to initialize the media engine which lies on a different
> > > > > GT.
> > > > > 
> > > > > Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
> > > > > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > > > > Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
> > > > > ---
> > > > > Hi,
> > > > > 
> > > > > proposing again this patch, apparently GuC needs this patch to
> > > > > initialize the media GT.
> > > > What is the resolution for Matt's concern that this is wrong for MTL?
> > > There are two explanations, one easy and one less easy.
> > > 
> > > The easy one: without this patch i915 doesn't boot on MTL!(*)
> > > 
> > > The second explanation is that in MTL the media engine has it's
> > > own set of misc irq's registers and those are on a different GT
> > > (Daniele pointed this out).
> > Assuming you're talking about MTL_GUC_MGUC_INTR_MASK, that's not true;
> > it's just a single sgunit register (0x1900e8) that has different
> > bitfields for the primary GuC and the media GuC.  So I still think we
> > should avoid looping over GTs; it's actually much simpler to handle
> > things in a single pass since we can just determine the single register
> > value once (all fields) and write it directly, instead of doing two
> > separate RMW updates to the same register to try to avoid clobbering
> > the other GuC's settings.
> > 
> > For pre-MTL platforms, it's the same register, except that the bitfield
> > now devoted to the media GuC was previously used for something else
> > (scatter/gather).
> 
> It's not just the GuC, the VCS/VECS engine programming is also tied to the
> media GT (via the HAS_ENGINE checks). It looks like we unconditionally
> program VCS 0 and 2, so it'll still work for MTL, but if we get a device
> with more VCS engines it'll break. Maybe we can add a MTL version of the
> function that just programs everything unconditionally? Going forward it
> should be ok to program things for engines that don't exist, but I'm not
> sure we can do that for older platforms that came before the extra engines
> were ever defined in HW.

Right, so I think the engine handling is already correct for MTL today;
the main concern would be how it might need to change for other future
platforms if more media engines show back up on a media GT.  I think we
can wait and cross that bridge if/when we get to it.  With focus moving
over to the Xe KMD, we might be on a completely different driver by the
time the hardware adds back in more media engines that aren't already
covered unconditionally.


Matt

> 
> Daniele
> 
> > 
> > 
> > Matt
> > 
> > > I sent this patch not to bypass any review, but to restart the
> > > discussion as this patch was just dropped.
> > > 
> > > Thanks,
> > > Andi
> > > 
> > > 
> > > (*)
> > > [drm] *ERROR* GT1: GUC: CT: No response for request 0x550a (fence 7)
> > > [drm] *ERROR* GT1: GUC: CT: Sending action 0x550a failed (-ETIMEDOUT) status=0X0
> > > [drm] *ERROR* GT1: GUC: Failed to enable usage stats: -ETIMEDOUT
> > > [drm] *ERROR* GT1: GuC initialization failed -ETIMEDOUT
> > > [drm] *ERROR* GT1: Enabling uc failed (-5)
> > > [drm] *ERROR* GT1: Failed to initialize GPU, declaring it wedged!
>
Matt Roper April 13, 2023, 4:38 p.m. UTC | #10
On Thu, Apr 13, 2023 at 06:19:16PM +0200, Andi Shyti wrote:
> On Thu, Apr 13, 2023 at 09:03:29AM -0700, Ceraolo Spurio, Daniele wrote:
> > 
> > 
> > On 4/13/2023 8:52 AM, Matt Roper wrote:
> > > On Thu, Apr 13, 2023 at 03:56:21PM +0200, Andi Shyti wrote:
> > > > Hi Tvrtko,
> > > > 
> > > > (I forgot to CC Daniele)
> > > > 
> > > > On Thu, Apr 13, 2023 at 11:41:28AM +0100, Tvrtko Ursulin wrote:
> > > > > On 13/04/2023 10:20, Andi Shyti wrote:
> > > > > > From: Paulo Zanoni <paulo.r.zanoni@intel.com>
> > > > > > 
> > > > > > In multitile systems IRQ need to be reset and enabled per GT.
> > > > > > 
> > > > > > Although in MTL the GUnit misc interrupts register set are
> > > > > > available only in GT-0, we need to loop through all the GT's
> > > > > > in order to initialize the media engine which lies on a different
> > > > > > GT.
> > > > > > 
> > > > > > Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
> > > > > > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > > > > > Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
> > > > > > ---
> > > > > > Hi,
> > > > > > 
> > > > > > proposing again this patch, apparently GuC needs this patch to
> > > > > > initialize the media GT.
> > > > > What is the resolution for Matt's concern that this is wrong for MTL?
> > > > There are two explanations, one easy and one less easy.
> > > > 
> > > > The easy one: without this patch i915 doesn't boot on MTL!(*)
> > > > 
> > > > The second explanation is that in MTL the media engine has it's
> > > > own set of misc irq's registers and those are on a different GT
> > > > (Daniele pointed this out).
> > > Assuming you're talking about MTL_GUC_MGUC_INTR_MASK, that's not true;
> > > it's just a single sgunit register (0x1900e8) that has different
> > > bitfields for the primary GuC and the media GuC.  So I still think we
> > > should avoid looping over GTs; it's actually much simpler to handle
> > > things in a single pass since we can just determine the single register
> > > value once (all fields) and write it directly, instead of doing two
> > > separate RMW updates to the same register to try to avoid clobbering
> > > the other GuC's settings.
> 
> if we handle exceptions in a single pass wouldn't we have many
> exceptions to handle in the long run?

I don't think so, it basically boils down to something along the lines
of

        if (MEDIA_VER(i915) >= 13)
                val = HIGH_BITS | LOW_BITS;
        else
                val = HIGH_BITS;

        ...

        intel_uncore_write(val);

which isn't really any more complicated than today's logic:

        called for each gt {
                ...

                if (gt is MEDIA)
                        bits = LOW_BITS;
                else
                        bits = HIGH_BITS;

                ...

                intel_uncore_rmw(bits);
        }


Matt

> 
> > > For pre-MTL platforms, it's the same register, except that the bitfield
> > > now devoted to the media GuC was previously used for something else
> > > (scatter/gather).
> > 
> > It's not just the GuC, the VCS/VECS engine programming is also tied to the
> > media GT (via the HAS_ENGINE checks). It looks like we unconditionally
> > program VCS 0 and 2, so it'll still work for MTL, but if we get a device
> > with more VCS engines it'll break. Maybe we can add a MTL version of the
> > function that just programs everything unconditionally? Going forward it
> > should be ok to program things for engines that don't exist, but I'm not
> > sure we can do that for older platforms that came before the extra engines
> > were ever defined in HW.
> 
> This is more or less what Tvrtko has suggested, as well. Looks to
> me like replicating some code... anyway, I will try and see how
> it looks like.
> 
> Andi
> 
> PS Thanks Matt, Daniele and Tvrtko for the feedback.
> 
> > Daniele
> > 
> > > 
> > > 
> > > Matt
> > > 
> > > > I sent this patch not to bypass any review, but to restart the
> > > > discussion as this patch was just dropped.
> > > > 
> > > > Thanks,
> > > > Andi
> > > > 
> > > > 
> > > > (*)
> > > > [drm] *ERROR* GT1: GUC: CT: No response for request 0x550a (fence 7)
> > > > [drm] *ERROR* GT1: GUC: CT: Sending action 0x550a failed (-ETIMEDOUT) status=0X0
> > > > [drm] *ERROR* GT1: GUC: Failed to enable usage stats: -ETIMEDOUT
> > > > [drm] *ERROR* GT1: GuC initialization failed -ETIMEDOUT
> > > > [drm] *ERROR* GT1: Enabling uc failed (-5)
> > > > [drm] *ERROR* GT1: Failed to initialize GPU, declaring it wedged!
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index d24bdea65a3dc..524d64bf5d186 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2764,14 +2764,19 @@  static void dg1_irq_reset(struct drm_i915_private *dev_priv)
 {
 	struct intel_gt *gt = to_gt(dev_priv);
 	struct intel_uncore *uncore = gt->uncore;
+	unsigned int i;
 
 	dg1_master_intr_disable(dev_priv->uncore.regs);
 
-	gen11_gt_irq_reset(gt);
-	gen11_display_irq_reset(dev_priv);
+	for_each_gt(gt, dev_priv, i) {
+		gen11_gt_irq_reset(gt);
 
-	GEN3_IRQ_RESET(uncore, GEN11_GU_MISC_);
-	GEN3_IRQ_RESET(uncore, GEN8_PCU_);
+		uncore = gt->uncore;
+		GEN3_IRQ_RESET(uncore, GEN11_GU_MISC_);
+		GEN3_IRQ_RESET(uncore, GEN8_PCU_);
+	}
+
+	gen11_display_irq_reset(dev_priv);
 }
 
 void gen8_irq_power_well_post_enable(struct drm_i915_private *dev_priv,
@@ -3425,13 +3430,16 @@  static void gen11_irq_postinstall(struct drm_i915_private *dev_priv)
 
 static void dg1_irq_postinstall(struct drm_i915_private *dev_priv)
 {
-	struct intel_gt *gt = to_gt(dev_priv);
-	struct intel_uncore *uncore = gt->uncore;
 	u32 gu_misc_masked = GEN11_GU_MISC_GSE;
+	struct intel_gt *gt;
+	unsigned int i;
 
-	gen11_gt_irq_postinstall(gt);
+	for_each_gt(gt, dev_priv, i) {
+		gen11_gt_irq_postinstall(gt);
 
-	GEN3_IRQ_INIT(uncore, GEN11_GU_MISC_, ~gu_misc_masked, gu_misc_masked);
+		GEN3_IRQ_INIT(gt->uncore, GEN11_GU_MISC_, ~gu_misc_masked,
+			      gu_misc_masked);
+	}
 
 	if (HAS_DISPLAY(dev_priv)) {
 		icp_irq_postinstall(dev_priv);
@@ -3440,8 +3448,8 @@  static void dg1_irq_postinstall(struct drm_i915_private *dev_priv)
 				   GEN11_DISPLAY_IRQ_ENABLE);
 	}
 
-	dg1_master_intr_enable(uncore->regs);
-	intel_uncore_posting_read(uncore, DG1_MSTR_TILE_INTR);
+	dg1_master_intr_enable(to_gt(dev_priv)->uncore->regs);
+	intel_uncore_posting_read(to_gt(dev_priv)->uncore, DG1_MSTR_TILE_INTR);
 }
 
 static void cherryview_irq_postinstall(struct drm_i915_private *dev_priv)