diff mbox series

amdgpu: disable amdgpu_dpm on THTF-SW831-1W-DS25_MB board

Message ID 3EA7C2B9E8C4D00A+20240828105938.37674-1-wangyuli@uniontech.com (mailing list archive)
State New, archived
Headers show
Series amdgpu: disable amdgpu_dpm on THTF-SW831-1W-DS25_MB board | expand

Commit Message

WangYuli Aug. 28, 2024, 10:59 a.m. UTC
From: wenlunpeng <wenlunpeng@uniontech.com>

The quirk is for reboot-stability.

A device reboot stress test has been observed to cause
random system hangs when amdgpu_dpm is enabled.

Disabling amdgpu_dpm can fix this.

However, a boot parameter can still override the quirk and
enable amdgpu_dpm.

Serial log when error occurs:
...
Console: switching to colour frame buffer device 160x45
amdgpu 0000:01:00.0: fb0: amdgpudrmfb frame buffer device
[drm:amdgpu_device_ip_late_init] *ERROR* late_init of IP block <si_dpm> failed -22
amdgpu 0000:01:00.0: amdgpu_device_ip_late_init failed
amdgpu 0000:01:00.0: Fatal error during GPU init
[drm] amdgpu: finishing device.
Console: switching to colour dummy device 80x25
...

Signed-off-by: wenlunpeng <wenlunpeng@uniontech.com>
Signed-off-by: WangYuli <wangyuli@uniontech.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

Comments

Mario Limonciello Aug. 28, 2024, 3:19 p.m. UTC | #1
On 8/28/2024 05:59, WangYuli wrote:
> From: wenlunpeng <wenlunpeng@uniontech.com>
> 
> The quirk is for reboot-stability.
> 
> A device reboot stress test has been observed to cause
> random system hangs when amdgpu_dpm is enabled.
> 
> Disabling amdgpu_dpm can fix this.
> 
> However, a boot-param can still overwrite it to enable
> amdgpu_dpm.
> 
> Serial log when error occurs:
> ...
> Console: switching to colour frame buffer device 160x45
> amdgpu 0000:01:00.0: fb0: amdgpudrmfb frame buffer device
> [drm:amdgpu_device_ip_late_init] *ERROR* late_init of IP block <si_dpm> failed -22
> amdgpu 0000:01:00.0: amdgpu_device_ip_late_init failed
> amdgpu 0000:01:00.0: Fatal error during GPU init
> [drm] amdgpu: finishing device.
> Console: switching to colour dummy device 80x25
> ...

This is production hardware?

Have you already checked whether a BIOS upgrade for the device could 
help this issue?

> 
> Signed-off-by: wenlunpeng <wenlunpeng@uniontech.com>
> Signed-off-by: WangYuli <wangyuli@uniontech.com>

Just to clarify, did you guys co-work on this patch, or are you 
submitting on behalf of wenlunpeng?  Right now it shows up as you 
submitting on behalf of wenlunpeng.  If you co-worked on it you should 
also use a Co-developed-by tag.

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 23 +++++++++++++++++++++++
>   1 file changed, 23 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 094498a0964b..81716fcac7cd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -32,6 +32,7 @@
>   #include <drm/drm_vblank.h>
>   
>   #include <linux/cc_platform.h>
> +#include <linux/dmi.h>
>   #include <linux/dynamic_debug.h>
>   #include <linux/module.h>
>   #include <linux/mmu_notifier.h>
> @@ -3023,10 +3024,32 @@ static struct pci_driver amdgpu_kms_pci_driver = {
>   	.dev_groups = amdgpu_sysfs_groups,
>   };
>   
> +static int quirk_set_amdgpu_dpm_0(const struct dmi_system_id *dmi)
> +{
> +	amdgpu_dpm = 0;
> +	pr_info("Identified '%s', set amdgpu_dpm to 0.\n", dmi->ident);
> +	return 1;
> +}
> +
> +static const struct dmi_system_id amdgpu_quirklist[] = {
> +	{
> +		.ident = "DS25 Desktop",
> +		.matches = {
> +			DMI_MATCH(DMI_BOARD_NAME, "THTF-SW831-1W-DS25_MB"),

As this is suspected to be a BIOS issue, I would like to better 
understand if the BIOS upgrade fixes it.  If it does but you would still 
like a quirk for the system it should include the BIOS version here.

> +		},
> +		.callback = quirk_set_amdgpu_dpm_0,
> +	},
> +	{}
> +};
> +
>   static int __init amdgpu_init(void)
>   {
>   	int r;
>   
> +	/* quirks for some hardware, applied only when it's untouched */
> +	if (amdgpu_dpm == -1)
> +		dmi_check_system(amdgpu_quirklist);
> +
>   	if (drm_firmware_drivers_only())
>   		return -EINVAL;
>
Alex Deucher Aug. 28, 2024, 3:30 p.m. UTC | #2
On Wed, Aug 28, 2024 at 7:28 AM WangYuli <wangyuli@uniontech.com> wrote:
>
> From: wenlunpeng <wenlunpeng@uniontech.com>
>
> The quirk is for reboot-stability.
>
> A device reboot stress test has been observed to cause
> random system hangs when amdgpu_dpm is enabled.
>
> Disabling amdgpu_dpm can fix this.
>
> However, a boot-param can still overwrite it to enable
> amdgpu_dpm.
>
> Serial log when error occurs:
> ...
> Console: switching to colour frame buffer device 160x45
> amdgpu 0000:01:00.0: fb0: amdgpudrmfb frame buffer device
> [drm:amdgpu_device_ip_late_init] *ERROR* late_init of IP block <si_dpm> failed -22
> amdgpu 0000:01:00.0: amdgpu_device_ip_late_init failed
> amdgpu 0000:01:00.0: Fatal error during GPU init
> [drm] amdgpu: finishing device.
> Console: switching to colour dummy device 80x25
> ...
>
> Signed-off-by: wenlunpeng <wenlunpeng@uniontech.com>
> Signed-off-by: WangYuli <wangyuli@uniontech.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 23 +++++++++++++++++++++++
>  1 file changed, 23 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 094498a0964b..81716fcac7cd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -32,6 +32,7 @@
>  #include <drm/drm_vblank.h>
>
>  #include <linux/cc_platform.h>
> +#include <linux/dmi.h>
>  #include <linux/dynamic_debug.h>
>  #include <linux/module.h>
>  #include <linux/mmu_notifier.h>
> @@ -3023,10 +3024,32 @@ static struct pci_driver amdgpu_kms_pci_driver = {
>         .dev_groups = amdgpu_sysfs_groups,
>  };
>
> +static int quirk_set_amdgpu_dpm_0(const struct dmi_system_id *dmi)
> +{
> +       amdgpu_dpm = 0;

This will disable dpm on all devices that you might install on this
platform.  If this is specific to a particular platform and board
combination, it might be better to check the platform in the
dpm_init() code for the specific chip that is problematic.
Additionally, disabling dpm will result in boot clocks which means
performance will be very low.

Alex

> +       pr_info("Identified '%s', set amdgpu_dpm to 0.\n", dmi->ident);
> +       return 1;
> +}
> +
> +static const struct dmi_system_id amdgpu_quirklist[] = {
> +       {
> +               .ident = "DS25 Desktop",
> +               .matches = {
> +                       DMI_MATCH(DMI_BOARD_NAME, "THTF-SW831-1W-DS25_MB"),
> +               },
> +               .callback = quirk_set_amdgpu_dpm_0,
> +       },
> +       {}
> +};
> +
>  static int __init amdgpu_init(void)
>  {
>         int r;
>
> +       /* quirks for some hardware, applied only when it's untouched */
> +       if (amdgpu_dpm == -1)
> +               dmi_check_system(amdgpu_quirklist);
> +
>         if (drm_firmware_drivers_only())
>                 return -EINVAL;
>
> --
> 2.43.4
>
WangYuli Aug. 28, 2024, 3:37 p.m. UTC | #3
On 2024/8/28 23:19, Mario Limonciello wrote:
> This is production hardware?

Unfortunately, this device was released quite a while back.
>
> Have you already checked whether a BIOS upgrade for the device could 
> help this issue?


Sadly, there's no BIOS update to address this problem. It seems to be a 
persistent issue across all BIOS versions for this motherboard.

>
> Just to clarify did you guys co-work on this patch, or are you 
> submitting on behalf of wenlunpeng?  It right now shows up as you 
> submitting on behalf of wenlunpeng.  If you co-worked on it you should 
> also use a Co-Developed-by tag.


No, the entirety of this bugfix is attributable to wenlunpeng.

>
> As this is suspected to be a BIOS issue, I would like to better 
> understand if the BIOS upgrade fixes it.  If it does but you would 
> still like a quirk for the system it should include the BIOS version 
> here.
>
>
Unfortunately, again, no BIOS version fixes it. There's not much we as 
kernel developers can do when devices behave unpredictably.
WangYuli Aug. 28, 2024, 3:47 p.m. UTC | #4
On 2024/8/28 23:30, Alex Deucher wrote:
> On Wed, Aug 28, 2024 at 7:28 AM WangYuli <wangyuli@uniontech.com> wrote:
>
> This will disable dpm on all devices that you might install on this
> platform.  If this is specific to a particular platform and board
> combination, it might be better to check the platform in the
> dpm_init() code for the specific chip that is problematic.
> Additionally, disabling dpm will result in boot clocks which means
> performance will be very low.
>
> Alex

This motherboard model doesn't have combinations with different 
platforms or chipsets now. Their model numbers are unique, so it seems 
unnecessary to add an extra check.
Alex Deucher Aug. 28, 2024, 4:14 p.m. UTC | #5
On Wed, Aug 28, 2024 at 11:47 AM WangYuli <wangyuli@uniontech.com> wrote:
>
>
> On 2024/8/28 23:30, Alex Deucher wrote:
> > On Wed, Aug 28, 2024 at 7:28 AM WangYuli <wangyuli@uniontech.com> wrote:
> >
> > This will disable dpm on all devices that you might install on this
> > platform.  If this is specific to a particular platform and board
> > combination, it might be better to check the platform in the
> > dpm_init() code for the specific chip that is problematic.
> > Additionally, disabling dpm will result in boot clocks which means
> > performance will be very low.
> >
> > Alex
>
> This motherboard model doesn't have combinations with different
> platforms or chipsets now.Their model numbers are unique,so it seems
> unnecessary to add extra judgment.

The error message looks to be from an SI board which is a dGPU.  Is
that dGPU integrated into the motherboard?  Does the motherboard have
PCIe slots?  If there are PCIe slots you could presumably put any GPU
into it and if you did, dpm would be disabled by default.

Alex
Mario Limonciello Aug. 28, 2024, 9:31 p.m. UTC | #6
On 8/28/2024 11:14, Alex Deucher wrote:
> On Wed, Aug 28, 2024 at 11:47 AM WangYuli <wangyuli@uniontech.com> wrote:
>>
>>
>> On 2024/8/28 23:30, Alex Deucher wrote:
>>> On Wed, Aug 28, 2024 at 7:28 AM WangYuli <wangyuli@uniontech.com> wrote:
>>>
>>> This will disable dpm on all devices that you might install on this
>>> platform.  If this is specific to a particular platform and board
>>> combination, it might be better to check the platform in the
>>> dpm_init() code for the specific chip that is problematic.
>>> Additionally, disabling dpm will result in boot clocks which means
>>> performance will be very low.
>>>
>>> Alex
>>
>> This motherboard model doesn't have combinations with different
>> platforms or chipsets now.Their model numbers are unique,so it seems
>> unnecessary to add extra judgment.
> 
> The error message looks to be from an SI board which is a dGPU.  Is
> that dGPU integrated into the motherboard?  Does the motherboard have
> PCIe slots?  If there are PCIe slots you could presumably put any GPU
> into it and if you did, dpm would be disabled by default.
> 
> Alex

I would also then question whether the SI board also has failures in 
other systems.  If not, it could point at BIOS bugs with the PCIe 
implementation from the MB.

One thing that you can try to do is turn off pcie port pm using the 
kernel command line and see if things improve.  Some motherboards have 
had issues with this in the past.
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 094498a0964b..81716fcac7cd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -32,6 +32,7 @@ 
 #include <drm/drm_vblank.h>
 
 #include <linux/cc_platform.h>
+#include <linux/dmi.h>
 #include <linux/dynamic_debug.h>
 #include <linux/module.h>
 #include <linux/mmu_notifier.h>
@@ -3023,10 +3024,32 @@  static struct pci_driver amdgpu_kms_pci_driver = {
 	.dev_groups = amdgpu_sysfs_groups,
 };
 
+static int quirk_set_amdgpu_dpm_0(const struct dmi_system_id *dmi)
+{
+	amdgpu_dpm = 0;
+	pr_info("Identified '%s', set amdgpu_dpm to 0.\n", dmi->ident);
+	return 1;
+}
+
+static const struct dmi_system_id amdgpu_quirklist[] = {
+	{
+		.ident = "DS25 Desktop",
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "THTF-SW831-1W-DS25_MB"),
+		},
+		.callback = quirk_set_amdgpu_dpm_0,
+	},
+	{}
+};
+
 static int __init amdgpu_init(void)
 {
 	int r;
 
+	/* quirks for some hardware, applied only when it's untouched */
+	if (amdgpu_dpm == -1)
+		dmi_check_system(amdgpu_quirklist);
+
 	if (drm_firmware_drivers_only())
 		return -EINVAL;