diff mbox series

[v7,06/12] EDAC/amd64: Add AMD heterogeneous family 19h Model 30h-3fh

Message ID 20220203174942.31630-7-nchatrad@amd.com (mailing list archive)
State New, archived
Headers show
Series x86/edac/amd64: Add support for GPU nodes | expand

Commit Message

Naveen Krishna Chatradhi Feb. 3, 2022, 5:49 p.m. UTC
From: Muralidhara M K <muralimk@amd.com>

On heterogeneous systems with AMD CPUs, the data fabrics of the GPUs
are connected directly via custom links.

One such system, where Aldebaran GPU nodes are connected to the
Family 19h, model 30h family of CPU nodes.

Aldebaran GPU support was added to DRM framework
https://lists.freedesktop.org/archives/amd-gfx/2021-February/059694.html

Signed-off-by: Muralidhara M K <muralimk@amd.com>
Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
---
Link:
https://lkml.kernel.org/r/20211028130106.15701-6-nchatrad@amd.com

v6->v7:
* split the model specific assignments in patch 5 of v6 series
  


 drivers/edac/amd64_edac.c | 14 ++++++++++++++
 drivers/edac/amd64_edac.h |  2 ++
 2 files changed, 16 insertions(+)

Comments

Yazen Ghannam Feb. 15, 2022, 4:20 p.m. UTC | #1
On Thu, Feb 03, 2022 at 11:49:36AM -0600, Naveen Krishna Chatradhi wrote:
> From: Muralidhara M K <muralimk@amd.com>
> 
> On heterogeneous systems with AMD CPUs, the data fabrics of the GPUs
> are connected directly via custom links.
> 
> One such system, where Aldebaran GPU nodes are connected to the
> Family 19h, model 30h family of CPU nodes.
> 
> Aldebaran GPU support was added to DRM framework
> https://lists.freedesktop.org/archives/amd-gfx/2021-February/059694.html
>

This message doesn't describe the patch.
 
> Signed-off-by: Muralidhara M K <muralimk@amd.com>
> Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
> ---
> Link:
> https://lkml.kernel.org/r/20211028130106.15701-6-nchatrad@amd.com
> 
> v6->v7:
> * split the model specific assignments in patch 5 of v6 series
>   
> 
> 
>  drivers/edac/amd64_edac.c | 14 ++++++++++++++
>  drivers/edac/amd64_edac.h |  2 ++
>  2 files changed, 16 insertions(+)
> 
> diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
> index babd25f29845..54af7e38d26c 100644
> --- a/drivers/edac/amd64_edac.c
> +++ b/drivers/edac/amd64_edac.c
> @@ -4454,6 +4454,19 @@ static void per_family_init(struct amd64_pvt *pvt)
>  			pvt->f0_id			= PCI_DEVICE_ID_AMD_19H_M10H_DF_F0;
>  			pvt->f6_id			= PCI_DEVICE_ID_AMD_19H_M10H_DF_F6;
>  			pvt->max_mcs			= 2;
> +		} else if (pvt->model >= 0x30 && pvt->model <= 0x3f) {
> +			if (pvt->mc_node_id >= amd_nb_num()) {

"ALDEBARAN" is a specific device with unique PCI IDs. So this dependency on
amd_nb.c can be removed.

For example,
			if (pvt->F3->device == PCI_DEVICE_ID_AMD_ALDEBARAN_DF_F3)
				...

> +				pvt->ctl_name		= "ALDEBARAN";
> +				pvt->f0_id		= PCI_DEVICE_ID_AMD_ALDEBARAN_DF_F0;
> +				pvt->f6_id		= PCI_DEVICE_ID_AMD_ALDEBARAN_DF_F6;
> +				pvt->max_mcs		= 4;
> +				goto end_fam;

Why not just "break" here instead of jumping to the break below?

> +			} else {
> +				pvt->ctl_name		= "F19h_M30h";
> +				pvt->f0_id		= PCI_DEVICE_ID_AMD_19H_DF_F0;
> +				pvt->f6_id		= PCI_DEVICE_ID_AMD_19H_DF_F6;
> +				pvt->max_mcs		= 8;
> +			}
>  		} else {
>  			pvt->ctl_name			= "F19h";
>  			pvt->f0_id			= PCI_DEVICE_ID_AMD_19H_DF_F0;
> @@ -4476,6 +4489,7 @@ static void per_family_init(struct amd64_pvt *pvt)
>  		pvt->ops->get_mc_regs			= __read_mc_regs_df;
>  		pvt->ops->populate_csrows		= init_csrows_df;
>  		pvt->ops->get_umc_err_info		= update_umc_err_info;
> + end_fam:
>  		break;
>  
>  	default:

Thanks,
Yazen
diff mbox series

Patch

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index babd25f29845..54af7e38d26c 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -4454,6 +4454,19 @@  static void per_family_init(struct amd64_pvt *pvt)
 			pvt->f0_id			= PCI_DEVICE_ID_AMD_19H_M10H_DF_F0;
 			pvt->f6_id			= PCI_DEVICE_ID_AMD_19H_M10H_DF_F6;
 			pvt->max_mcs			= 2;
+		} else if (pvt->model >= 0x30 && pvt->model <= 0x3f) {
+			if (pvt->mc_node_id >= amd_nb_num()) {
+				pvt->ctl_name		= "ALDEBARAN";
+				pvt->f0_id		= PCI_DEVICE_ID_AMD_ALDEBARAN_DF_F0;
+				pvt->f6_id		= PCI_DEVICE_ID_AMD_ALDEBARAN_DF_F6;
+				pvt->max_mcs		= 4;
+				goto end_fam;
+			} else {
+				pvt->ctl_name		= "F19h_M30h";
+				pvt->f0_id		= PCI_DEVICE_ID_AMD_19H_DF_F0;
+				pvt->f6_id		= PCI_DEVICE_ID_AMD_19H_DF_F6;
+				pvt->max_mcs		= 8;
+			}
 		} else {
 			pvt->ctl_name			= "F19h";
 			pvt->f0_id			= PCI_DEVICE_ID_AMD_19H_DF_F0;
@@ -4476,6 +4489,7 @@  static void per_family_init(struct amd64_pvt *pvt)
 		pvt->ops->get_mc_regs			= __read_mc_regs_df;
 		pvt->ops->populate_csrows		= init_csrows_df;
 		pvt->ops->get_umc_err_info		= update_umc_err_info;
+ end_fam:
 		break;
 
 	default:
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index 07ff2c6c17c5..66f7b5d45a37 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -130,6 +130,8 @@ 
 #define PCI_DEVICE_ID_AMD_19H_M10H_DF_F6 0x14b3
 #define PCI_DEVICE_ID_AMD_19H_M50H_DF_F0 0x166a
 #define PCI_DEVICE_ID_AMD_19H_M50H_DF_F6 0x1670
+#define PCI_DEVICE_ID_AMD_ALDEBARAN_DF_F0 0x14d0
+#define PCI_DEVICE_ID_AMD_ALDEBARAN_DF_F6 0x14d6
 
 /*
  * Function 1 - Address Map