diff mbox series

[v7,08/12] EDAC/amd64: Add Family ops to update GPU csrow and channel info

Message ID 20220203174942.31630-9-nchatrad@amd.com (mailing list archive)
State New, archived
Headers show
Series x86/edac/amd64: Add support for GPU nodes | expand

Commit Message

Naveen Krishna Chatradhi Feb. 3, 2022, 5:49 p.m. UTC
GPU node has 'X' number of PHYs and 'Y' number of channels.
This results in 'X*Y' number of instances in the Data Fabric.
Therefore the Data Fabric ID of an instance in GPU as below:
  df_inst_id = 'X' * number of channels per PHY + 'Y'

On CPUs the Data Fabric ID of an instance on a CPU is equal to the
UMC number. since the UMC number and channel are equal in CPU nodes,
the channel can be used as the Data Fabric ID of the instance.

Cc: Yazen Ghannam <yazen.ghannam@amd.com>
Co-developed-by: Muralidhara M K <muralimk@amd.com>
Signed-off-by: Muralidhara M K <muralimk@amd.com>
Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
---
v1->v7:
* New change in v7

 drivers/edac/amd64_edac.c | 60 +++++++++++++++++++++++++++++++++++++--
 drivers/edac/amd64_edac.h |  2 ++
 2 files changed, 60 insertions(+), 2 deletions(-)

Comments

Yazen Ghannam Feb. 15, 2022, 4:43 p.m. UTC | #1
On Thu, Feb 03, 2022 at 11:49:38AM -0600, Naveen Krishna Chatradhi wrote:
> GPU node has 'X' number of PHYs and 'Y' number of channels.
> This results in 'X*Y' number of instances in the Data Fabric.
> Therefore the Data Fabric ID of an instance in GPU as below:
>   df_inst_id = 'X' * number of channels per PHY + 'Y'
> 
> On CPUs the Data Fabric ID of an instance on a CPU is equal to the
> UMC number. since the UMC number and channel are equal in CPU nodes,
> the channel can be used as the Data Fabric ID of the instance.
> 
> Cc: Yazen Ghannam <yazen.ghannam@amd.com>
> Co-developed-by: Muralidhara M K <muralimk@amd.com>
> Signed-off-by: Muralidhara M K <muralimk@amd.com>
> Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
> ---
> v1->v7:
> * New change in v7
> 
>  drivers/edac/amd64_edac.c | 60 +++++++++++++++++++++++++++++++++++++--
>  drivers/edac/amd64_edac.h |  2 ++
>  2 files changed, 60 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
> index 10efe726a959..241419a0be93 100644
> --- a/drivers/edac/amd64_edac.c
> +++ b/drivers/edac/amd64_edac.c
> @@ -3653,6 +3653,30 @@ static inline void decode_bus_error(int node_id, struct mce *m)
>  	__log_ecc_error(mci, &err, ecc_type);
>  }
>  
> +/*
> + * On CPUs, The Data Fabric ID of an instance is equal to the UMC number.
> + * And since the UMC number and channel are equal in CPU nodes, the channel can be used
> + * as the Data Fabric ID of the instance.
> + */
> +static int f17_df_inst_id(struct mem_ctl_info *mci, struct amd64_pvt *pvt,
> +			  struct err_info *err)
> +{
> +	return err->channel;
> +}
> +
> +/*
> + * A GPU node has 'X' number of PHYs and 'Y' number of channels.
> + * This results in 'X*Y' number of instances in the Data Fabric.
> + * Therefore the Data Fabric ID of an instance can be found with the following formula:
> + * df_inst_id = 'X' * number of channels per PHY + 'Y'
> + *
> + */
> +static int gpu_df_inst_id(struct mem_ctl_info *mci, struct amd64_pvt *pvt,
> +			  struct err_info *err)
> +{
> +	return (err->csrow * pvt->channel_count / mci->nr_csrows) + err->channel;
> +}
> +

The DF Instance ID needs to get adjusted again later in the translation code
due to the fixed mapping of CSes to UMCs. Can that be done here instead? Also,
I assume that fixed mapping is unique to each product, so that would make it a
good fit for the family/pvt ops.

Thanks,
Yazen
diff mbox series

Patch

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 10efe726a959..241419a0be93 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -3653,6 +3653,30 @@  static inline void decode_bus_error(int node_id, struct mce *m)
 	__log_ecc_error(mci, &err, ecc_type);
 }
 
+/*
+ * On CPUs, The Data Fabric ID of an instance is equal to the UMC number.
+ * And since the UMC number and channel are equal in CPU nodes, the channel can be used
+ * as the Data Fabric ID of the instance.
+ */
+static int f17_df_inst_id(struct mem_ctl_info *mci, struct amd64_pvt *pvt,
+			  struct err_info *err)
+{
+	return err->channel;
+}
+
+/*
+ * A GPU node has 'X' number of PHYs and 'Y' number of channels.
+ * This results in 'X*Y' number of instances in the Data Fabric.
+ * Therefore the Data Fabric ID of an instance can be found with the following formula:
+ * df_inst_id = 'X' * number of channels per PHY + 'Y'
+ *
+ */
+static int gpu_df_inst_id(struct mem_ctl_info *mci, struct amd64_pvt *pvt,
+			  struct err_info *err)
+{
+	return (err->csrow * pvt->channel_count / mci->nr_csrows) + err->channel;
+}
+
 /*
  * To find the UMC channel represented by this bank we need to match on its
  * instance_id. The instance_id of a bank is held in the lower 32 bits of its
@@ -3670,12 +3694,38 @@  static void update_umc_err_info(struct mce *m, struct err_info *err)
 	err->csrow = m->synd & 0x7;
 }
 
+/*
+ * The CPUs have one channel per UMC, So  UMC number is equivalent to a
+ * channel number. The GPUs have 8 channels per UMC, so the UMC number no
+ * longer works as a channel number.
+ * The channel number within a GPU UMC is given in MCA_IPID[15:12].
+ * However, the IDs are split such that two UMC values go to one UMC, and
+ * the channel numbers are split in two groups of four.
+ *
+ * Refer comment on get_umc_base_gpu() from amd64_edac.h
+ *
+ * For example,
+ * UMC0 CH[3:0] = 0x0005[3:0]000
+ * UMC0 CH[7:4] = 0x0015[3:0]000
+ * UMC1 CH[3:0] = 0x0025[3:0]000
+ * UMC1 CH[7:4] = 0x0035[3:0]000
+ */
+static void gpu_update_umc_err_info(struct mce *m, struct err_info *err)
+{
+	u8 ch = (m->ipid & GENMASK(31, 0)) >> 20;
+	u8 phy = ((m->ipid >> 12) & 0xf);
+
+	err->channel = ch % 2 ? phy + 4 : phy;
+	err->csrow = phy;
+}
+
 static void decode_umc_error(int node_id, struct mce *m)
 {
 	u8 ecc_type = (m->status >> 45) & 0x3;
 	struct mem_ctl_info *mci;
 	struct amd64_pvt *pvt;
 	struct err_info err;
+	u8 df_inst_id;
 	u64 sys_addr;
 
 	mci = edac_mc_find(node_id);
@@ -3705,7 +3755,9 @@  static void decode_umc_error(int node_id, struct mce *m)
 
 	pvt->ops->get_umc_err_info(m, &err);
 
-	if (umc_normaddr_to_sysaddr(m->addr, pvt->mc_node_id, err.channel, &sys_addr)) {
+	df_inst_id = pvt->ops->find_umc_inst_id(mci, pvt, &err);
+
+	if (umc_normaddr_to_sysaddr(m->addr, pvt->mc_node_id, df_inst_id, &sys_addr)) {
 		err.err_code = ERR_NORM_ADDR;
 		goto log_error;
 	}
@@ -4625,6 +4677,7 @@  static void per_family_init(struct amd64_pvt *pvt)
 		pvt->ops->get_mc_regs			= __read_mc_regs_df;
 		pvt->ops->populate_csrows		= init_csrows_df;
 		pvt->ops->get_umc_err_info		= update_umc_err_info;
+		pvt->ops->find_umc_inst_id		= f17_df_inst_id;
 
 		if (pvt->fam == 0x18) {
 			pvt->ctl_name			= "F18h";
@@ -4678,6 +4731,8 @@  static void per_family_init(struct amd64_pvt *pvt)
 				pvt->ops->dump_misc_regs	= gpu_dump_misc_regs;
 				pvt->ops->get_mc_regs		= gpu_read_mc_regs;
 				pvt->ops->populate_csrows	= gpu_init_csrows;
+				pvt->ops->get_umc_err_info	= gpu_update_umc_err_info;
+				pvt->ops->find_umc_inst_id	= gpu_df_inst_id;
 				goto end_fam;
 			} else {
 				pvt->ctl_name		= "F19h_M30h";
@@ -4707,6 +4762,7 @@  static void per_family_init(struct amd64_pvt *pvt)
 		pvt->ops->get_mc_regs			= __read_mc_regs_df;
 		pvt->ops->populate_csrows		= init_csrows_df;
 		pvt->ops->get_umc_err_info		= update_umc_err_info;
+		pvt->ops->find_umc_inst_id		= f17_df_inst_id;
  end_fam:
 		break;
 
@@ -4728,7 +4784,7 @@  static void per_family_init(struct amd64_pvt *pvt)
 	}
 
 	/* ops required for families 17h and later */
-	if (pvt->fam >= 0x17 && !pvt->ops->get_umc_err_info) {
+	if (pvt->fam >= 0x17 && (!pvt->ops->get_umc_err_info || !pvt->ops->find_umc_inst_id)) {
 		edac_dbg(1, "Platform specific helper routines not defined.\n");
 		return;
 	}
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index 71df08a496d2..b6177bd5d5ba 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -496,6 +496,8 @@  struct low_ops {
 	void (*setup_mci_misc_attrs)(struct mem_ctl_info *mci);
 	int  (*populate_csrows)(struct mem_ctl_info *mci);
 	void (*get_umc_err_info)(struct mce *m, struct err_info *err);
+	int  (*find_umc_inst_id)(struct mem_ctl_info *mci, struct amd64_pvt *pvt,
+				 struct err_info *err);
 };
 
 int __amd64_read_pci_cfg_dword(struct pci_dev *pdev, int offset,