diff mbox series

[1/1] EDAC/{skx_common,i10nm}: Fix some missing error reports on Emerald Rapids

Message ID 20250214002728.6287-1-qiuxu.zhuo@intel.com (mailing list archive)
State New
Headers show
Series [1/1] EDAC/{skx_common,i10nm}: Fix some missing error reports on Emerald Rapids | expand

Commit Message

Qiuxu Zhuo Feb. 14, 2025, 12:27 a.m. UTC
When doing error injection to some memory DIMMs on certain Intel Emerald
Rapids servers, the i10nm_edac missed error reports for some memory DIMMs.

Certain BIOS configurations may hide some memory controllers, and the
i10nm_edac doesn't enumerate these hidden memory controllers. However, the
ADXL decodes memory errors using memory controller physical indices even
if there are hidden memory controllers. Therefore, the memory controller
physical indices reported by the ADXL may mismatch the logical indices
enumerated by the i10nm_edac, resulting in missed error reports for some
memory DIMMs.

Fix this issue by creating a mapping table from memory controller physical
indices (used by the ADXL) to logical indices (used by the i10nm_edac) and
using it to convert the physical indices to the logical indices during the
error handling process.

Fixes: c545f5e41225 ("EDAC/i10nm: Skip the absent memory controllers")
Reported-by: Kevin Chang <kevin1.chang@intel.com>
Tested-by: Kevin Chang <kevin1.chang@intel.com>
Reported-by: Thomas Chen <Thomas.Chen@intel.com>
Tested-by: Thomas Chen <Thomas.Chen@intel.com>
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
---
 drivers/edac/i10nm_base.c |  2 ++
 drivers/edac/skx_common.c | 33 +++++++++++++++++++++++++++++++++
 drivers/edac/skx_common.h | 11 +++++++++++
 3 files changed, 46 insertions(+)

Comments

Luck, Tony Feb. 21, 2025, 1:15 a.m. UTC | #1
On Fri, Feb 14, 2025 at 08:27:28AM +0800, Qiuxu Zhuo wrote:
> When doing error injection to some memory DIMMs on certain Intel Emerald
> Rapids servers, the i10nm_edac missed error reports for some memory DIMMs.
> 
> Certain BIOS configurations may hide some memory controllers, and the
> i10nm_edac doesn't enumerate these hidden memory controllers. However, the
> ADXL decodes memory errors using memory controller physical indices even
> if there are hidden memory controllers. Therefore, the memory controller
> physical indices reported by the ADXL may mismatch the logical indices
> enumerated by the i10nm_edac, resulting in missed error reports for some
> memory DIMMs.
> 
> Fix this issue by creating a mapping table from memory controller physical
> indices (used by the ADXL) to logical indices (used by the i10nm_edac) and
> using it to convert the physical indices to the logical indices during the
> error handling process.
> 
> Fixes: c545f5e41225 ("EDAC/i10nm: Skip the absent memory controllers")
> Reported-by: Kevin Chang <kevin1.chang@intel.com>
> Tested-by: Kevin Chang <kevin1.chang@intel.com>
> Reported-by: Thomas Chen <Thomas.Chen@intel.com>
> Tested-by: Thomas Chen <Thomas.Chen@intel.com>
> Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>

Applied to RAS tree edac-drivers branch

Thanks

-Tony
diff mbox series

Patch

diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
index f45d849d3f15..355a977019e9 100644
--- a/drivers/edac/i10nm_base.c
+++ b/drivers/edac/i10nm_base.c
@@ -751,6 +751,8 @@  static int i10nm_get_ddr_munits(void)
 				continue;
 			} else {
 				d->imc[lmc].mdev = mdev;
+				if (res_cfg->type == SPR)
+					skx_set_mc_mapping(d, i, lmc);
 				lmc++;
 			}
 		}
diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c
index f7bd930e058f..fa5b442b1844 100644
--- a/drivers/edac/skx_common.c
+++ b/drivers/edac/skx_common.c
@@ -121,6 +121,35 @@  void skx_adxl_put(void)
 }
 EXPORT_SYMBOL_GPL(skx_adxl_put);
 
+static void skx_init_mc_mapping(struct skx_dev *d)
+{
+	/*
+	 * By default, the BIOS presents all memory controllers within each
+	 * socket to the EDAC driver. The physical indices are the same as
+	 * the logical indices of the memory controllers enumerated by the
+	 * EDAC driver.
+	 */
+	for (int i = 0; i < NUM_IMC; i++)
+		d->mc_mapping[i] = i;
+}
+
+void skx_set_mc_mapping(struct skx_dev *d, u8 pmc, u8 lmc)
+{
+	edac_dbg(0, "Set the mapping of mc phy idx to logical idx: %02d -> %02d\n",
+		 pmc, lmc);
+
+	d->mc_mapping[pmc] = lmc;
+}
+EXPORT_SYMBOL_GPL(skx_set_mc_mapping);
+
+static u8 skx_get_mc_mapping(struct skx_dev *d, u8 pmc)
+{
+	edac_dbg(0, "Get the mapping of mc phy idx to logical idx: %02d -> %02d\n",
+		 pmc, d->mc_mapping[pmc]);
+
+	return d->mc_mapping[pmc];
+}
+
 static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src)
 {
 	struct skx_dev *d;
@@ -188,6 +217,8 @@  static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src)
 		return false;
 	}
 
+	res->imc = skx_get_mc_mapping(d, res->imc);
+
 	for (i = 0; i < adxl_component_count; i++) {
 		if (adxl_values[i] == ~0x0ull)
 			continue;
@@ -326,6 +357,8 @@  int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list)
 			 d->bus[0], d->bus[1], d->bus[2], d->bus[3]);
 		list_add_tail(&d->list, &dev_edac_list);
 		prev = pdev;
+
+		skx_init_mc_mapping(d);
 	}
 
 	if (list)
diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h
index b0845bdd4516..ca5408803f87 100644
--- a/drivers/edac/skx_common.h
+++ b/drivers/edac/skx_common.h
@@ -93,6 +93,16 @@  struct skx_dev {
 	struct pci_dev *uracu; /* for i10nm CPU */
 	struct pci_dev *pcu_cr3; /* for HBM memory detection */
 	u32 mcroute;
+	/*
+	 * Some server BIOS may hide certain memory controllers, and the
+	 * EDAC driver skips those hidden memory controllers. However, the
+	 * ADXL still decodes memory error address using physical memory
+	 * controller indices. The mapping table is used to convert the
+	 * physical indices (reported by ADXL) to the logical indices
+	 * (used the EDAC driver) of present memory controllers during the
+	 * error handling process.
+	 */
+	u8 mc_mapping[NUM_IMC];
 	struct skx_imc {
 		struct mem_ctl_info *mci;
 		struct pci_dev *mdev; /* for i10nm CPU */
@@ -242,6 +252,7 @@  void skx_adxl_put(void);
 void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log);
 void skx_set_mem_cfg(bool mem_cfg_2lm);
 void skx_set_res_cfg(struct res_config *cfg);
+void skx_set_mc_mapping(struct skx_dev *d, u8 pmc, u8 lmc);
 
 int skx_get_src_id(struct skx_dev *d, int off, u8 *id);