diff mbox series

[1/1] EDAC/i10nm: Skip the absent memory controllers

Message ID 20230706134216.37044-1-qiuxu.zhuo@intel.com (mailing list archive)
State New, archived
Headers show
Series [1/1] EDAC/i10nm: Skip the absent memory controllers | expand

Commit Message

Zhuo, Qiuxu July 6, 2023, 1:42 p.m. UTC
Some Sapphire Rapids workstations' absent memory controllers
still appear as PCIe devices that fool the i10nm_edac driver
and result in "shift exponet -66 is negative" call traces
from skx_get_dimm_info().

Skip the absent memory controllers to avoid the call traces.

Reported-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
Closes: https://lore.kernel.org/linux-edac/CAAd53p41Ku1m1rapeqb1xtD+kKuk+BaUW=dumuoF0ZO3GhFjFA@mail.gmail.com/T/#m5de16dce60a8c836ec235868c7c16e3fefad0cc2
Reported-by: Koba Ko <koba.ko@canonical.com>
Closes: https://lore.kernel.org/linux-edac/SA1PR11MB71305B71CCCC3D9305835202892AA@SA1PR11MB7130.namprd11.prod.outlook.com/T/#t
Fixes: d4dc89d069aa ("EDAC, i10nm: Add a driver for Intel 10nm server processors")
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
---
 drivers/edac/i10nm_base.c | 54 +++++++++++++++++++++++++++++++++++----
 1 file changed, 49 insertions(+), 5 deletions(-)

Comments

Kai-Heng Feng July 7, 2023, 5:30 a.m. UTC | #1
On Thu, Jul 6, 2023 at 9:46 PM Qiuxu Zhuo <qiuxu.zhuo@intel.com> wrote:
>
> Some Sapphire Rapids workstations' absent memory controllers
> still appear as PCIe devices that fool the i10nm_edac driver
> and result in "shift exponet -66 is negative" call traces
> from skx_get_dimm_info().
>
> Skip the absent memory controllers to avoid the call traces.
>
> Reported-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
> Closes: https://lore.kernel.org/linux-edac/CAAd53p41Ku1m1rapeqb1xtD+kKuk+BaUW=dumuoF0ZO3GhFjFA@mail.gmail.com/T/#m5de16dce60a8c836ec235868c7c16e3fefad0cc2
> Reported-by: Koba Ko <koba.ko@canonical.com>
> Closes: https://lore.kernel.org/linux-edac/SA1PR11MB71305B71CCCC3D9305835202892AA@SA1PR11MB7130.namprd11.prod.outlook.com/T/#t
> Fixes: d4dc89d069aa ("EDAC, i10nm: Add a driver for Intel 10nm server processors")
> Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>

Tested-by: Kai-Heng Feng <kai.heng.feng@canonical.com>

> ---
>  drivers/edac/i10nm_base.c | 54 +++++++++++++++++++++++++++++++++++----
>  1 file changed, 49 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
> index a897b6aff368..349ff6cfb379 100644
> --- a/drivers/edac/i10nm_base.c
> +++ b/drivers/edac/i10nm_base.c
> @@ -658,13 +658,49 @@ static struct pci_dev *get_ddr_munit(struct skx_dev *d, int i, u32 *offset, unsi
>         return mdev;
>  }
>
> +/**
> + * i10nm_imc_absent() - Check whether the memory controller @imc is absent
> + *
> + * @imc    : The pointer to the structure of memory controller EDAC device.
> + *
> + * RETURNS : true if the memory controller EDAC device is absent, false otherwise.
> + */
> +static bool i10nm_imc_absent(struct skx_imc *imc)
> +{
> +       u32 mcmtr;
> +       int i;
> +
> +       switch (res_cfg->type) {
> +       case SPR:
> +               for (i = 0; i < res_cfg->ddr_chan_num; i++) {
> +                       mcmtr = I10NM_GET_MCMTR(imc, i);
> +                       edac_dbg(1, "ch%d mcmtr reg %x\n", i, mcmtr);
> +                       if (mcmtr != ~0)
> +                               return false;
> +               }
> +
> +               /*
> +                * Some workstations' absent memory controllers still
> +                * appear as PCIe devices, misleading the EDAC driver.
> +                * By observing that the MMIO registers of these absent
> +                * memory controllers consistently hold the value of ~0.
> +                *
> +                * We identify a memory controller as absent by checking
> +                * if its MMIO register "mcmtr" == ~0 in all its channels.
> +                */
> +               return true;
> +       default:
> +               return false;
> +       }
> +}
> +
>  static int i10nm_get_ddr_munits(void)
>  {
>         struct pci_dev *mdev;
>         void __iomem *mbase;
>         unsigned long size;
>         struct skx_dev *d;
> -       int i, j = 0;
> +       int i, lmc, j = 0;
>         u32 reg, off;
>         u64 base;
>
> @@ -690,7 +726,7 @@ static int i10nm_get_ddr_munits(void)
>                 edac_dbg(2, "socket%d mmio base 0x%llx (reg 0x%x)\n",
>                          j++, base, reg);
>
> -               for (i = 0; i < res_cfg->ddr_imc_num; i++) {
> +               for (lmc = 0, i = 0; i < res_cfg->ddr_imc_num; i++) {
>                         mdev = get_ddr_munit(d, i, &off, &size);
>
>                         if (i == 0 && !mdev) {
> @@ -700,8 +736,6 @@ static int i10nm_get_ddr_munits(void)
>                         if (!mdev)
>                                 continue;
>
> -                       d->imc[i].mdev = mdev;
> -
>                         edac_dbg(2, "mc%d mmio base 0x%llx size 0x%lx (reg 0x%x)\n",
>                                  i, base + off, size, reg);
>
> @@ -712,7 +746,17 @@ static int i10nm_get_ddr_munits(void)
>                                 return -ENODEV;
>                         }
>
> -                       d->imc[i].mbase = mbase;
> +                       d->imc[lmc].mbase = mbase;
> +                       if (i10nm_imc_absent(&d->imc[lmc])) {
> +                               pci_dev_put(mdev);
> +                               iounmap(mbase);
> +                               d->imc[lmc].mbase = NULL;
> +                               edac_dbg(2, "Skip absent mc%d\n", i);
> +                               continue;
> +                       } else {
> +                               d->imc[lmc].mdev = mdev;
> +                               lmc++;
> +                       }
>                 }
>         }
>
> --
> 2.17.1
>
Zhuo, Qiuxu July 7, 2023, 6:38 a.m. UTC | #2
> From: Kai-Heng Feng <kai.heng.feng@canonical.com>
> ...
> Subject: Re: [PATCH 1/1] EDAC/i10nm: Skip the absent memory controllers
> 
> On Thu, Jul 6, 2023 at 9:46 PM Qiuxu Zhuo <qiuxu.zhuo@intel.com> wrote:
> >
> > Some Sapphire Rapids workstations' absent memory controllers still
> > appear as PCIe devices that fool the i10nm_edac driver and result in
> > "shift exponet -66 is negative" call traces from skx_get_dimm_info().
> >
> > Skip the absent memory controllers to avoid the call traces.
> >
> > Reported-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
> ...
> Tested-by: Kai-Heng Feng <kai.heng.feng@canonical.com>

Thanks for the testing feedback.

-Qiuxu
diff mbox series

Patch

diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
index a897b6aff368..349ff6cfb379 100644
--- a/drivers/edac/i10nm_base.c
+++ b/drivers/edac/i10nm_base.c
@@ -658,13 +658,49 @@  static struct pci_dev *get_ddr_munit(struct skx_dev *d, int i, u32 *offset, unsi
 	return mdev;
 }
 
+/**
+ * i10nm_imc_absent() - Check whether the memory controller @imc is absent
+ *
+ * @imc    : The pointer to the structure of memory controller EDAC device.
+ *
+ * RETURNS : true if the memory controller EDAC device is absent, false otherwise.
+ */
+static bool i10nm_imc_absent(struct skx_imc *imc)
+{
+	u32 mcmtr;
+	int i;
+
+	switch (res_cfg->type) {
+	case SPR:
+		for (i = 0; i < res_cfg->ddr_chan_num; i++) {
+			mcmtr = I10NM_GET_MCMTR(imc, i);
+			edac_dbg(1, "ch%d mcmtr reg %x\n", i, mcmtr);
+			if (mcmtr != ~0)
+				return false;
+		}
+
+		/*
+		 * Some workstations' absent memory controllers still
+		 * appear as PCIe devices, misleading the EDAC driver.
+		 * By observing that the MMIO registers of these absent
+		 * memory controllers consistently hold the value of ~0.
+		 *
+		 * We identify a memory controller as absent by checking
+		 * if its MMIO register "mcmtr" == ~0 in all its channels.
+		 */
+		return true;
+	default:
+		return false;
+	}
+}
+
 static int i10nm_get_ddr_munits(void)
 {
 	struct pci_dev *mdev;
 	void __iomem *mbase;
 	unsigned long size;
 	struct skx_dev *d;
-	int i, j = 0;
+	int i, lmc, j = 0;
 	u32 reg, off;
 	u64 base;
 
@@ -690,7 +726,7 @@  static int i10nm_get_ddr_munits(void)
 		edac_dbg(2, "socket%d mmio base 0x%llx (reg 0x%x)\n",
 			 j++, base, reg);
 
-		for (i = 0; i < res_cfg->ddr_imc_num; i++) {
+		for (lmc = 0, i = 0; i < res_cfg->ddr_imc_num; i++) {
 			mdev = get_ddr_munit(d, i, &off, &size);
 
 			if (i == 0 && !mdev) {
@@ -700,8 +736,6 @@  static int i10nm_get_ddr_munits(void)
 			if (!mdev)
 				continue;
 
-			d->imc[i].mdev = mdev;
-
 			edac_dbg(2, "mc%d mmio base 0x%llx size 0x%lx (reg 0x%x)\n",
 				 i, base + off, size, reg);
 
@@ -712,7 +746,17 @@  static int i10nm_get_ddr_munits(void)
 				return -ENODEV;
 			}
 
-			d->imc[i].mbase = mbase;
+			d->imc[lmc].mbase = mbase;
+			if (i10nm_imc_absent(&d->imc[lmc])) {
+				pci_dev_put(mdev);
+				iounmap(mbase);
+				d->imc[lmc].mbase = NULL;
+				edac_dbg(2, "Skip absent mc%d\n", i);
+				continue;
+			} else {
+				d->imc[lmc].mdev = mdev;
+				lmc++;
+			}
 		}
 	}