diff mbox series

EDAC/versal: Report the bit position of the error

Message ID 20240521115959.17596-1-shubhrajyoti.datta@amd.com (mailing list archive)
State New
Headers show
Series EDAC/versal: Report the bit position of the error | expand

Commit Message

Shubhrajyoti Datta May 21, 2024, 11:59 a.m. UTC
In the case of CE, the bit position can be derived from the data
and the parity registers. The values are read and
then from the fixed H Matrix the corrected bit
position is derived. The hardware already corrects the
CE errors however it is derived from the registers and
reported for debug purposes.

When a correctable error is detected, the driver reads from the
registers:

eccr0/eccr1_corr_err_data_lo
eccr0/eccr1_corr_err_data_hi
eccr0/eccr1_corr_err_data_par

and along with shifting/XORing fixed H_Matrix ECC values,
calculates which bit location is found, mimicking the calculation
done in the DDRMC ECC hw.

The H_Matrix was generated from an old 1950's IBM paper.

The maximum bits in the data read is 64 so the data match is run for
64 bits.

Signed-off-by: Shubhrajyoti Datta <shubhrajyoti.datta@amd.com>
---

 drivers/edac/versal_edac.c | 84 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)

Comments

Borislav Petkov June 12, 2024, 4:32 p.m. UTC | #1
On Tue, May 21, 2024 at 05:29:59PM +0530, Shubhrajyoti Datta wrote:
> In the case of CE, the bit position can be derived from the data
> and the parity registers. The values are read and
> then from the fixed H Matrix the corrected bit
> position is derived. The hardware already corrects the
> CE errors however it is derived from the registers and
> reported for debug purposes.
> 
> When a correctable error is detected, the driver reads from the
> registers:
> 
> eccr0/eccr1_corr_err_data_lo
> eccr0/eccr1_corr_err_data_hi
> eccr0/eccr1_corr_err_data_par
> 
> and along with shifting/XORing fixed H_Matrix ECC values,
> calculates which bit location is found, mimicking the calculation
> done in the DDRMC ECC hw.
> 
> The H_Matrix was generated from an old 1950's IBM paper.
> 
> The maximum bits in the data read is 64 so the data match is run for
> 64 bits.

This commit message is "all over the place". After reading it, I still don't
know "why" this patch even exists.

The goal for our commit messages is to be as clear to humans as possible, even
for people who do not have intimate knowledge of the matter.

And, more importantly, when we start doing git archeology months, years from
now, it should be perfectly clear why a commit was done.

A possible way to structure is:

1. Prepare the context for the explanation briefly.

2. Explain the problem at hand.

3. "It happens because of <...>"

4. "Fix it by doing X"

5. "(Potentially do Y)."

And some of those above are optional depending on the issue being explained.

For more detailed info, see
Documentation/process/submitting-patches.rst,
Section "2) Describe your changes".

Also, do not talk about what your patch does - that should (hopefully) be
visible from the diff itself. Rather, talk about *why* you're doing what
you're doing and about the non-obvious aspects of a code change.

So pls give it another try.

Thx.
diff mbox series

Patch

diff --git a/drivers/edac/versal_edac.c b/drivers/edac/versal_edac.c
index a556d23e8261..f02df3cd0faa 100644
--- a/drivers/edac/versal_edac.c
+++ b/drivers/edac/versal_edac.c
@@ -144,6 +144,8 @@ 
 #define XILINX_DRAM_SIZE_16G			4
 #define XILINX_DRAM_SIZE_32G			5
 #define NUM_UE_BITPOS				2
+#define NUM_ECC_BITS				72
+#define DATA_BITS				64
 
 /**
  * struct ecc_error_info - ECC error log information.
@@ -245,6 +247,79 @@  struct edac_priv {
 #endif
 };
 
+static int rowa[] = {
+	0x7, 0xB, 0x13, 0x23, 0x43, 0x83, 0xD, 0x15,
+	0x25, 0x45, 0x85, 0x19, 0x29, 0x49, 0x89, 0x31,
+	0x51, 0x91, 0x61, 0xa1, 0xc1, 0x0e, 0x16, 0x26,
+	0x46, 0x86, 0x1a, 0x2a, 0x4a, 0x8a, 0x32, 0x52,
+	0x92, 0x62, 0xa2, 0xc2, 0x1c, 0x2c, 0x4c, 0x8c,
+	0x34, 0x54, 0x94, 0x64, 0xa4, 0xc4, 0x38, 0x58,
+	0x98, 0x68, 0xa8, 0xc8, 0x70, 0xB0, 0xD0, 0xE0,
+	0x1f, 0x2f, 0x4f, 0x8f, 0x37, 0x57, 0x97, 0x67,
+	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
+
+/**
+ * calculate_ce_bit_value - Report the bit that was corrupted in the CE errors
+ *
+ * @eccr0_corr_err_data_hi:	ECC error data hi register
+ * @regval:	ECC error data lo register.
+ * @parity:	ECC error parity
+ *
+ * Return: none.
+ * Report the bit that was corrupted in the CE errors
+ *
+ * The function takes the data and the parity as inputs.
+ * It derive the corrupted bit from a fixed H Matrix and the data
+ * and parity.
+ */
+static void calculate_ce_bit_value(u32 eccr0_corr_err_data_hi, u32 regval, u32 parity)
+{
+	u8 bit, xor[BITS_PER_BYTE], match = 0;
+	u8 rows[NUM_ECC_BITS] = {0};
+	bool field, prev;
+	u64 data;
+	int i, j;
+
+	data = (u64)eccr0_corr_err_data_hi << 32;
+	data |= regval;
+
+	for (i = 0 ; i < DATA_BITS; i++) {
+		if (data & BIT(i))
+			rows[i] = rowa[i];
+	}
+
+	for (i = DATA_BITS ; i < NUM_ECC_BITS; i++) {
+		j = i - DATA_BITS;
+		if (parity & BIT(j))
+			rows[i] = rowa[i];
+	}
+
+	for (j = 0 ; j < BITS_PER_BYTE; j++) {
+		bit = rows[0] & BIT(j);
+		prev = !!bit;
+
+		for (i = 1 ; i < NUM_ECC_BITS; i++) {
+			bit = rows[i] & BIT(j);
+			field = !!bit;
+			if (field != prev)
+				prev = true;
+			else
+				prev = false;
+		}
+		xor[j] = prev;
+	}
+
+	for (j = 0 ; j < BITS_PER_BYTE; j++)
+		match |= xor[j] << j;
+
+	for (i = 0 ; i < NUM_ECC_BITS; i++) {
+		if (match == rowa[i]) {
+			edac_dbg(2, "bit found is  0x%08X\n", i);
+			return;
+		}
+	}
+}
+
 static void get_ce_error_info(struct edac_priv *priv)
 {
 	void __iomem *ddrmc_base;
@@ -271,6 +346,15 @@  static void get_ce_error_info(struct edac_priv *priv)
 	p->ceinfo[1].i = regval | reghi << 32;
 	regval = readl(ddrmc_base + ECCR1_CE_ADDR_HI_OFFSET);
 
+	if (IS_ENABLED(CONFIG_EDAC_DEBUG)) {
+		u32 eccr0_corr_err_data_hi, par;
+
+		par = readl(ddrmc_base + ECCR0_CE_DATA_PAR_OFFSET);
+		regval = readl(ddrmc_base + ECCR0_CE_DATA_LO_OFFSET);
+		eccr0_corr_err_data_hi =  readl(ddrmc_base + ECCR0_CE_DATA_HI_OFFSET);
+		calculate_ce_bit_value(eccr0_corr_err_data_hi, regval, par);
+	}
+
 	edac_dbg(2, "ERR DATA: 0x%08X%08X ERR DATA PARITY: 0x%08X\n",
 		 readl(ddrmc_base + ECCR1_CE_DATA_LO_OFFSET),
 		 readl(ddrmc_base + ECCR1_CE_DATA_HI_OFFSET),