Message ID | 20230720125425.3735538-8-muralimk@amd.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | AMD Family 19h Models 90h-9fh EDAC Support | expand |
On 7/20/2023 8:54 AM, Muralidhara M K wrote: > From: Muralidhara M K <muralidhara.mk@amd.com> > > Reported MCA address is DRAM address which needs to be converted > to normalized address before Data fabric address translation. > > Some AMD systems have on-chip memory capable of OnDie ECC support. > OnDie-ECC error address to MCA is a DRAM decoded address reported with > a DRAM address (PC/SID/Bank/ROW/COL) instead of normalized address > unlike MI200’s UMC ECC, as the implementation difference between > HBM3 ODECC and HBM2 host ECC. > Because OnDie-ECC address reporting is done in the back-end of UMC and > it no longer has normalized address at that point. > So software needs to convert the reported MCA Error Address back to > normalized address. > > Signed-off-by: Muralidhara M K <muralidhara.mk@amd.com> > --- > drivers/edac/amd64_edac.c | 160 ++++++++++++++++++++++++++++++++++++++ > 1 file changed, 160 insertions(+) > > diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c > index 74b2b47cc22a..304d104c25d8 100644 > --- a/drivers/edac/amd64_edac.c > +++ b/drivers/edac/amd64_edac.c > @@ -3076,6 +3076,159 @@ static void umc_get_err_info(struct mce *m, struct err_info *err) > err->csrow = m->synd & 0x7; > } > > +static bool internal_bit_wise_xor(u32 inp) > +{ > + bool tmp = 0; > + > + for (int i = 0; i < 32; i++) > + tmp = tmp ^ ((inp >> i) & 0x1); > + > + return tmp; > +} > + > +/* mapping of MCA error address to normalized address */ > +static const u8 umc_mca2na_mapping[] = { > + 0, 5, 6, 8, 9, 14, 12, 13, > + 10, 11, 15, 16, 17, 18, 19, 20, > + 21, 22, 23, 24, 25, 26, 27, 28, > + 7, 29, 30, > +}; > + > +/* > + * Read AMD PPR UMC::AddrHashBank and > + * UMC::CH::AddrHashPC/PC2 register fields > + */ > +static struct { > + u32 xor_enable :1; > + u32 col_xor :13; > + u32 row_xor :18; > +} addr_hash_pc, addr_hash_bank[4]; > + > +static struct { > + u32 bank_xor :6; > +} addr_hash_pc2; > + > +/* > + * The location of bank, column and row are fixed. 
> + * location of column bit must be NA[5]. > + * Row bits are always placed in a contiguous stretch of NA above the > + * column and bank bits. > + * Bits below the row bits can be either column or bank in any order, > + * with the exception that NA[5] must be a column bit. > + * Stack ID(SID) bits are placed in the MSB position of the NA. > + */ > +static int umc_ondie_addr_to_normaddr(u64 mca_addr, u16 nid) > +{ > + u32 bank[4], bank_hash[4], pc_hash; > + u32 col, row, rawbank = 0, pc; > + int i, temp = 0; > + u64 mca2na; > + > + u32 gpu_umc_base = 0x90000; > + > + /* > + * the below calculation, trying to maps ondie error address > + * to normalized address. logged ondie MCA address format is > + * BEQ_MCA_RdDatAddr[27:0] = > + * {SID[1:0],PC[0],row[14:0],bank[3:0],col[4:0],1'b0} > + * The conversion mappings are: > + * > + * Normalized location ondie MCA error Address > + * =================== ====================== > + * NA[4] = 1'b0 > + * NA[5] = col[0] = BEQ_MCA_RdDatAddr[1] > + * NA[6] = col[1] = BEQ_MCA_RdDatAddr[2] > + * NA[8] = col[2] = BEQ_MCA_RdDatAddr[3] > + * NA[9] = col[3] = BEQ_MCA_RdDatAddr[4] > + * NA[14] = col[4] = BEQ_MCA_RdDatAddr[5] > + * NA[12] = bank[0] = BEQ_MCA_RdDatAddr[6] > + * NA[13] = bank[1] = BEQ_MCA_RdDatAddr[7] > + * NA[10] = bank[2] = BEQ_MCA_RdDatAddr[8] > + * NA[11] = bank[3] = BEQ_MCA_RdDatAddr[9] > + * > + * row low is 12 bit locations, low lsb bit starts from 10 > + * NA[15..26] = row[0..11] = BEQ_MCA_RdDatAddr[10..21] > + * > + * row high is 2 bit locations, high lsb bit starts from 22 > + * NA[27..28] = row[12..13] = BEQ_MCA_RdDatAddr[22..23] > + * > + * NA[7] = PC[0] = BEQ_MCA_RdDatAddr[25] > + * NA[29] = sid[0] = bank[4] = BEQ_MCA_RdDatAddr[26] > + * NA[30] = sid[1] = bank[5] = BEQ_MCA_RdDatAddr[27] > + * Basically, it calculates a locations to fit as shown in > + * table umc_mca2na_mapping[]. > + * > + * XORs need to be applied based on the hash settings below.
> + */ > + > + /* Calculate column and row */ > + col = FIELD_GET(GENMASK(5, 1), mca_addr); > + row = FIELD_GET(GENMASK(23, 10), mca_addr); > + > + /* Apply hashing on below banks for bank calculation */ > + for (i = 0; i < 4; i++) > + bank_hash[i] = (mca_addr >> (6 + i)) & 0x1; > + > + /* bank hash algorithm */ > + for (i = 0; i < 4; i++) { > + /* Read AMD PPR UMC::AddrHashBank register*/ > + if (!amd_smn_read(nid, gpu_umc_base + 0xC8 + (i * 4), &temp)) { > + addr_hash_bank[i].xor_enable = temp & 1; > + addr_hash_bank[i].col_xor = FIELD_GET(GENMASK(13, 1), temp); > + addr_hash_bank[i].row_xor = FIELD_GET(GENMASK(31, 14), temp); > + /* bank hash selection */ > + bank[i] = bank_hash[i] ^ (addr_hash_bank[i].xor_enable & > + (internal_bit_wise_xor(col & addr_hash_bank[i].col_xor) ^ > + internal_bit_wise_xor(row & addr_hash_bank[i].row_xor))); > + } > + } > + > + /* To apply hash on pc bit */ > + pc_hash = (mca_addr >> 25) & 0x1; > + > + /* Read AMD PPR UMC::CH::AddrHashPC register */ > + if (!amd_smn_read(nid, gpu_umc_base + 0xE0, &temp)) { > + addr_hash_pc.xor_enable = temp & 1; > + addr_hash_pc.col_xor = FIELD_GET(GENMASK(13, 1), temp); > + addr_hash_pc.row_xor = FIELD_GET(GENMASK(31, 14), temp); > + } > + /* Read AMD PPR UMC::CH::AddrHashPC2 register*/ > + if (!amd_smn_read(nid, gpu_umc_base + 0xE4, &temp)) > + addr_hash_pc2.bank_xor = FIELD_GET(GENMASK(5, 0), temp); > + > + /* Calculate bank value from bank[0..3], bank[4] and bank[5] */ > + for (i = 0; i < 4; i++) > + rawbank |= (bank[i] & 1) << i; > + > + rawbank |= (mca_addr >> 22) & 0x30; > + > + /* pseudochannel(pc) hash selection */ > + pc = pc_hash ^ (addr_hash_pc.xor_enable & > + (internal_bit_wise_xor(col & addr_hash_pc.col_xor) ^ > + internal_bit_wise_xor(row & addr_hash_pc.row_xor) ^ > + internal_bit_wise_xor(rawbank & addr_hash_pc2.bank_xor))); > + > + /* Mask b'25(pc_bit) and b'[9:6](bank) */ > + mca_addr &= ~0x20003c0ULL; > + > + for (i = 0; i < 4; i++) > + mca_addr |= (bank[i] << (6 + i)); > + > + 
mca_addr |= (pc << 25); > + > + /* NA[4..0] is fixed */ > + mca2na = 0x0; > + /* convert mca error address to normalized address */ > + for (i = 1; i < ARRAY_SIZE(umc_mca2na_mapping); i++) > + mca2na |= ((mca_addr >> i) & 0x1) << umc_mca2na_mapping[i]; > + > + mca_addr = mca2na; > + pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", mca_addr); > + pr_emerg(HW_ERR "Error hit on Bank: %d Row: %d Column: %d\n", rawbank, row, col); > + > + return mca_addr; > +} > + > static void decode_umc_error(int node_id, struct mce *m) > { > u8 ecc_type = (m->status >> 45) & 0x3; > @@ -3115,6 +3268,13 @@ static void decode_umc_error(int node_id, struct mce *m) > pvt->ops->get_err_info(m, &err); > df_inst_id = pvt->ops->get_inst_id(mci, pvt, &err); > > + /* > + * The reported MCA address(Error Addr) is DRAM decoded address which needs to be > + * converted to normalized address before DF address translation. > + */ > + if (pvt->fam == 0x19 && (pvt->model >= 0x90 && pvt->model <= 0x9f)) > + m->addr = umc_ondie_addr_to_normaddr(m->addr, pvt->mc_node_id); > + > if (umc_normaddr_to_sysaddr(m->addr, pvt->mc_node_id, df_inst_id, &sys_addr)) { > err.err_code = ERR_NORM_ADDR; > goto log_error; Same comment as previous patch. Leave this until address translation updates. Furthermore, I'm not sure if overwriting m->addr is still a good idea, since we'd like to keep the original error information for other uses. Thanks, Yazen
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 74b2b47cc22a..304d104c25d8 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3076,6 +3076,159 @@ static void umc_get_err_info(struct mce *m, struct err_info *err) err->csrow = m->synd & 0x7; } +static bool internal_bit_wise_xor(u32 inp) +{ + bool tmp = 0; + + for (int i = 0; i < 32; i++) + tmp = tmp ^ ((inp >> i) & 0x1); + + return tmp; +} + +/* mapping of MCA error address to normalized address */ +static const u8 umc_mca2na_mapping[] = { + 0, 5, 6, 8, 9, 14, 12, 13, + 10, 11, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, + 7, 29, 30, +}; + +/* + * Read AMD PPR UMC::AddrHashBank and + * UMC::CH::AddrHashPC/PC2 register fields + */ +static struct { + u32 xor_enable :1; + u32 col_xor :13; + u32 row_xor :18; +} addr_hash_pc, addr_hash_bank[4]; + +static struct { + u32 bank_xor :6; +} addr_hash_pc2; + +/* + * The location of bank, column and row are fixed. + * location of column bit must be NA[5]. + * Row bits are always placed in a contiguous stretch of NA above the + * column and bank bits. + * Bits below the row bits can be either column or bank in any order, + * with the exception that NA[5] must be a column bit. + * Stack ID(SID) bits are placed in the MSB position of the NA. + */ +static int umc_ondie_addr_to_normaddr(u64 mca_addr, u16 nid) +{ + u32 bank[4], bank_hash[4], pc_hash; + u32 col, row, rawbank = 0, pc; + int i, temp = 0; + u64 mca2na; + + u32 gpu_umc_base = 0x90000; + + /* + * the below calculation, trying to maps ondie error address + * to normalized address. 
logged ondie MCA address format is + * BEQ_MCA_RdDatAddr[27:0] = + * {SID[1:0],PC[0],row[14:0],bank[3:0],col[4:0],1'b0} + * The conversion mappings are: + * + * Normalized location ondie MCA error Address + * =================== ====================== + * NA[4] = 1'b0 + * NA[5] = col[0] = BEQ_MCA_RdDatAddr[1] + * NA[6] = col[1] = BEQ_MCA_RdDatAddr[2] + * NA[8] = col[2] = BEQ_MCA_RdDatAddr[3] + * NA[9] = col[3] = BEQ_MCA_RdDatAddr[4] + * NA[14] = col[4] = BEQ_MCA_RdDatAddr[5] + * NA[12] = bank[0] = BEQ_MCA_RdDatAddr[6] + * NA[13] = bank[1] = BEQ_MCA_RdDatAddr[7] + * NA[10] = bank[2] = BEQ_MCA_RdDatAddr[8] + * NA[11] = bank[3] = BEQ_MCA_RdDatAddr[9] + * + * row low is 12 bit locations, low lsb bit starts from 10 + * NA[15..26] = row[0..11] = BEQ_MCA_RdDatAddr[10..21] + * + * row high is 2 bit locations, high lsb bit starts from 22 + * NA[27..28] = row[12..13] = BEQ_MCA_RdDatAddr[22..23] + * + * NA[7] = PC[0] = BEQ_MCA_RdDatAddr[25] + * NA[29] = sid[0] = bank[4] = BEQ_MCA_RdDatAddr[26] + * NA[30] = sid[1] = bank[5] = BEQ_MCA_RdDatAddr[27] + * Basically, it calculates a locations to fit as shown in + * table umc_mca2na_mapping[]. + * + * XORs need to be applied based on the hash settings below.
+ */ + + /* Calculate column and row */ + col = FIELD_GET(GENMASK(5, 1), mca_addr); + row = FIELD_GET(GENMASK(23, 10), mca_addr); + + /* Apply hashing on below banks for bank calculation */ + for (i = 0; i < 4; i++) + bank_hash[i] = (mca_addr >> (6 + i)) & 0x1; + + /* bank hash algorithm */ + for (i = 0; i < 4; i++) { + /* Read AMD PPR UMC::AddrHashBank register*/ + if (!amd_smn_read(nid, gpu_umc_base + 0xC8 + (i * 4), &temp)) { + addr_hash_bank[i].xor_enable = temp & 1; + addr_hash_bank[i].col_xor = FIELD_GET(GENMASK(13, 1), temp); + addr_hash_bank[i].row_xor = FIELD_GET(GENMASK(31, 14), temp); + /* bank hash selection */ + bank[i] = bank_hash[i] ^ (addr_hash_bank[i].xor_enable & + (internal_bit_wise_xor(col & addr_hash_bank[i].col_xor) ^ + internal_bit_wise_xor(row & addr_hash_bank[i].row_xor))); + } + } + + /* To apply hash on pc bit */ + pc_hash = (mca_addr >> 25) & 0x1; + + /* Read AMD PPR UMC::CH::AddrHashPC register */ + if (!amd_smn_read(nid, gpu_umc_base + 0xE0, &temp)) { + addr_hash_pc.xor_enable = temp & 1; + addr_hash_pc.col_xor = FIELD_GET(GENMASK(13, 1), temp); + addr_hash_pc.row_xor = FIELD_GET(GENMASK(31, 14), temp); + } + /* Read AMD PPR UMC::CH::AddrHashPC2 register*/ + if (!amd_smn_read(nid, gpu_umc_base + 0xE4, &temp)) + addr_hash_pc2.bank_xor = FIELD_GET(GENMASK(5, 0), temp); + + /* Calculate bank value from bank[0..3], bank[4] and bank[5] */ + for (i = 0; i < 4; i++) + rawbank |= (bank[i] & 1) << i; + + rawbank |= (mca_addr >> 22) & 0x30; + + /* pseudochannel(pc) hash selection */ + pc = pc_hash ^ (addr_hash_pc.xor_enable & + (internal_bit_wise_xor(col & addr_hash_pc.col_xor) ^ + internal_bit_wise_xor(row & addr_hash_pc.row_xor) ^ + internal_bit_wise_xor(rawbank & addr_hash_pc2.bank_xor))); + + /* Mask b'25(pc_bit) and b'[9:6](bank) */ + mca_addr &= ~0x20003c0ULL; + + for (i = 0; i < 4; i++) + mca_addr |= (bank[i] << (6 + i)); + + mca_addr |= (pc << 25); + + /* NA[4..0] is fixed */ + mca2na = 0x0; + /* convert mca error address to normalized 
address */ + for (i = 1; i < ARRAY_SIZE(umc_mca2na_mapping); i++) + mca2na |= ((mca_addr >> i) & 0x1) << umc_mca2na_mapping[i]; + + mca_addr = mca2na; + pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", mca_addr); + pr_emerg(HW_ERR "Error hit on Bank: %d Row: %d Column: %d\n", rawbank, row, col); + + return mca_addr; +} + static void decode_umc_error(int node_id, struct mce *m) { u8 ecc_type = (m->status >> 45) & 0x3; @@ -3115,6 +3268,13 @@ static void decode_umc_error(int node_id, struct mce *m) pvt->ops->get_err_info(m, &err); df_inst_id = pvt->ops->get_inst_id(mci, pvt, &err); + /* + * The reported MCA address(Error Addr) is DRAM decoded address which needs to be + * converted to normalized address before DF address translation. + */ + if (pvt->fam == 0x19 && (pvt->model >= 0x90 && pvt->model <= 0x9f)) + m->addr = umc_ondie_addr_to_normaddr(m->addr, pvt->mc_node_id); + if (umc_normaddr_to_sysaddr(m->addr, pvt->mc_node_id, df_inst_id, &sys_addr)) { err.err_code = ERR_NORM_ADDR; goto log_error;