Message ID | 20231025051455.101424-5-muralimk@amd.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Few cleanups and AMD Family 19h Models 90h-9fh EDAC Support | expand |
Hi Boris, On 10/27/2023 8:15 PM, Borislav Petkov wrote: > Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding. > > > On Wed, Oct 25, 2023 at 05:14:55AM +0000, Muralidhara M K wrote: >> Subject: Re: [PATCH v2 4/4] EDAC/amd64: Add Family 19h Models 90h ~ 9fh enumeration support > > "Add support for family 0x19, models 0x90-9f devices" > >> From: Muralidhara M K <muralidhara.mk@amd.com> >> >> AMD Models 90h-9fh are APUs, They have built-in HBM3 memory. > > s/,/./ > >> ECC support is enabled by default. >> >> APU models have a single Data Fabric (DF) per Package. Each DF is >> visible to the OS in the same way as chiplet-based systems like >> Rome and later. However, the Unified Memory Controllers (UMCs) are > > s/Rome/Zen2 CPUs/ > >> arranged in the same way as GPU-based MI200 devices rather than >> CPU-based systems. >> >> So, use the gpu_ops for enumeration and adds a few fixups to >> support enumeration of nodes and memory topology. > > Do not talk about *what* the patch is doing in the commit message - that > should be obvious from the diff itself. Rather, concentrate on the *why* > it needs to be done. > >> Signed-off-by: Muralidhara M K <muralidhara.mk@amd.com> >> --- >> drivers/edac/amd64_edac.c | 69 ++++++++++++++++++++++++++++++++------- >> 1 file changed, 57 insertions(+), 12 deletions(-) >> >> diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c >> index 9b6642d00871..82302393894c 100644 >> --- a/drivers/edac/amd64_edac.c >> +++ b/drivers/edac/amd64_edac.c >> @@ -996,12 +996,19 @@ static struct local_node_map { >> #define LNTM_NODE_COUNT GENMASK(27, 16) >> #define LNTM_BASE_NODE_ID GENMASK(11, 0) >> >> -static int gpu_get_node_map(void) >> +static int gpu_get_node_map(struct amd64_pvt *pvt) >> { >> struct pci_dev *pdev; >> int ret; >> u32 tmp; >> >> + /* Mapping of nodes from hardware-provided AMD Node ID to a Linux logical > > verify_comment_style: WARNING: Multi-line comment needs to start text on the second line: > [+ /* Mapping of nodes from hardware-provided AMD Node ID to a Linux logical] > >> + * one is applicable for MI200 models. >> + * Therefore return early for other heterogeneous systems. >> + */ >> + if (pvt->F3->device != PCI_DEVICE_ID_AMD_MI200_DF_F3) >> + return 0; >> + >> /* >> * Node ID 0 is reserved for CPUs. >> * Therefore, a non-zero Node ID means we've already cached the values. >> @@ -3851,7 +3858,7 @@ static void gpu_init_csrows(struct mem_ctl_info *mci) >> >> dimm->nr_pages = gpu_get_csrow_nr_pages(pvt, umc, cs); >> dimm->edac_mode = EDAC_SECDED; >> - dimm->mtype = MEM_HBM2; >> + dimm->mtype = pvt->dram_type; >> dimm->dtype = DEV_X16; >> dimm->grain = 64; >> } >> @@ -3880,6 +3887,9 @@ static bool gpu_ecc_enabled(struct amd64_pvt *pvt) >> return true; >> } >> >> +/* Base address used for channels selection on MI200 GPUs */ >> +static u32 gpu_umc_base = 0x50000; > > That gpu_umc_base assignment can happen in per_family_init() too. > Sure. will modify >> + >> static inline u32 gpu_get_umc_base(u8 umc, u8 channel) >> { >> /* >> @@ -3893,13 +3903,33 @@ static inline u32 gpu_get_umc_base(u8 umc, u8 channel) >> * On GPU nodes channels are selected in 3rd nibble >> * HBM chX[3:0]= [Y ]5X[3:0]000; >> * HBM chX[7:4]= [Y+1]5X[3:0]000 >> + * >> + * On MI300 APU nodes, same as GPU nodes but channels are selected >> + * in the base address of 0x90000 >> */ >> umc *= 2; >> >> if (channel >= 4) >> umc++; >> >> - return 0x50000 + (umc << 20) + ((channel % 4) << 12); >> + return gpu_umc_base + (umc << 20) + ((channel % 4) << 12); >> +} >> + >> +static void gpu_determine_memory_type(struct amd64_pvt *pvt) >> +{ >> + if (pvt->fam == 0x19) { >> + switch (pvt->model) { >> + case 0x30 ... 0x3F: >> + pvt->dram_type = MEM_HBM2; >> + break; >> + case 0x90 ... 0x9F: >> + pvt->dram_type = MEM_HBM3; >> + break; >> + default: >> + break; >> + } >> + } >> + edac_dbg(1, " MEM type: %s\n", edac_mem_types[pvt->dram_type]); > > That whole assignment can happen in per_family_init() - no need for that > function here. > Thanks. got it >> static void gpu_read_mc_regs(struct amd64_pvt *pvt) >> @@ -3960,7 +3990,7 @@ static int gpu_hw_info_get(struct amd64_pvt *pvt) >> { >> int ret; >> >> - ret = gpu_get_node_map(); >> + ret = gpu_get_node_map(pvt); >> if (ret) >> return ret; >> >> @@ -3971,6 +4001,7 @@ static int gpu_hw_info_get(struct amd64_pvt *pvt) >> gpu_prep_chip_selects(pvt); >> gpu_read_base_mask(pvt); >> gpu_read_mc_regs(pvt); >> + gpu_determine_memory_type(pvt); >> >> return 0; >> } >> @@ -4142,6 +4173,12 @@ static int per_family_init(struct amd64_pvt *pvt) >> pvt->ctl_name = "F19h_M70h"; >> pvt->flags.zn_regs_v2 = 1; >> break; >> + case 0x90 ... 0x9f: >> + pvt->ctl_name = "F19h_M90h"; >> + pvt->max_mcs = 4; >> + gpu_umc_base = 0x90000; >> + pvt->ops = &gpu_ops; >> + break; >> case 0xa0 ... 0xaf: >> pvt->ctl_name = "F19h_MA0h"; >> pvt->max_mcs = 12; >> @@ -4180,23 +4217,31 @@ static const struct attribute_group *amd64_edac_attr_groups[] = { >> NULL >> }; >> >> +/* >> + * For Heterogeneous and APU models EDAC CHIP_SELECT and CHANNEL layers > > s/Heterogeneous/heterogeneous/ > >> + * should be swapped to fit into the layers. >> + */ >> +static unsigned int get_layer_size(struct amd64_pvt *pvt, u8 layer) >> +{ >> + bool is_gpu = (pvt->ops == &gpu_ops); >> + >> + if (!layer) >> + return is_gpu ? pvt->max_mcs : pvt->csels[0].b_cnt; >> + >> + return is_gpu ? pvt->csels[0].b_cnt : pvt->max_mcs; > > Balance that for better readability: > > if (!layer) > return is_gpu ? pvt->max_mcs > : pvt->csels[0].b_cnt; > else > return is_gpu ? pvt->csels[0].b_cnt > : pvt->max_mcs; > > Sure understood. will modify > Thx. > > -- > Regards/Gruss, > Boris. > > https://people.kernel.org/tglx/notes-about-netiquette >
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 9b6642d00871..82302393894c 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -996,12 +996,19 @@ static struct local_node_map { #define LNTM_NODE_COUNT GENMASK(27, 16) #define LNTM_BASE_NODE_ID GENMASK(11, 0) -static int gpu_get_node_map(void) +static int gpu_get_node_map(struct amd64_pvt *pvt) { struct pci_dev *pdev; int ret; u32 tmp; + /* Mapping of nodes from hardware-provided AMD Node ID to a Linux logical + * one is applicable for MI200 models. + * Therefore return early for other heterogeneous systems. + */ + if (pvt->F3->device != PCI_DEVICE_ID_AMD_MI200_DF_F3) + return 0; + /* * Node ID 0 is reserved for CPUs. * Therefore, a non-zero Node ID means we've already cached the values. @@ -3851,7 +3858,7 @@ static void gpu_init_csrows(struct mem_ctl_info *mci) dimm->nr_pages = gpu_get_csrow_nr_pages(pvt, umc, cs); dimm->edac_mode = EDAC_SECDED; - dimm->mtype = MEM_HBM2; + dimm->mtype = pvt->dram_type; dimm->dtype = DEV_X16; dimm->grain = 64; } @@ -3880,6 +3887,9 @@ static bool gpu_ecc_enabled(struct amd64_pvt *pvt) return true; } +/* Base address used for channels selection on MI200 GPUs */ +static u32 gpu_umc_base = 0x50000; + static inline u32 gpu_get_umc_base(u8 umc, u8 channel) { /* @@ -3893,13 +3903,33 @@ static inline u32 gpu_get_umc_base(u8 umc, u8 channel) * On GPU nodes channels are selected in 3rd nibble * HBM chX[3:0]= [Y ]5X[3:0]000; * HBM chX[7:4]= [Y+1]5X[3:0]000 + * + * On MI300 APU nodes, same as GPU nodes but channels are selected + * in the base address of 0x90000 */ umc *= 2; if (channel >= 4) umc++; - return 0x50000 + (umc << 20) + ((channel % 4) << 12); + return gpu_umc_base + (umc << 20) + ((channel % 4) << 12); +} + +static void gpu_determine_memory_type(struct amd64_pvt *pvt) +{ + if (pvt->fam == 0x19) { + switch (pvt->model) { + case 0x30 ... 0x3F: + pvt->dram_type = MEM_HBM2; + break; + case 0x90 ... 0x9F: + pvt->dram_type = MEM_HBM3; + break; + default: + break; + } + } + edac_dbg(1, " MEM type: %s\n", edac_mem_types[pvt->dram_type]); } static void gpu_read_mc_regs(struct amd64_pvt *pvt) @@ -3960,7 +3990,7 @@ static int gpu_hw_info_get(struct amd64_pvt *pvt) { int ret; - ret = gpu_get_node_map(); + ret = gpu_get_node_map(pvt); if (ret) return ret; @@ -3971,6 +4001,7 @@ static int gpu_hw_info_get(struct amd64_pvt *pvt) gpu_prep_chip_selects(pvt); gpu_read_base_mask(pvt); gpu_read_mc_regs(pvt); + gpu_determine_memory_type(pvt); return 0; } @@ -4142,6 +4173,12 @@ static int per_family_init(struct amd64_pvt *pvt) pvt->ctl_name = "F19h_M70h"; pvt->flags.zn_regs_v2 = 1; break; + case 0x90 ... 0x9f: + pvt->ctl_name = "F19h_M90h"; + pvt->max_mcs = 4; + gpu_umc_base = 0x90000; + pvt->ops = &gpu_ops; + break; case 0xa0 ... 0xaf: pvt->ctl_name = "F19h_MA0h"; pvt->max_mcs = 12; @@ -4180,23 +4217,31 @@ static const struct attribute_group *amd64_edac_attr_groups[] = { NULL }; +/* + * For Heterogeneous and APU models EDAC CHIP_SELECT and CHANNEL layers + * should be swapped to fit into the layers. + */ +static unsigned int get_layer_size(struct amd64_pvt *pvt, u8 layer) +{ + bool is_gpu = (pvt->ops == &gpu_ops); + + if (!layer) + return is_gpu ? pvt->max_mcs : pvt->csels[0].b_cnt; + + return is_gpu ? pvt->csels[0].b_cnt : pvt->max_mcs; +} + static int init_one_instance(struct amd64_pvt *pvt) { struct mem_ctl_info *mci = NULL; struct edac_mc_layer layers[2]; int ret = -ENOMEM; - /* - * For Heterogeneous family EDAC CHIP_SELECT and CHANNEL layers should - * be swapped to fit into the layers. - */ layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; - layers[0].size = (pvt->F3->device == PCI_DEVICE_ID_AMD_MI200_DF_F3) ? - pvt->max_mcs : pvt->csels[0].b_cnt; + layers[0].size = get_layer_size(pvt, 0); layers[0].is_virt_csrow = true; layers[1].type = EDAC_MC_LAYER_CHANNEL; - layers[1].size = (pvt->F3->device == PCI_DEVICE_ID_AMD_MI200_DF_F3) ? - pvt->csels[0].b_cnt : pvt->max_mcs; + layers[1].size = get_layer_size(pvt, 1); layers[1].is_virt_csrow = false; mci = edac_mc_alloc(pvt->mc_node_id, ARRAY_SIZE(layers), layers, 0);