Message ID | 20210806074350.114614-2-nchatrad@amd.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | x86/edac/amd64: Add support for noncpu nodes | expand |
On Fri, Aug 06, 2021 at 01:13:48PM +0530, Naveen Krishna Chatradhi wrote: > From: Muralidhara M K <muralimk@amd.com> > > On newer systems the CPUs manage MCA errors reported from the GPUs. > Enumerate the GPU nodes with the AMD NB framework to support EDAC. > > This patch adds necessary code to manage the Aldebaran nodes along with > the CPU nodes. > > The GPU nodes are enumerated in sequential order based on the > PCI hierarchy, and the first GPU node is assumed to have an "AMD Node > ID" value of 8 (the second GPU node has 9, etc.). Each Aldebaran GPU > package has 2 Data Fabrics, which are enumerated as 2 nodes. > With this implementation detail, the Data Fabric on the GPU nodes can be > accessed the same way as the Data Fabric on CPU nodes. > > Signed-off-by: Muralidhara M K <muralimk@amd.com> > Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com> > --- > Changes since v1: > 1. Modified the commit message and comments in the code > 2. Squashed patch 1/7: "x86/amd_nb: Add Aldebaran device to PCI IDs" It's nice to have a link or links to previous patches here. For example, https://lkml.kernel.org/r/<Message-ID> > > arch/x86/include/asm/amd_nb.h | 10 ++++++ > arch/x86/kernel/amd_nb.c | 63 ++++++++++++++++++++++++++++++++--- > include/linux/pci_ids.h | 1 + > 3 files changed, 69 insertions(+), 5 deletions(-) > > diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h > index 00d1a400b7a1..f15247422992 100644 > --- a/arch/x86/include/asm/amd_nb.h > +++ b/arch/x86/include/asm/amd_nb.h > @@ -79,6 +79,16 @@ struct amd_northbridge_info { > > #ifdef CONFIG_AMD_NB > > +/* > + * On newer heterogeneous systems the data fabrics of the CPUs and GPUs > + * are connected directly via custom links, like is done with > + * 2 socket CPU systems and also within a socket for Multi-chip Module > + * (MCM) CPUs like Naples. > + * The first GPU node (non-CPU) is assumed to have an "AMD Node ID" value > + * of 8 (the second GPU node has 9, etc.). 
> + */ > +#define NONCPU_NODE_INDEX 8 > + > u16 amd_nb_num(void); > bool amd_nb_has_feature(unsigned int feature); > struct amd_northbridge *node_to_amd_nb(int node); > diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c > index 5884dfa619ff..5597135a18b5 100644 > --- a/arch/x86/kernel/amd_nb.c > +++ b/arch/x86/kernel/amd_nb.c > @@ -26,6 +26,8 @@ > #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444 > #define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654 > #define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e > +#define PCI_DEVICE_ID_AMD_ALDEBARAN_ROOT 0x14bb > +#define PCI_DEVICE_ID_AMD_ALDEBARAN_DF_F4 0x14d4 > > /* Protect the PCI config register pairs used for SMN. */ > static DEFINE_MUTEX(smn_mutex); > @@ -94,6 +96,21 @@ static const struct pci_device_id hygon_nb_link_ids[] = { > {} > }; > > +static const struct pci_device_id amd_noncpu_root_ids[] = { > + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_ALDEBARAN_ROOT) }, > + {} > +}; > + > +static const struct pci_device_id amd_noncpu_nb_misc_ids[] = { > + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_ALDEBARAN_DF_F3) }, > + {} > +}; > + > +static const struct pci_device_id amd_noncpu_nb_link_ids[] = { > + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_ALDEBARAN_DF_F4) }, > + {} > +}; > + > const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = { > { 0x00, 0x18, 0x20 }, > { 0xff, 0x00, 0x20 }, > @@ -182,11 +199,16 @@ int amd_cache_northbridges(void) > const struct pci_device_id *misc_ids = amd_nb_misc_ids; > const struct pci_device_id *link_ids = amd_nb_link_ids; > const struct pci_device_id *root_ids = amd_root_ids; > + > + const struct pci_device_id *noncpu_misc_ids = amd_noncpu_nb_misc_ids; > + const struct pci_device_id *noncpu_link_ids = amd_noncpu_nb_link_ids; > + const struct pci_device_id *noncpu_root_ids = amd_noncpu_root_ids; > + > struct pci_dev *root, *misc, *link; > struct amd_northbridge *nb; > u16 roots_per_misc = 0; > - u16 misc_count = 0; > - u16 root_count = 0; > + u16 
misc_count = 0, misc_count_noncpu = 0; > + u16 root_count = 0, root_count_noncpu = 0; > u16 i, j; > > if (amd_northbridges.num) > @@ -205,10 +227,16 @@ int amd_cache_northbridges(void) > if (!misc_count) > return -ENODEV; > > + while ((misc = next_northbridge(misc, noncpu_misc_ids)) != NULL) > + misc_count_noncpu++; > + > root = NULL; > while ((root = next_northbridge(root, root_ids)) != NULL) > root_count++; > > + while ((root = next_northbridge(root, noncpu_root_ids)) != NULL) > + root_count_noncpu++; > + > if (root_count) { > roots_per_misc = root_count / misc_count; > > @@ -222,15 +250,28 @@ int amd_cache_northbridges(void) > } > } > > - nb = kcalloc(misc_count, sizeof(struct amd_northbridge), GFP_KERNEL); > + if (misc_count_noncpu) { > + /* > + * The first non-CPU Node ID starts at 8 even if there are fewer > + * than 8 CPU nodes. To maintain the AMD Node ID to Linux amd_nb > + * indexing scheme, allocate the number of GPU nodes plus 8. > + * Some allocated amd_northbridge structures will go unused when > + * the number of CPU nodes is less than 8, but this tradeoff is to > + * keep things relatively simple. 
> + */ > + amd_northbridges.num = NONCPU_NODE_INDEX + misc_count_noncpu; > + } else { > + amd_northbridges.num = misc_count; > + } > + > + nb = kcalloc(amd_northbridges.num, sizeof(struct amd_northbridge), GFP_KERNEL); > if (!nb) > return -ENOMEM; > > amd_northbridges.nb = nb; > - amd_northbridges.num = misc_count; > > link = misc = root = NULL; > - for (i = 0; i < amd_northbridges.num; i++) { > + for (i = 0; i < misc_count; i++) { > node_to_amd_nb(i)->root = root = > next_northbridge(root, root_ids); > node_to_amd_nb(i)->misc = misc = > @@ -251,6 +292,18 @@ int amd_cache_northbridges(void) > root = next_northbridge(root, root_ids); > } > > + if (misc_count_noncpu) { > + link = misc = root = NULL; > + for (i = NONCPU_NODE_INDEX; i < NONCPU_NODE_INDEX + misc_count_noncpu; i++) { > + node_to_amd_nb(i)->root = root = > + next_northbridge(root, noncpu_root_ids); > + node_to_amd_nb(i)->misc = misc = > + next_northbridge(misc, noncpu_misc_ids); > + node_to_amd_nb(i)->link = link = > + next_northbridge(link, noncpu_link_ids); > + } > + } > + > if (amd_gart_present()) > amd_northbridges.flags |= AMD_NB_GART; > > diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h > index 4bac1831de80..d9aae90dfce9 100644 > --- a/include/linux/pci_ids.h > +++ b/include/linux/pci_ids.h > @@ -554,6 +554,7 @@ > #define PCI_DEVICE_ID_AMD_17H_M30H_DF_F3 0x1493 > #define PCI_DEVICE_ID_AMD_17H_M60H_DF_F3 0x144b > #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F3 0x1443 > +#define PCI_DEVICE_ID_AMD_ALDEBARAN_DF_F3 0x14d3 > #define PCI_DEVICE_ID_AMD_19H_DF_F3 0x1653 > #define PCI_DEVICE_ID_AMD_19H_M50H_DF_F3 0x166d > #define PCI_DEVICE_ID_AMD_CNB17H_F3 0x1703 > -- Reviewed-by: Yazen Ghannam <yazen.ghannam@amd.com> Thanks, Yazen
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 00d1a400b7a1..f15247422992 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -79,6 +79,16 @@ struct amd_northbridge_info { #ifdef CONFIG_AMD_NB +/* + * On newer heterogeneous systems the data fabrics of the CPUs and GPUs + * are connected directly via custom links, like is done with + * 2 socket CPU systems and also within a socket for Multi-chip Module + * (MCM) CPUs like Naples. + * The first GPU node (non-CPU) is assumed to have an "AMD Node ID" value + * of 8 (the second GPU node has 9, etc.). + */ +#define NONCPU_NODE_INDEX 8 + u16 amd_nb_num(void); bool amd_nb_has_feature(unsigned int feature); struct amd_northbridge *node_to_amd_nb(int node); diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 5884dfa619ff..5597135a18b5 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -26,6 +26,8 @@ #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444 #define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654 #define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e +#define PCI_DEVICE_ID_AMD_ALDEBARAN_ROOT 0x14bb +#define PCI_DEVICE_ID_AMD_ALDEBARAN_DF_F4 0x14d4 /* Protect the PCI config register pairs used for SMN. 
*/ static DEFINE_MUTEX(smn_mutex); @@ -94,6 +96,21 @@ static const struct pci_device_id hygon_nb_link_ids[] = { {} }; +static const struct pci_device_id amd_noncpu_root_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_ALDEBARAN_ROOT) }, + {} +}; + +static const struct pci_device_id amd_noncpu_nb_misc_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_ALDEBARAN_DF_F3) }, + {} +}; + +static const struct pci_device_id amd_noncpu_nb_link_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_ALDEBARAN_DF_F4) }, + {} +}; + const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = { { 0x00, 0x18, 0x20 }, { 0xff, 0x00, 0x20 }, @@ -182,11 +199,16 @@ int amd_cache_northbridges(void) const struct pci_device_id *misc_ids = amd_nb_misc_ids; const struct pci_device_id *link_ids = amd_nb_link_ids; const struct pci_device_id *root_ids = amd_root_ids; + + const struct pci_device_id *noncpu_misc_ids = amd_noncpu_nb_misc_ids; + const struct pci_device_id *noncpu_link_ids = amd_noncpu_nb_link_ids; + const struct pci_device_id *noncpu_root_ids = amd_noncpu_root_ids; + struct pci_dev *root, *misc, *link; struct amd_northbridge *nb; u16 roots_per_misc = 0; - u16 misc_count = 0; - u16 root_count = 0; + u16 misc_count = 0, misc_count_noncpu = 0; + u16 root_count = 0, root_count_noncpu = 0; u16 i, j; if (amd_northbridges.num) @@ -205,10 +227,16 @@ int amd_cache_northbridges(void) if (!misc_count) return -ENODEV; + while ((misc = next_northbridge(misc, noncpu_misc_ids)) != NULL) + misc_count_noncpu++; + root = NULL; while ((root = next_northbridge(root, root_ids)) != NULL) root_count++; + while ((root = next_northbridge(root, noncpu_root_ids)) != NULL) + root_count_noncpu++; + if (root_count) { roots_per_misc = root_count / misc_count; @@ -222,15 +250,28 @@ int amd_cache_northbridges(void) } } - nb = kcalloc(misc_count, sizeof(struct amd_northbridge), GFP_KERNEL); + if (misc_count_noncpu) { + /* + * The first non-CPU Node ID starts at 8 even if 
there are fewer + * than 8 CPU nodes. To maintain the AMD Node ID to Linux amd_nb + * indexing scheme, allocate the number of GPU nodes plus 8. + * Some allocated amd_northbridge structures will go unused when + * the number of CPU nodes is less than 8, but this tradeoff is to + * keep things relatively simple. + */ + amd_northbridges.num = NONCPU_NODE_INDEX + misc_count_noncpu; + } else { + amd_northbridges.num = misc_count; + } + + nb = kcalloc(amd_northbridges.num, sizeof(struct amd_northbridge), GFP_KERNEL); if (!nb) return -ENOMEM; amd_northbridges.nb = nb; - amd_northbridges.num = misc_count; link = misc = root = NULL; - for (i = 0; i < amd_northbridges.num; i++) { + for (i = 0; i < misc_count; i++) { node_to_amd_nb(i)->root = root = next_northbridge(root, root_ids); node_to_amd_nb(i)->misc = misc = @@ -251,6 +292,18 @@ int amd_cache_northbridges(void) root = next_northbridge(root, root_ids); } + if (misc_count_noncpu) { + link = misc = root = NULL; + for (i = NONCPU_NODE_INDEX; i < NONCPU_NODE_INDEX + misc_count_noncpu; i++) { + node_to_amd_nb(i)->root = root = + next_northbridge(root, noncpu_root_ids); + node_to_amd_nb(i)->misc = misc = + next_northbridge(misc, noncpu_misc_ids); + node_to_amd_nb(i)->link = link = + next_northbridge(link, noncpu_link_ids); + } + } + if (amd_gart_present()) amd_northbridges.flags |= AMD_NB_GART; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 4bac1831de80..d9aae90dfce9 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -554,6 +554,7 @@ #define PCI_DEVICE_ID_AMD_17H_M30H_DF_F3 0x1493 #define PCI_DEVICE_ID_AMD_17H_M60H_DF_F3 0x144b #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F3 0x1443 +#define PCI_DEVICE_ID_AMD_ALDEBARAN_DF_F3 0x14d3 #define PCI_DEVICE_ID_AMD_19H_DF_F3 0x1653 #define PCI_DEVICE_ID_AMD_19H_M50H_DF_F3 0x166d #define PCI_DEVICE_ID_AMD_CNB17H_F3 0x1703