diff mbox series

EDAC/{i10nm,skx,skx_common}: Support multiple clumps

Message ID 20241205165954.7957-1-kyle.meyer@hpe.com (mailing list archive)
State New
Headers show
Series EDAC/{i10nm,skx,skx_common}: Support multiple clumps | expand

Commit Message

Kyle Meyer Dec. 5, 2024, 4:59 p.m. UTC
The 3-bit source IDs in PCI configuration space registers are limited to
8 unique IDs, and each ID is local to a clump (UPI/QPI domain).

Source IDs can not be used to map devices to sockets on systems with
multiple clumps because each clump has identical repeating source IDs.

Get package IDs instead of source IDs on systems with multiple clumps
and use package/source IDs to name IMC information structures.

Signed-off-by: Kyle Meyer <kyle.meyer@hpe.com>
---
 drivers/edac/i10nm_base.c | 21 +++++++++-------
 drivers/edac/skx_base.c   | 19 ++++++++------
 drivers/edac/skx_common.c | 52 +++++++++++++++++++++++++++++++++------
 drivers/edac/skx_common.h |  5 ++--
 4 files changed, 71 insertions(+), 26 deletions(-)

Comments

Luck, Tony Dec. 5, 2024, 7:13 p.m. UTC | #1
On Thu, Dec 05, 2024 at 10:59:54AM -0600, Kyle Meyer wrote:
> The 3-bit source IDs in PCI configuration space registers are limited to
> 8 unique IDs, and each ID is local to a clump (UPI/QPI domain).

Is there any better name than "clump"?
> 
> Source IDs can not be used to map devices to sockets on systems with
> multiple clumps because each clump has identical repeating source IDs.
> 
> Get package IDs instead of source IDs on systems with multiple clumps
> and use package/source IDs to name IMC information structures.

What would happen if you just assumed the system had clumps and you
always used package/source? Would that change EDAC naming for
existing systems? That would be less complexity for the driver.

> Signed-off-by: Kyle Meyer <kyle.meyer@hpe.com>
> ---
>  drivers/edac/i10nm_base.c | 21 +++++++++-------
>  drivers/edac/skx_base.c   | 19 ++++++++------
>  drivers/edac/skx_common.c | 52 +++++++++++++++++++++++++++++++++------
>  drivers/edac/skx_common.h |  5 ++--
>  4 files changed, 71 insertions(+), 26 deletions(-)
> 
> diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
> index 51556c72a967..59384677d025 100644
> --- a/drivers/edac/i10nm_base.c
> +++ b/drivers/edac/i10nm_base.c
> @@ -1010,7 +1010,7 @@ static struct notifier_block i10nm_mce_dec = {
>  
>  static int __init i10nm_init(void)
>  {
> -	u8 mc = 0, src_id = 0, node_id = 0;
> +	u8 mc = 0, src_id = 0;
>  	const struct x86_cpu_id *id;
>  	struct res_config *cfg;
>  	const char *owner;
> @@ -1018,6 +1018,7 @@ static int __init i10nm_init(void)
>  	int rc, i, off[3] = {0xd0, 0xc8, 0xcc};
>  	u64 tolm, tohm;
>  	int imc_num;
> +	int dup_src_ids = 0;
>  
>  	edac_dbg(2, "\n");
>  
> @@ -1065,24 +1066,26 @@ static int __init i10nm_init(void)
>  
>  	imc_num = res_cfg->ddr_imc_num + res_cfg->hbm_imc_num;
>  
> -	list_for_each_entry(d, i10nm_edac_list, list) {
> -		rc = skx_get_src_id(d, 0xf8, &src_id);
> -		if (rc < 0)
> -			goto fail;
> +	rc = dup_src_ids = skx_check_dup_src_ids(0xf8);

Checkpatch complains about this: "multiple assignments should be
avoided"

> +	if (rc < 0)
> +		goto fail;
>  
> -		rc = skx_get_node_id(d, &node_id);
> +	list_for_each_entry(d, i10nm_edac_list, list) {
> +		if (dup_src_ids)
> +			rc = skx_get_pkg_id(d, &src_id);
> +		else
> +			rc = skx_get_src_id(d, 0xf8, &src_id);
>  		if (rc < 0)
>  			goto fail;
>  
> -		edac_dbg(2, "src_id = %d node_id = %d\n", src_id, node_id);
> +		edac_dbg(2, "src_id = %d\n", src_id);
>  		for (i = 0; i < imc_num; i++) {
>  			if (!d->imc[i].mdev)
>  				continue;
>  
>  			d->imc[i].mc  = mc++;
>  			d->imc[i].lmc = i;
> -			d->imc[i].src_id  = src_id;
> -			d->imc[i].node_id = node_id;
> +			d->imc[i].src_id = src_id;
>  			if (d->imc[i].hbm_mc) {
>  				d->imc[i].chan_mmio_sz = cfg->hbm_chan_mmio_sz;
>  				d->imc[i].num_channels = cfg->hbm_chan_num;
> diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c
> index 14cfd394b469..189b8c5a1bda 100644
> --- a/drivers/edac/skx_base.c
> +++ b/drivers/edac/skx_base.c
> @@ -600,8 +600,9 @@ static int __init skx_init(void)
>  	const struct munit *m;
>  	const char *owner;
>  	int rc = 0, i, off[3] = {0xd0, 0xd4, 0xd8};
> -	u8 mc = 0, src_id, node_id;
> +	u8 mc = 0, src_id;
>  	struct skx_dev *d;
> +	int dup_src_ids = 0;
>  
>  	edac_dbg(2, "\n");
>  
> @@ -646,19 +647,23 @@ static int __init skx_init(void)
>  		}
>  	}
>  
> +	rc = dup_src_ids = skx_check_dup_src_ids(0xf0);

Checkpatch complains about this: "multiple assignments should be
avoided"

-Tony
Kyle Meyer Dec. 5, 2024, 8:05 p.m. UTC | #2
On Thu, Dec 05, 2024 at 11:13:23AM -0800, Luck, Tony wrote:
> On Thu, Dec 05, 2024 at 10:59:54AM -0600, Kyle Meyer wrote:
> > The 3-bit source IDs in PCI configuration space registers are limited to
> > 8 unique IDs, and each ID is local to a clump (UPI/QPI domain).
> 
> Is there any better name than "clump"?

Yes, a UPI/QPI domain.

> > Source IDs can not be used to map devices to sockets on systems with
> > multiple clumps because each clump has identical repeating source IDs.
> > 
> > Get package IDs instead of source IDs on systems with multiple clumps
> > and use package/source IDs to name IMC information structures.
> 
> What would happen if you just assumed the system had clumps and you
> always used package/source? Would that change EDAC naming for
> existing systems? That would be less complexity for the driver.

That works if NUMA is enabled. skx_get_pkg_id() uses NUMA information to
determine the package ID.
 
> > Signed-off-by: Kyle Meyer <kyle.meyer@hpe.com>
> > ---
> >  drivers/edac/i10nm_base.c | 21 +++++++++-------
> >  drivers/edac/skx_base.c   | 19 ++++++++------
> >  drivers/edac/skx_common.c | 52 +++++++++++++++++++++++++++++++++------
> >  drivers/edac/skx_common.h |  5 ++--
> >  4 files changed, 71 insertions(+), 26 deletions(-)
> > 
> > diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
> > index 51556c72a967..59384677d025 100644
> > --- a/drivers/edac/i10nm_base.c
> > +++ b/drivers/edac/i10nm_base.c
> > @@ -1010,7 +1010,7 @@ static struct notifier_block i10nm_mce_dec = {
> >  
> >  static int __init i10nm_init(void)
> >  {
> > -	u8 mc = 0, src_id = 0, node_id = 0;
> > +	u8 mc = 0, src_id = 0;
> >  	const struct x86_cpu_id *id;
> >  	struct res_config *cfg;
> >  	const char *owner;
> > @@ -1018,6 +1018,7 @@ static int __init i10nm_init(void)
> >  	int rc, i, off[3] = {0xd0, 0xc8, 0xcc};
> >  	u64 tolm, tohm;
> >  	int imc_num;
> > +	int dup_src_ids = 0;
> >  
> >  	edac_dbg(2, "\n");
> >  
> > @@ -1065,24 +1066,26 @@ static int __init i10nm_init(void)
> >  
> >  	imc_num = res_cfg->ddr_imc_num + res_cfg->hbm_imc_num;
> >  
> > -	list_for_each_entry(d, i10nm_edac_list, list) {
> > -		rc = skx_get_src_id(d, 0xf8, &src_id);
> > -		if (rc < 0)
> > -			goto fail;
> > +	rc = dup_src_ids = skx_check_dup_src_ids(0xf8);
> 
> Checkpatch complains about this: "multiple assignments should be
> avoided"
> 
> > +	if (rc < 0)
> > +		goto fail;
> >  
> > -		rc = skx_get_node_id(d, &node_id);
> > +	list_for_each_entry(d, i10nm_edac_list, list) {
> > +		if (dup_src_ids)
> > +			rc = skx_get_pkg_id(d, &src_id);
> > +		else
> > +			rc = skx_get_src_id(d, 0xf8, &src_id);
> >  		if (rc < 0)
> >  			goto fail;
> >  
> > -		edac_dbg(2, "src_id = %d node_id = %d\n", src_id, node_id);
> > +		edac_dbg(2, "src_id = %d\n", src_id);
> >  		for (i = 0; i < imc_num; i++) {
> >  			if (!d->imc[i].mdev)
> >  				continue;
> >  
> >  			d->imc[i].mc  = mc++;
> >  			d->imc[i].lmc = i;
> > -			d->imc[i].src_id  = src_id;
> > -			d->imc[i].node_id = node_id;
> > +			d->imc[i].src_id = src_id;
> >  			if (d->imc[i].hbm_mc) {
> >  				d->imc[i].chan_mmio_sz = cfg->hbm_chan_mmio_sz;
> >  				d->imc[i].num_channels = cfg->hbm_chan_num;
> > diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c
> > index 14cfd394b469..189b8c5a1bda 100644
> > --- a/drivers/edac/skx_base.c
> > +++ b/drivers/edac/skx_base.c
> > @@ -600,8 +600,9 @@ static int __init skx_init(void)
> >  	const struct munit *m;
> >  	const char *owner;
> >  	int rc = 0, i, off[3] = {0xd0, 0xd4, 0xd8};
> > -	u8 mc = 0, src_id, node_id;
> > +	u8 mc = 0, src_id;
> >  	struct skx_dev *d;
> > +	int dup_src_ids = 0;
> >  
> >  	edac_dbg(2, "\n");
> >  
> > @@ -646,19 +647,23 @@ static int __init skx_init(void)
> >  		}
> >  	}
> >  
> > +	rc = dup_src_ids = skx_check_dup_src_ids(0xf0);
> 
> Checkpatch complains about this: "multiple assignments should be
> avoided"

That's strange, my scripts/checkpatch.pl didn't complain. I'll avoid that going
forward.

Thanks,
Kyle Meyer
Luck, Tony Dec. 5, 2024, 10:52 p.m. UTC | #3
On Thu, Dec 05, 2024 at 02:05:36PM -0600, Kyle Meyer wrote:
> On Thu, Dec 05, 2024 at 11:13:23AM -0800, Luck, Tony wrote:
> > On Thu, Dec 05, 2024 at 10:59:54AM -0600, Kyle Meyer wrote:
> > > The 3-bit source IDs in PCI configuration space registers are limited to
> > > 8 unique IDs, and each ID is local to a clump (UPI/QPI domain).
> > 
> > Is there any better name than "clump"?
> 
> Yes, a UPI/QPI domain.

Ok, let's use that.

> > > Source IDs can not be used to map devices to sockets on systems with
> > > multiple clumps because each clump has identical repeating source IDs.
> > > 
> > > Get package IDs instead of source IDs on systems with multiple clumps
> > > and use package/source IDs to name IMC information structures.
> > 
> > What would happen if you just assumed the system had clumps and you
> > always used package/source? Would that change EDAC naming for
> > existing systems? That would be less complexity for the driver.
> 
> That works if NUMA is enabled. skx_get_pkg_id() uses NUMA information to
> determine the package ID.

I cobbled together this patch on top of yours. I think it is
functionally equivalent. The #ifdef CONFIG_NUMA in the ".c"
file is ugly, but serves to illustrate what I'm trying to do
without too much cleanup noise.

1) Does this work? I tried on a non-clumpy system that is NUMA.

2) Is it better (assuming #fidef factored off into a .h file)?


-Tony

---

diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h
index 0f06d45c9b3e..b0845bdd4516 100644
--- a/drivers/edac/skx_common.h
+++ b/drivers/edac/skx_common.h
@@ -244,8 +244,6 @@ void skx_set_mem_cfg(bool mem_cfg_2lm);
 void skx_set_res_cfg(struct res_config *cfg);
 
 int skx_get_src_id(struct skx_dev *d, int off, u8 *id);
-int skx_check_dup_src_ids(int off);
-int skx_get_pkg_id(struct skx_dev *d, u8 *id);
 
 int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list);
 
diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
index 59384677d025..16d1110c0692 100644
--- a/drivers/edac/i10nm_base.c
+++ b/drivers/edac/i10nm_base.c
@@ -1018,7 +1018,6 @@ static int __init i10nm_init(void)
 	int rc, i, off[3] = {0xd0, 0xc8, 0xcc};
 	u64 tolm, tohm;
 	int imc_num;
-	int dup_src_ids = 0;
 
 	edac_dbg(2, "\n");
 
@@ -1066,15 +1065,8 @@ static int __init i10nm_init(void)
 
 	imc_num = res_cfg->ddr_imc_num + res_cfg->hbm_imc_num;
 
-	rc = dup_src_ids = skx_check_dup_src_ids(0xf8);
-	if (rc < 0)
-		goto fail;
-
 	list_for_each_entry(d, i10nm_edac_list, list) {
-		if (dup_src_ids)
-			rc = skx_get_pkg_id(d, &src_id);
-		else
-			rc = skx_get_src_id(d, 0xf8, &src_id);
+		rc = skx_get_src_id(d, 0xf8, &src_id);
 		if (rc < 0)
 			goto fail;
 
diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c
index 189b8c5a1bda..93f7c05faccc 100644
--- a/drivers/edac/skx_base.c
+++ b/drivers/edac/skx_base.c
@@ -602,7 +602,6 @@ static int __init skx_init(void)
 	int rc = 0, i, off[3] = {0xd0, 0xd4, 0xd8};
 	u8 mc = 0, src_id;
 	struct skx_dev *d;
-	int dup_src_ids = 0;
 
 	edac_dbg(2, "\n");
 
@@ -647,15 +646,8 @@ static int __init skx_init(void)
 		}
 	}
 
-	rc = dup_src_ids = skx_check_dup_src_ids(0xf0);
-	if (rc < 0)
-		goto fail;
-
 	list_for_each_entry(d, skx_edac_list, list) {
-		if (dup_src_ids)
-			rc = skx_get_pkg_id(d, &src_id);
-		else
-			rc = skx_get_src_id(d, 0xf0, &src_id);
+		rc = skx_get_src_id(d, 0xf0, &src_id);
 		if (rc < 0)
 			goto fail;
 
diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c
index 56fec7310f40..414a1e6cf0f5 100644
--- a/drivers/edac/skx_common.c
+++ b/drivers/edac/skx_common.c
@@ -221,49 +221,7 @@ void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log)
 }
 EXPORT_SYMBOL_GPL(skx_set_decode);
 
-int skx_get_src_id(struct skx_dev *d, int off, u8 *id)
-{
-	u32 reg;
-
-	if (pci_read_config_dword(d->util_all, off, &reg)) {
-		skx_printk(KERN_ERR, "Failed to read src id\n");
-		return -ENODEV;
-	}
-
-	*id = GET_BITFIELD(reg, 12, 14);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(skx_get_src_id);
-
-int skx_check_dup_src_ids(int off)
-{
-	u8 id;
-	struct skx_dev *d;
-	int rc;
-	DECLARE_BITMAP(id_map, 8);
-
-	bitmap_zero(id_map, 8);
-
-	/*
-	 * The 3-bit source IDs in PCI configuration space registers are limited
-	 * to 8 unique IDs, and each ID is local to a clump (UPI/QPI domain).
-	 */
-	list_for_each_entry(d, &dev_edac_list, list) {
-		rc = skx_get_src_id(d, off, &id);
-		if (rc < 0)
-			return rc;
-
-		if (test_bit(id, id_map))
-			return 1;
-
-		set_bit(id, id_map);
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(skx_check_dup_src_ids);
-
-int skx_get_pkg_id(struct skx_dev *d, u8 *id)
+static int skx_get_pkg_id(struct skx_dev *d, u8 *id)
 {
 	int node;
 	int cpu;
@@ -283,7 +241,24 @@ int skx_get_pkg_id(struct skx_dev *d, u8 *id)
 	skx_printk(KERN_ERR, "Failed to get package ID from NUMA information\n");
 	return -ENODEV;
 }
-EXPORT_SYMBOL_GPL(skx_get_pkg_id);
+
+int skx_get_src_id(struct skx_dev *d, int off, u8 *id)
+{
+#ifdef CONFIG_NUMA
+	return skx_get_pkg_id(d, id);
+#else
+	u32 reg;
+
+	if (pci_read_config_dword(d->util_all, off, &reg)) {
+		skx_printk(KERN_ERR, "Failed to read src id\n");
+		return -ENODEV;
+	}
+
+	*id = GET_BITFIELD(reg, 12, 14);
+	return 0;
+#endif
+}
+EXPORT_SYMBOL_GPL(skx_get_src_id);
 
 static int get_width(u32 mtr)
 {
Luck, Tony Dec. 5, 2024, 11:57 p.m. UTC | #4
> +int skx_get_src_id(struct skx_dev *d, int off, u8 *id)
> +{
> +#ifdef CONFIG_NUMA
> +	return skx_get_pkg_id(d, id);
> +#else
> +	u32 reg;
> +
> +	if (pci_read_config_dword(d->util_all, off, &reg)) {
> +		skx_printk(KERN_ERR, "Failed to read src id\n");
> +		return -ENODEV;
> +	}
> +
> +	*id = GET_BITFIELD(reg, 12, 14);
> +	return 0;
> +#endif

Doh ... I alwasy forget about IS_ENABLED(). This can be written:


int skx_get_src_id(struct skx_dev *d, int off, u8 *id)
{
	u32 reg;

	if (IS_ENABLED(CONFIG_NUMA))
		return skx_get_pkg_id(d, id);

	if (pci_read_config_dword(d->util_all, off, &reg)) {
		skx_printk(KERN_ERR, "Failed to read src id\n");
		return -ENODEV;
	}

	*id = GET_BITFIELD(reg, 12, 14);
	return 0;
}

-Tony
Kyle Meyer Dec. 6, 2024, 12:57 a.m. UTC | #5
On Thu, Dec 05, 2024 at 03:57:55PM -0800, Luck, Tony wrote:
> > +int skx_get_src_id(struct skx_dev *d, int off, u8 *id)
> > +{
> > +#ifdef CONFIG_NUMA
> > +	return skx_get_pkg_id(d, id);
> > +#else
> > +	u32 reg;
> > +
> > +	if (pci_read_config_dword(d->util_all, off, &reg)) {
> > +		skx_printk(KERN_ERR, "Failed to read src id\n");
> > +		return -ENODEV;
> > +	}
> > +
> > +	*id = GET_BITFIELD(reg, 12, 14);
> > +	return 0;
> > +#endif
> 
> Doh ... I alwasy forget about IS_ENABLED(). This can be written:
> 
> 
> int skx_get_src_id(struct skx_dev *d, int off, u8 *id)
> {
> 	u32 reg;
> 
> 	if (IS_ENABLED(CONFIG_NUMA))
> 		return skx_get_pkg_id(d, id);
> 
> 	if (pci_read_config_dword(d->util_all, off, &reg)) {
> 		skx_printk(KERN_ERR, "Failed to read src id\n");
> 		return -ENODEV;
> 	}
> 
> 	*id = GET_BITFIELD(reg, 12, 14);
> 	return 0;
> }

Looks good.

> 1) Does this work? I tried on a non-clumpy system that is NUMA.

Yes, I just tested this on a Sapphire Rapids system with multiple UPI domains.

> 2) Is it better (assuming #fidef factored off into a .h file)?

IMO, yes, but there's one subtle difference. EDAC will not load on systems
that have a single UPI domain when CONFIG_NUMA is enabled but numa=off, because
pcibus_to_node() in skx_get_pkg_id() will return NUMA_NO_NODE (-1). Is that a
case that we need to worry about?

Thanks,
Kyle Meyer
Zhuo, Qiuxu Dec. 6, 2024, 1:26 a.m. UTC | #6
> From: Kyle Meyer <kyle.meyer@hpe.com>
> Sent: Friday, December 6, 2024 8:57 AM
> To: Luck, Tony <tony.luck@intel.com>
> Cc: bp@alien8.de; james.morse@arm.com; mchehab@kernel.org;
> rric@kernel.org; linux-edac@vger.kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH] EDAC/{i10nm,skx,skx_common}: Support multiple
> clumps
> 
> On Thu, Dec 05, 2024 at 03:57:55PM -0800, Luck, Tony wrote:
> > > +int skx_get_src_id(struct skx_dev *d, int off, u8 *id) { #ifdef
> > > +CONFIG_NUMA
> > > +	return skx_get_pkg_id(d, id);
> > > +#else
> > > +	u32 reg;
> > > +
> > > +	if (pci_read_config_dword(d->util_all, off, &reg)) {
> > > +		skx_printk(KERN_ERR, "Failed to read src id\n");
> > > +		return -ENODEV;
> > > +	}
> > > +
> > > +	*id = GET_BITFIELD(reg, 12, 14);
> > > +	return 0;
> > > +#endif
> >
> > Doh ... I alwasy forget about IS_ENABLED(). This can be written:
> >
> >
> > int skx_get_src_id(struct skx_dev *d, int off, u8 *id) {
> > 	u32 reg;
> >
> > 	if (IS_ENABLED(CONFIG_NUMA))
> > 		return skx_get_pkg_id(d, id);
> >
> > 	if (pci_read_config_dword(d->util_all, off, &reg)) {
> > 		skx_printk(KERN_ERR, "Failed to read src id\n");
> > 		return -ENODEV;
> > 	}
> >
> > 	*id = GET_BITFIELD(reg, 12, 14);
> > 	return 0;
> > }
> 
> Looks good.
> 
> > 1) Does this work? I tried on a non-clumpy system that is NUMA.
> 
> Yes, I just tested this on a Sapphire Rapids system with multiple UPI domains.
> 
> > 2) Is it better (assuming #fidef factored off into a .h file)?
> 
> IMO, yes, but there's one subtle difference. EDAC will not load on systems that
> have a single UPI domain when CONFIG_NUMA is enabled but numa=off,
> because
> pcibus_to_node() in skx_get_pkg_id() will return NUMA_NO_NODE (-1). Is
> that a case that we need to worry about?

I think we need to make the EDAC load/work even in this case. 
Regardless of CONFIG_NUMA or whether numa=off is set or not, could we do it like this:

int skx_get_src_id(struct skx_dev *d, int off, u8 *id)
{
        u32 reg;

        if (!skx_get_pkg_id(d, id))
                return 0;

        if (pci_read_config_dword(d->util_all, off, &reg)) {
                skx_printk(KERN_ERR, "Failed to read src id\n");
                return -ENODEV;
        }

        *id = GET_BITFIELD(reg, 12, 14);
        return 0;
}

-Qiuxu
Kyle Meyer Dec. 6, 2024, 2:33 a.m. UTC | #7
On Fri, Dec 06, 2024 at 01:26:12AM +0000, Zhuo, Qiuxu wrote:
> > From: Kyle Meyer <kyle.meyer@hpe.com>
> > Sent: Friday, December 6, 2024 8:57 AM
> > To: Luck, Tony <tony.luck@intel.com>
> > Cc: bp@alien8.de; james.morse@arm.com; mchehab@kernel.org;
> > rric@kernel.org; linux-edac@vger.kernel.org; linux-kernel@vger.kernel.org
> > Subject: Re: [PATCH] EDAC/{i10nm,skx,skx_common}: Support multiple
> > clumps
> > 
> > On Thu, Dec 05, 2024 at 03:57:55PM -0800, Luck, Tony wrote:
> > > > +int skx_get_src_id(struct skx_dev *d, int off, u8 *id) { #ifdef
> > > > +CONFIG_NUMA
> > > > +	return skx_get_pkg_id(d, id);
> > > > +#else
> > > > +	u32 reg;
> > > > +
> > > > +	if (pci_read_config_dword(d->util_all, off, &reg)) {
> > > > +		skx_printk(KERN_ERR, "Failed to read src id\n");
> > > > +		return -ENODEV;
> > > > +	}
> > > > +
> > > > +	*id = GET_BITFIELD(reg, 12, 14);
> > > > +	return 0;
> > > > +#endif
> > >
> > > Doh ... I alwasy forget about IS_ENABLED(). This can be written:
> > >
> > >
> > > int skx_get_src_id(struct skx_dev *d, int off, u8 *id) {
> > > 	u32 reg;
> > >
> > > 	if (IS_ENABLED(CONFIG_NUMA))
> > > 		return skx_get_pkg_id(d, id);
> > >
> > > 	if (pci_read_config_dword(d->util_all, off, &reg)) {
> > > 		skx_printk(KERN_ERR, "Failed to read src id\n");
> > > 		return -ENODEV;
> > > 	}
> > >
> > > 	*id = GET_BITFIELD(reg, 12, 14);
> > > 	return 0;
> > > }
> > 
> > Looks good.
> > 
> > > 1) Does this work? I tried on a non-clumpy system that is NUMA.
> > 
> > Yes, I just tested this on a Sapphire Rapids system with multiple UPI domains.
> > 
> > > 2) Is it better (assuming #fidef factored off into a .h file)?
> > 
> > IMO, yes, but there's one subtle difference. EDAC will not load on systems that
> > have a single UPI domain when CONFIG_NUMA is enabled but numa=off,
> > because
> > pcibus_to_node() in skx_get_pkg_id() will return NUMA_NO_NODE (-1). Is
> > that a case that we need to worry about?
> 
> I think we need to make the EDAC load/work even in this case. 
> Regardless of CONFIG_NUMA or whether numa=off is set or not, could we do it like this:
> 
> int skx_get_src_id(struct skx_dev *d, int off, u8 *id)
> {
>         u32 reg;
> 
>         if (!skx_get_pkg_id(d, id))
>                 return 0;
> 
>         if (pci_read_config_dword(d->util_all, off, &reg)) {
>                 skx_printk(KERN_ERR, "Failed to read src id\n");
>                 return -ENODEV;
>         }
> 
>         *id = GET_BITFIELD(reg, 12, 14);
>         return 0;
> }

So, we're back to the original issue on systems with multiple UPI/QPI domains
when NUMA is disabled.

Systems with multiple UPI/QPI domains can't use source IDs to map devices
to sockets. skx_get_src_id() will successfully read the source ID from PCI
configuration space registers but it might not map to the correct socket because
each UPI/QPI domain has identical repeating source IDs.

For example, 8 sockets with 2 UPI/QPI domains:

Socket 0 -> Source ID 0
Socket 1 -> Source ID 1
Socket 2 -> Source ID 2
Socket 3 -> Source ID 3
Socket 4 -> Source ID 0
Socket 5 -> Source ID 1
Socket 6 -> Source ID 2
Socket 7 -> Source ID 3

EDAC will successfully load, but it will not find the the corresponding device
for errors on sockets 4 though 7 (for example, see skx_common.c:178).

Looking at my original patch, EDAC will not load when a system has multiple UPI/
QPI domains and NUMA is disabled. We fail early with "Failed to get package ID
from NUMA information" instead of later when an error occurs.

Thanks,
Kyle Meyer
Luck, Tony Dec. 6, 2024, 9:24 p.m. UTC | #8
> So, we're back to the original issue on systems with multiple UPI/QPI domains
> when NUMA is disabled.
>
> Systems with multiple UPI/QPI domains can't use source IDs to map devices
> to sockets. skx_get_src_id() will successfully read the source ID from PCI
> configuration space registers but it might not map to the correct socket because
> each UPI/QPI domain has identical repeating source IDs.
>
> For example, 8 sockets with 2 UPI/QPI domains:
>
> Socket 0 -> Source ID 0
> Socket 1 -> Source ID 1
> Socket 2 -> Source ID 2
> Socket 3 -> Source ID 3
> Socket 4 -> Source ID 0
> Socket 5 -> Source ID 1
> Socket 6 -> Source ID 2
> Socket 7 -> Source ID 3
>
> EDAC will successfully load, but it will not find the the corresponding device
> for errors on sockets 4 though 7 (for example, see skx_common.c:178).
>
> Looking at my original patch, EDAC will not load when a system has multiple UPI/
> QPI domains and NUMA is disabled. We fail early with "Failed to get package ID
> from NUMA information" instead of later when an error occurs.

Looks like there are four, mostly orthogonal, configuration possibilities:

1) System: Clumps vs. no-clumps
2) BIOS: Option to pick UMA vs. NUMA description to OS
3) Kernel configuration: CONFIG_NUMA=y vs. "not set"
4) Kernel x86 boot param: numa=off (or noacpi,nohmat) vs. no option

Maybe not all permutations make sense? '2' and '4' may be effectively the same?

Can this be solved for systems that have clumps, but Linux is ignoring NUMA-ness?

-Tony
Luck, Tony Dec. 6, 2024, 10:09 p.m. UTC | #9
[+Bjorn]

What we need here is a function that maps from a PCIe device to a CPU socket.

Has this problem been encountered before? Is there an existing solution?

-Tony
Bjorn Helgaas Dec. 10, 2024, 4:37 p.m. UTC | #10
On Fri, Dec 06, 2024 at 10:09:23PM +0000, Luck, Tony wrote:
> What we need here is a function that maps from a PCIe device to a CPU socket.
> 
> Has this problem been encountered before? Is there an existing solution?

There's nothing in PCI itself that connects a device to a CPU.  It
sounds like something that might fit with an ACPI NUMA description,
e.g., if a CPU and a PCI host bridge had the same ACPI _PXM value, you
could conclude that the devices below the host bridge are close to the
CPU.

Bjorn
Luck, Tony Dec. 10, 2024, 5:50 p.m. UTC | #11
> > What we need here is a function that maps from a PCIe device to a CPU socket.
> >
> > Has this problem been encountered before? Is there an existing solution?
>
> There's nothing in PCI itself that connects a device to a CPU.  It
> sounds like something that might fit with an ACPI NUMA description,
> e.g., if a CPU and a PCI host bridge had the same ACPI _PXM value, you
> could conclude that the devices below the host bridge are close to the
> CPU.

Bjorn,

Thanks for looking. Kyle already has code that does an ACPI NUMA lookup.

But that doesn't work on system where Linux is compiled with CONFIG_NUMA=n,
booted with numa=off, or on a system where BIOS option for "Unified memory mode"
has been selected.

-Tony
diff mbox series

Patch

diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
index 51556c72a967..59384677d025 100644
--- a/drivers/edac/i10nm_base.c
+++ b/drivers/edac/i10nm_base.c
@@ -1010,7 +1010,7 @@  static struct notifier_block i10nm_mce_dec = {
 
 static int __init i10nm_init(void)
 {
-	u8 mc = 0, src_id = 0, node_id = 0;
+	u8 mc = 0, src_id = 0;
 	const struct x86_cpu_id *id;
 	struct res_config *cfg;
 	const char *owner;
@@ -1018,6 +1018,7 @@  static int __init i10nm_init(void)
 	int rc, i, off[3] = {0xd0, 0xc8, 0xcc};
 	u64 tolm, tohm;
 	int imc_num;
+	int dup_src_ids = 0;
 
 	edac_dbg(2, "\n");
 
@@ -1065,24 +1066,26 @@  static int __init i10nm_init(void)
 
 	imc_num = res_cfg->ddr_imc_num + res_cfg->hbm_imc_num;
 
-	list_for_each_entry(d, i10nm_edac_list, list) {
-		rc = skx_get_src_id(d, 0xf8, &src_id);
-		if (rc < 0)
-			goto fail;
+	rc = dup_src_ids = skx_check_dup_src_ids(0xf8);
+	if (rc < 0)
+		goto fail;
 
-		rc = skx_get_node_id(d, &node_id);
+	list_for_each_entry(d, i10nm_edac_list, list) {
+		if (dup_src_ids)
+			rc = skx_get_pkg_id(d, &src_id);
+		else
+			rc = skx_get_src_id(d, 0xf8, &src_id);
 		if (rc < 0)
 			goto fail;
 
-		edac_dbg(2, "src_id = %d node_id = %d\n", src_id, node_id);
+		edac_dbg(2, "src_id = %d\n", src_id);
 		for (i = 0; i < imc_num; i++) {
 			if (!d->imc[i].mdev)
 				continue;
 
 			d->imc[i].mc  = mc++;
 			d->imc[i].lmc = i;
-			d->imc[i].src_id  = src_id;
-			d->imc[i].node_id = node_id;
+			d->imc[i].src_id = src_id;
 			if (d->imc[i].hbm_mc) {
 				d->imc[i].chan_mmio_sz = cfg->hbm_chan_mmio_sz;
 				d->imc[i].num_channels = cfg->hbm_chan_num;
diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c
index 14cfd394b469..189b8c5a1bda 100644
--- a/drivers/edac/skx_base.c
+++ b/drivers/edac/skx_base.c
@@ -600,8 +600,9 @@  static int __init skx_init(void)
 	const struct munit *m;
 	const char *owner;
 	int rc = 0, i, off[3] = {0xd0, 0xd4, 0xd8};
-	u8 mc = 0, src_id, node_id;
+	u8 mc = 0, src_id;
 	struct skx_dev *d;
+	int dup_src_ids = 0;
 
 	edac_dbg(2, "\n");
 
@@ -646,19 +647,23 @@  static int __init skx_init(void)
 		}
 	}
 
+	rc = dup_src_ids = skx_check_dup_src_ids(0xf0);
+	if (rc < 0)
+		goto fail;
+
 	list_for_each_entry(d, skx_edac_list, list) {
-		rc = skx_get_src_id(d, 0xf0, &src_id);
-		if (rc < 0)
-			goto fail;
-		rc = skx_get_node_id(d, &node_id);
+		if (dup_src_ids)
+			rc = skx_get_pkg_id(d, &src_id);
+		else
+			rc = skx_get_src_id(d, 0xf0, &src_id);
 		if (rc < 0)
 			goto fail;
-		edac_dbg(2, "src_id=%d node_id=%d\n", src_id, node_id);
+
+		edac_dbg(2, "src_id = %d\n", src_id);
 		for (i = 0; i < SKX_NUM_IMC; i++) {
 			d->imc[i].mc = mc++;
 			d->imc[i].lmc = i;
 			d->imc[i].src_id = src_id;
-			d->imc[i].node_id = node_id;
 			rc = skx_register_mci(&d->imc[i], d->imc[i].chan[0].cdev,
 					      "Skylake Socket", EDAC_MOD_STR,
 					      skx_get_dimm_config, cfg);
diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c
index 6cf17af7d911..56fec7310f40 100644
--- a/drivers/edac/skx_common.c
+++ b/drivers/edac/skx_common.c
@@ -235,19 +235,55 @@  int skx_get_src_id(struct skx_dev *d, int off, u8 *id)
 }
 EXPORT_SYMBOL_GPL(skx_get_src_id);
 
-int skx_get_node_id(struct skx_dev *d, u8 *id)
+int skx_check_dup_src_ids(int off)
 {
-	u32 reg;
+	u8 id;
+	struct skx_dev *d;
+	int rc;
+	DECLARE_BITMAP(id_map, 8);
 
-	if (pci_read_config_dword(d->util_all, 0xf4, &reg)) {
-		skx_printk(KERN_ERR, "Failed to read node id\n");
-		return -ENODEV;
+	bitmap_zero(id_map, 8);
+
+	/*
+	 * The 3-bit source IDs in PCI configuration space registers are limited
+	 * to 8 unique IDs, and each ID is local to a clump (UPI/QPI domain).
+	 */
+	list_for_each_entry(d, &dev_edac_list, list) {
+		rc = skx_get_src_id(d, off, &id);
+		if (rc < 0)
+			return rc;
+
+		if (test_bit(id, id_map))
+			return 1;
+
+		set_bit(id, id_map);
 	}
 
-	*id = GET_BITFIELD(reg, 0, 2);
 	return 0;
 }
-EXPORT_SYMBOL_GPL(skx_get_node_id);
+EXPORT_SYMBOL_GPL(skx_check_dup_src_ids);
+
+int skx_get_pkg_id(struct skx_dev *d, u8 *id)
+{
+	int node;
+	int cpu;
+
+	node = pcibus_to_node(d->util_all->bus);
+	if (numa_valid_node(node)) {
+		for_each_cpu(cpu, cpumask_of_pcibus(d->util_all->bus)) {
+			struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+			if (c->initialized && cpu_to_node(cpu) == node) {
+				*id = c->topo.pkg_id;
+				return 0;
+			}
+		}
+	}
+
+	skx_printk(KERN_ERR, "Failed to get package ID from NUMA information\n");
+	return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(skx_get_pkg_id);
 
 static int get_width(u32 mtr)
 {
@@ -507,7 +543,7 @@  int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev,
 	pvt->imc = imc;
 
 	mci->ctl_name = kasprintf(GFP_KERNEL, "%s#%d IMC#%d", ctl_name,
-				  imc->node_id, imc->lmc);
+				  imc->src_id, imc->lmc);
 	if (!mci->ctl_name) {
 		rc = -ENOMEM;
 		goto fail0;
diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h
index 54bba8a62f72..0f06d45c9b3e 100644
--- a/drivers/edac/skx_common.h
+++ b/drivers/edac/skx_common.h
@@ -103,7 +103,7 @@  struct skx_dev {
 		bool hbm_mc;
 		u8 mc;	/* system wide mc# */
 		u8 lmc;	/* socket relative mc# */
-		u8 src_id, node_id;
+		u8 src_id;
 		struct skx_channel {
 			struct pci_dev	*cdev;
 			struct pci_dev	*edev;
@@ -244,7 +244,8 @@  void skx_set_mem_cfg(bool mem_cfg_2lm);
 void skx_set_res_cfg(struct res_config *cfg);
 
 int skx_get_src_id(struct skx_dev *d, int off, u8 *id);
-int skx_get_node_id(struct skx_dev *d, u8 *id);
+int skx_check_dup_src_ids(int off);
+int skx_get_pkg_id(struct skx_dev *d, u8 *id);
 
 int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list);