diff mbox series

ACPI: NUMA: Add a node and memblk for each CFMWS not in SRAT

Message ID 20211009015339.400383-1-alison.schofield@intel.com
State Superseded
Headers show
Series ACPI: NUMA: Add a node and memblk for each CFMWS not in SRAT | expand

Commit Message

Alison Schofield Oct. 9, 2021, 1:53 a.m. UTC
From: Alison Schofield <alison.schofield@intel.com>

During NUMA init, CXL memory defined in the SRAT Memory Affinity
subtable may be assigned to a NUMA node. Since there is no
requirement that the SRAT be comprehensive for CXL memory another
mechanism is needed to assign NUMA nodes to CXL memory not identified
in the SRAT.

Use the CXL Fixed Memory Window Structure's (CFMWS) of the ACPI CXL
Early Discovery Table (CEDT) to find all CXL memory ranges. Create a
NUMA node for each range that is not already assigned to a NUMA node.
Add a memblk attaching its host physical address range to the node.

Note that these ranges may not actually map any memory at boot time.
They may describe persistent capacity or may be present to enable
hot-plug.

Consumers can use phys_to_target_node() to discover the NUMA node.

Signed-off-by: Alison Schofield <alison.schofield@intel.com>
---
 drivers/acpi/numa/srat.c | 58 ++++++++++++++++++++++++++++++++++++++++
 drivers/cxl/acpi.c       |  8 +++---
 2 files changed, 63 insertions(+), 3 deletions(-)

Comments

Alison Schofield Oct. 9, 2021, 2 a.m. UTC | #1
On Fri, Oct 08, 2021 at 06:53:39PM -0700, alison.schofield@intel.com wrote:
> From: Alison Schofield <alison.schofield@intel.com>
> 
> During NUMA init, CXL memory defined in the SRAT Memory Affinity
> subtable may be assigned to a NUMA node. Since there is no
> requirement that the SRAT be comprehensive for CXL memory another
> mechanism is needed to assign NUMA nodes to CXL memory not identified
> in the SRAT.
> 
> Use the CXL Fixed Memory Window Structure's (CFMWS) of the ACPI CXL
> Early Discovery Table (CEDT) to find all CXL memory ranges. Create a
> NUMA node for each range that is not already assigned to a NUMA node.
> Add a memblk attaching its host physical address range to the node.
> 
> Note that these ranges may not actually map any memory at boot time.
> They may describe persistent capacity or may be present to enable
> hot-plug.
> 
> Consumers can use phys_to_target_node() to discover the NUMA node.
> 
> Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> ---
>  drivers/acpi/numa/srat.c | 58 ++++++++++++++++++++++++++++++++++++++++
>  drivers/cxl/acpi.c       |  8 +++---
>  2 files changed, 63 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c
> index b8795fc49097..568e033e6c3f 100644
> --- a/drivers/acpi/numa/srat.c
> +++ b/drivers/acpi/numa/srat.c
> @@ -300,6 +300,61 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
>  }
>  #endif /* defined(CONFIG_X86) || defined (CONFIG_ARM64) */
>  
> +/* Add a NUMA node and memblk for each node-less CFMWS */
> +static int __init acpi_parse_cfmws(struct acpi_table_header *acpi_cedt)
> +{
> +	struct acpi_cedt_cfmws *cfmws;
> +	acpi_size len, cur = 0;
> +	void *cedt_subtable;
> +	int i, pxm, node;
> +	u64 start, end;
> +
> +	/* Use fake PXM values starting after the max PXM found in the SRAT */
> +	for (i = 0; i < MAX_PXM_DOMAINS - 1; i++)

s/MAX_PXM_DOMAINS/MAX_NUMNODES
I'll fix that in a v2, but will await feedback before reving.

Alison

> +		if (node_to_pxm_map[i] > pxm)
> +			pxm = node_to_pxm_map[i];
> +	pxm++;
> +
> +	len = acpi_cedt->length - sizeof(*acpi_cedt);
> +	cedt_subtable = acpi_cedt + 1;
> +
> +	while (cur < len) {
> +		struct acpi_cedt_header *c = cedt_subtable + cur;
> +
> +		if (c->type != ACPI_CEDT_TYPE_CFMWS)
> +			goto next;
> +
> +		cfmws = cedt_subtable + cur;
> +		if (cfmws->header.length < sizeof(*cfmws)) {
> +			pr_warn_once("CFMWS entry skipped:invalid length:%u\n",
> +				     cfmws->header.length);
> +			goto next;
> +		}
> +
> +		start = cfmws->base_hpa;
> +		end = cfmws->base_hpa + cfmws->window_size;
> +
> +		/* Skip if the HPA is already assigned to a NUMA node */
> +		node = phys_to_target_node(start);
> +		if (node != NUMA_NO_NODE)
> +			goto next;
> +
> +		node = acpi_map_pxm_to_node(pxm);
> +		if (node == NUMA_NO_NODE) {
> +			pr_err("ACPI NUMA: Too many proximity domains.\n");
> +			return -EINVAL;
> +		}
> +		if (numa_add_memblk(node, start, end) < 0) {
> +			pr_warn("ACPI NUMA: Failed to add memblk for CFMWS node %d [mem %#llx-%#llx]\n",
> +				node, start, end);
> +		}
> +		pxm++;
> +next:
> +		cur += c->length;
> +	}
> +	return 0;
> +}
> +
>  static int __init acpi_parse_slit(struct acpi_table_header *table)
>  {
>  	struct acpi_table_slit *slit = (struct acpi_table_slit *)table;
> @@ -478,6 +533,9 @@ int __init acpi_numa_init(void)
>  	/* SLIT: System Locality Information Table */
>  	acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit);
>  
> +	/* CEDT: CXL Early Discovery Table */
> +	acpi_table_parse(ACPI_SIG_CEDT, acpi_parse_cfmws);
> +
>  	if (cnt < 0)
>  		return cnt;
>  	else if (!parsed_numa_memblks)
> diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
> index dadc7f64b9ff..3798841c3418 100644
> --- a/drivers/cxl/acpi.c
> +++ b/drivers/cxl/acpi.c
> @@ -144,9 +144,11 @@ static void cxl_add_cfmws_decoders(struct device *dev,
>  				cfmws->window_size - 1);
>  			goto next;
>  		}
> -		dev_dbg(dev, "add: %s range %#llx-%#llx\n",
> -			dev_name(&cxld->dev), cfmws->base_hpa,
> -			cfmws->base_hpa + cfmws->window_size - 1);
> +		dev_dbg(dev, "add:%s node:%d range:%#llx-%#llx restrict:%#x\n",
> +			dev_name(&cxld->dev),
> +			phys_to_target_node(cxld->range.start),
> +			cxld->range.start, cxld->range.end,
> +			cfmws->restrictions);
>  next:
>  		cur += c->length;
>  	}
> -- 
> 2.31.1
>
kernel test robot Oct. 9, 2021, 3:56 a.m. UTC | #2
Hi,

I love your patch! Yet something to improve:

[auto build test ERROR on next-20211008]
[cannot apply to rafael-pm/linux-next linus/master v5.15-rc4 v5.15-rc3 v5.15-rc2 v5.15-rc4]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/alison-schofield-intel-com/ACPI-NUMA-Add-a-node-and-memblk-for-each-CFMWS-not-in-SRAT/20211009-094738
base:    683f29b781aeaab6bf302eeb2ef08a5e5f9d8a27
config: ia64-defconfig (attached as .config)
compiler: ia64-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/82ab0c41adad985aa21693dda86949d509ae14e4
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review alison-schofield-intel-com/ACPI-NUMA-Add-a-node-and-memblk-for-each-CFMWS-not-in-SRAT/20211009-094738
        git checkout 82ab0c41adad985aa21693dda86949d509ae14e4
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross ARCH=ia64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   drivers/acpi/numa/srat.c: In function 'acpi_parse_cfmws':
>> drivers/acpi/numa/srat.c:347:21: error: implicit declaration of function 'numa_add_memblk' [-Werror=implicit-function-declaration]
     347 |                 if (numa_add_memblk(node, start, end) < 0) {
         |                     ^~~~~~~~~~~~~~~
   cc1: some warnings being treated as errors


vim +/numa_add_memblk +347 drivers/acpi/numa/srat.c

   302	
   303	/* Add a NUMA node and memblk for each node-less CFMWS */
   304	static int __init acpi_parse_cfmws(struct acpi_table_header *acpi_cedt)
   305	{
   306		struct acpi_cedt_cfmws *cfmws;
   307		acpi_size len, cur = 0;
   308		void *cedt_subtable;
   309		int i, pxm, node;
   310		u64 start, end;
   311	
   312		/* Use fake PXM values starting after the max PXM found in the SRAT */
   313		for (i = 0; i < MAX_PXM_DOMAINS - 1; i++)
   314			if (node_to_pxm_map[i] > pxm)
   315				pxm = node_to_pxm_map[i];
   316		pxm++;
   317	
   318		len = acpi_cedt->length - sizeof(*acpi_cedt);
   319		cedt_subtable = acpi_cedt + 1;
   320	
   321		while (cur < len) {
   322			struct acpi_cedt_header *c = cedt_subtable + cur;
   323	
   324			if (c->type != ACPI_CEDT_TYPE_CFMWS)
   325				goto next;
   326	
   327			cfmws = cedt_subtable + cur;
   328			if (cfmws->header.length < sizeof(*cfmws)) {
   329				pr_warn_once("CFMWS entry skipped:invalid length:%u\n",
   330					     cfmws->header.length);
   331				goto next;
   332			}
   333	
   334			start = cfmws->base_hpa;
   335			end = cfmws->base_hpa + cfmws->window_size;
   336	
   337			/* Skip if the HPA is already assigned to a NUMA node */
   338			node = phys_to_target_node(start);
   339			if (node != NUMA_NO_NODE)
   340				goto next;
   341	
   342			node = acpi_map_pxm_to_node(pxm);
   343			if (node == NUMA_NO_NODE) {
   344				pr_err("ACPI NUMA: Too many proximity domains.\n");
   345				return -EINVAL;
   346			}
 > 347			if (numa_add_memblk(node, start, end) < 0) {
   348				pr_warn("ACPI NUMA: Failed to add memblk for CFMWS node %d [mem %#llx-%#llx]\n",
   349					node, start, end);
   350			}
   351			pxm++;
   352	next:
   353			cur += c->length;
   354		}
   355		return 0;
   356	}
   357	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
kernel test robot Oct. 9, 2021, 1:23 p.m. UTC | #3
Hi,

I love your patch! Perhaps something to improve:

[auto build test WARNING on next-20211008]
[cannot apply to rafael-pm/linux-next linus/master v5.15-rc4 v5.15-rc3 v5.15-rc2 v5.15-rc4]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/alison-schofield-intel-com/ACPI-NUMA-Add-a-node-and-memblk-for-each-CFMWS-not-in-SRAT/20211009-094738
base:    683f29b781aeaab6bf302eeb2ef08a5e5f9d8a27
config: arm64-defconfig (attached as .config)
compiler: aarch64-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/82ab0c41adad985aa21693dda86949d509ae14e4
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review alison-schofield-intel-com/ACPI-NUMA-Add-a-node-and-memblk-for-each-CFMWS-not-in-SRAT/20211009-094738
        git checkout 82ab0c41adad985aa21693dda86949d509ae14e4
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross ARCH=arm64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   drivers/acpi/numa/srat.c: In function 'acpi_parse_cfmws':
>> drivers/acpi/numa/srat.c:314:36: warning: iteration 16 invokes undefined behavior [-Waggressive-loop-optimizations]
     314 |                 if (node_to_pxm_map[i] > pxm)
         |                     ~~~~~~~~~~~~~~~^~~
   drivers/acpi/numa/srat.c:313:23: note: within this loop
     313 |         for (i = 0; i < MAX_PXM_DOMAINS - 1; i++)
         |                     ~~^~~~~~~~~~~~~~~~~~~~~


vim +314 drivers/acpi/numa/srat.c

   302	
   303	/* Add a NUMA node and memblk for each node-less CFMWS */
   304	static int __init acpi_parse_cfmws(struct acpi_table_header *acpi_cedt)
   305	{
   306		struct acpi_cedt_cfmws *cfmws;
   307		acpi_size len, cur = 0;
   308		void *cedt_subtable;
   309		int i, pxm, node;
   310		u64 start, end;
   311	
   312		/* Use fake PXM values starting after the max PXM found in the SRAT */
   313		for (i = 0; i < MAX_PXM_DOMAINS - 1; i++)
 > 314			if (node_to_pxm_map[i] > pxm)
   315				pxm = node_to_pxm_map[i];
   316		pxm++;
   317	
   318		len = acpi_cedt->length - sizeof(*acpi_cedt);
   319		cedt_subtable = acpi_cedt + 1;
   320	
   321		while (cur < len) {
   322			struct acpi_cedt_header *c = cedt_subtable + cur;
   323	
   324			if (c->type != ACPI_CEDT_TYPE_CFMWS)
   325				goto next;
   326	
   327			cfmws = cedt_subtable + cur;
   328			if (cfmws->header.length < sizeof(*cfmws)) {
   329				pr_warn_once("CFMWS entry skipped:invalid length:%u\n",
   330					     cfmws->header.length);
   331				goto next;
   332			}
   333	
   334			start = cfmws->base_hpa;
   335			end = cfmws->base_hpa + cfmws->window_size;
   336	
   337			/* Skip if the HPA is already assigned to a NUMA node */
   338			node = phys_to_target_node(start);
   339			if (node != NUMA_NO_NODE)
   340				goto next;
   341	
   342			node = acpi_map_pxm_to_node(pxm);
   343			if (node == NUMA_NO_NODE) {
   344				pr_err("ACPI NUMA: Too many proximity domains.\n");
   345				return -EINVAL;
   346			}
   347			if (numa_add_memblk(node, start, end) < 0) {
   348				pr_warn("ACPI NUMA: Failed to add memblk for CFMWS node %d [mem %#llx-%#llx]\n",
   349					node, start, end);
   350			}
   351			pxm++;
   352	next:
   353			cur += c->length;
   354		}
   355		return 0;
   356	}
   357	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Ira Weiny Oct. 11, 2021, 5:13 p.m. UTC | #4
On Fri, Oct 08, 2021 at 06:53:39PM -0700, Schofield, Alison wrote:
> From: Alison Schofield <alison.schofield@intel.com>
> 
> During NUMA init, CXL memory defined in the SRAT Memory Affinity
> subtable may be assigned to a NUMA node. Since there is no
> requirement that the SRAT be comprehensive for CXL memory another
> mechanism is needed to assign NUMA nodes to CXL memory not identified
> in the SRAT.
> 
> Use the CXL Fixed Memory Window Structure's (CFMWS) of the ACPI CXL
> Early Discovery Table (CEDT) to find all CXL memory ranges. Create a
> NUMA node for each range that is not already assigned to a NUMA node.
> Add a memblk attaching its host physical address range to the node.
> 
> Note that these ranges may not actually map any memory at boot time.
> They may describe persistent capacity or may be present to enable
> hot-plug.
> 
> Consumers can use phys_to_target_node() to discover the NUMA node.
> 
> Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> ---
>  drivers/acpi/numa/srat.c | 58 ++++++++++++++++++++++++++++++++++++++++
>  drivers/cxl/acpi.c       |  8 +++---
>  2 files changed, 63 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c
> index b8795fc49097..568e033e6c3f 100644
> --- a/drivers/acpi/numa/srat.c
> +++ b/drivers/acpi/numa/srat.c
> @@ -300,6 +300,61 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
>  }
>  #endif /* defined(CONFIG_X86) || defined (CONFIG_ARM64) */
>  
> +/* Add a NUMA node and memblk for each node-less CFMWS */

IMO this comment does not make sense for a function called 'parse cfmws'.  I
would just drop it.

> +static int __init acpi_parse_cfmws(struct acpi_table_header *acpi_cedt)
> +{
> +	struct acpi_cedt_cfmws *cfmws;
> +	acpi_size len, cur = 0;
> +	void *cedt_subtable;
> +	int i, pxm, node;
> +	u64 start, end;
> +
> +	/* Use fake PXM values starting after the max PXM found in the SRAT */
> +	for (i = 0; i < MAX_PXM_DOMAINS - 1; i++)
> +		if (node_to_pxm_map[i] > pxm)
> +			pxm = node_to_pxm_map[i];
> +	pxm++;
> +
> +	len = acpi_cedt->length - sizeof(*acpi_cedt);
> +	cedt_subtable = acpi_cedt + 1;
> +
> +	while (cur < len) {
> +		struct acpi_cedt_header *c = cedt_subtable + cur;
> +
> +		if (c->type != ACPI_CEDT_TYPE_CFMWS)
> +			goto next;
> +
> +		cfmws = cedt_subtable + cur;
> +		if (cfmws->header.length < sizeof(*cfmws)) {
> +			pr_warn_once("CFMWS entry skipped:invalid length:%u\n",
> +				     cfmws->header.length);
> +			goto next;
> +		}
> +
> +		start = cfmws->base_hpa;
> +		end = cfmws->base_hpa + cfmws->window_size;
> +
> +		/* Skip if the HPA is already assigned to a NUMA node */
> +		node = phys_to_target_node(start);
> +		if (node != NUMA_NO_NODE)
> +			goto next;
> +
> +		node = acpi_map_pxm_to_node(pxm);
> +		if (node == NUMA_NO_NODE) {
> +			pr_err("ACPI NUMA: Too many proximity domains.\n");
> +			return -EINVAL;
> +		}
> +		if (numa_add_memblk(node, start, end) < 0) {
> +			pr_warn("ACPI NUMA: Failed to add memblk for CFMWS node %d [mem %#llx-%#llx]\n",
> +				node, start, end);

Is there anything which could go wrong if the block is not added to the numa
node?

Ira

> +		}
> +		pxm++;
> +next:
> +		cur += c->length;
> +	}
> +	return 0;
> +}
> +
>  static int __init acpi_parse_slit(struct acpi_table_header *table)
>  {
>  	struct acpi_table_slit *slit = (struct acpi_table_slit *)table;
> @@ -478,6 +533,9 @@ int __init acpi_numa_init(void)
>  	/* SLIT: System Locality Information Table */
>  	acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit);
>  
> +	/* CEDT: CXL Early Discovery Table */
> +	acpi_table_parse(ACPI_SIG_CEDT, acpi_parse_cfmws);
> +
>  	if (cnt < 0)
>  		return cnt;
>  	else if (!parsed_numa_memblks)
> diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
> index dadc7f64b9ff..3798841c3418 100644
> --- a/drivers/cxl/acpi.c
> +++ b/drivers/cxl/acpi.c
> @@ -144,9 +144,11 @@ static void cxl_add_cfmws_decoders(struct device *dev,
>  				cfmws->window_size - 1);
>  			goto next;
>  		}
> -		dev_dbg(dev, "add: %s range %#llx-%#llx\n",
> -			dev_name(&cxld->dev), cfmws->base_hpa,
> -			cfmws->base_hpa + cfmws->window_size - 1);
> +		dev_dbg(dev, "add:%s node:%d range:%#llx-%#llx restrict:%#x\n",
> +			dev_name(&cxld->dev),
> +			phys_to_target_node(cxld->range.start),
> +			cxld->range.start, cxld->range.end,
> +			cfmws->restrictions);
>  next:
>  		cur += c->length;
>  	}
> -- 
> 2.31.1
>
Alison Schofield Oct. 11, 2021, 10 p.m. UTC | #5
Thanks for the review Ira!

On Mon, Oct 11, 2021 at 10:13:16AM -0700, Ira Weiny wrote:
> On Fri, Oct 08, 2021 at 06:53:39PM -0700, Schofield, Alison wrote:
> > From: Alison Schofield <alison.schofield@intel.com>
> > 
> > During NUMA init, CXL memory defined in the SRAT Memory Affinity
> > subtable may be assigned to a NUMA node. Since there is no
> > requirement that the SRAT be comprehensive for CXL memory another
> > mechanism is needed to assign NUMA nodes to CXL memory not identified
> > in the SRAT.
> > 
> > Use the CXL Fixed Memory Window Structure's (CFMWS) of the ACPI CXL
> > Early Discovery Table (CEDT) to find all CXL memory ranges. Create a
> > NUMA node for each range that is not already assigned to a NUMA node.
> > Add a memblk attaching its host physical address range to the node.
> > 
> > Note that these ranges may not actually map any memory at boot time.
> > They may describe persistent capacity or may be present to enable
> > hot-plug.
> > 
> > Consumers can use phys_to_target_node() to discover the NUMA node.
> > 
> > Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> > ---
> >  drivers/acpi/numa/srat.c | 58 ++++++++++++++++++++++++++++++++++++++++
> >  drivers/cxl/acpi.c       |  8 +++---
> >  2 files changed, 63 insertions(+), 3 deletions(-)
> > 
> > diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c
> > index b8795fc49097..568e033e6c3f 100644
> > --- a/drivers/acpi/numa/srat.c
> > +++ b/drivers/acpi/numa/srat.c
> > @@ -300,6 +300,61 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
> >  }
> >  #endif /* defined(CONFIG_X86) || defined (CONFIG_ARM64) */
> >  
> > +/* Add a NUMA node and memblk for each node-less CFMWS */
> 
> IMO this comment does not make sense for a function called 'parse cfmws'.  I
> would just drop it.
> 

Agree. Dropping comment. 

You made me stare at this func name more, and it can be better.
I'll try this: acpi_cxl_cfmws_init()
to be more like the others in this file, doing similar work.

> > +static int __init acpi_parse_cfmws(struct acpi_table_header *acpi_cedt)
> > +{

snip


> > +		if (numa_add_memblk(node, start, end) < 0) {
> > +			pr_warn("ACPI NUMA: Failed to add memblk for CFMWS node %d [mem %#llx-%#llx]\n",
> > +				node, start, end);
> 
> Is there anything which could go wrong if the block is not added to the numa
> node?
> 
> Ira
> 

I intended, but failed, to mimic the adding of nodes & memblocks based
on the SRAT Memory Affinity table. There, a failure from numa_add_memblk()
causes the entire SRAT parsing to fail, and IIUC entire acpi_numa init fails.

FYI: numa_add_memblk() only completely fails (-EINVAL) if we've used up all
the NR_NODE_MEMBLKS (which is set to MAX_NUMNODE*2)

My first guess would be to follow the pattern and fail the entire
acpi_numa init. ie make acpi_numa = -1; I'll pursue that.

Now, back to your specific question: "Is there anything which could
go wrong if the block is not added to the numa node?"

numa_add_memblk(_to) can actually return success without adding a memblock.
Like this:
/* whine about and ignore invalid blks */
if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
        pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
                nid, start, end - 1);
        return 0;
}

So, I'm going to make an assumption that it can be handled, and see if
someone else chimes in here with more knowledge of why we can quietly
ignore.

Alison

>
>
snip
Dan Williams Oct. 13, 2021, 11:18 p.m. UTC | #6
On Fri, Oct 8, 2021 at 6:45 PM <alison.schofield@intel.com> wrote:
>
> From: Alison Schofield <alison.schofield@intel.com>
>
> During NUMA init, CXL memory defined in the SRAT Memory Affinity
> subtable may be assigned to a NUMA node. Since there is no
> requirement that the SRAT be comprehensive for CXL memory another
> mechanism is needed to assign NUMA nodes to CXL memory not identified
> in the SRAT.
>
> Use the CXL Fixed Memory Window Structure's (CFMWS) of the ACPI CXL
> Early Discovery Table (CEDT) to find all CXL memory ranges. Create a
> NUMA node for each range that is not already assigned to a NUMA node.
> Add a memblk attaching its host physical address range to the node.
>
> Note that these ranges may not actually map any memory at boot time.
> They may describe persistent capacity or may be present to enable
> hot-plug.
>
> Consumers can use phys_to_target_node() to discover the NUMA node.
>
> Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> ---
>  drivers/acpi/numa/srat.c | 58 ++++++++++++++++++++++++++++++++++++++++
>  drivers/cxl/acpi.c       |  8 +++---
>  2 files changed, 63 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c
> index b8795fc49097..568e033e6c3f 100644
> --- a/drivers/acpi/numa/srat.c
> +++ b/drivers/acpi/numa/srat.c
> @@ -300,6 +300,61 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
>  }
>  #endif /* defined(CONFIG_X86) || defined (CONFIG_ARM64) */
>
> +/* Add a NUMA node and memblk for each node-less CFMWS */
> +static int __init acpi_parse_cfmws(struct acpi_table_header *acpi_cedt)
> +{
> +       struct acpi_cedt_cfmws *cfmws;
> +       acpi_size len, cur = 0;
> +       void *cedt_subtable;
> +       int i, pxm, node;
> +       u64 start, end;
> +
> +       /* Use fake PXM values starting after the max PXM found in the SRAT */
> +       for (i = 0; i < MAX_PXM_DOMAINS - 1; i++)
> +               if (node_to_pxm_map[i] > pxm)
> +                       pxm = node_to_pxm_map[i];
> +       pxm++;
> +
> +       len = acpi_cedt->length - sizeof(*acpi_cedt);
> +       cedt_subtable = acpi_cedt + 1;
> +
> +       while (cur < len) {
> +               struct acpi_cedt_header *c = cedt_subtable + cur;
> +
> +               if (c->type != ACPI_CEDT_TYPE_CFMWS)
> +                       goto next;
> +
> +               cfmws = cedt_subtable + cur;
> +               if (cfmws->header.length < sizeof(*cfmws)) {
> +                       pr_warn_once("CFMWS entry skipped:invalid length:%u\n",
> +                                    cfmws->header.length);
> +                       goto next;
> +               }
> +
> +               start = cfmws->base_hpa;
> +               end = cfmws->base_hpa + cfmws->window_size;
> +
> +               /* Skip if the HPA is already assigned to a NUMA node */

Per below note about SRAT parsing dependency, perhaps change this to:

/* Skip if the SRAT already described the NUMA details for this HPA */


> +               node = phys_to_target_node(start);
> +               if (node != NUMA_NO_NODE)
> +                       goto next;
> +
> +               node = acpi_map_pxm_to_node(pxm);
> +               if (node == NUMA_NO_NODE) {
> +                       pr_err("ACPI NUMA: Too many proximity domains.\n");

I think this might be too harsh especially for nodes that are only
there to support potential future CXL hotplug.

Let's just report and continue for now because the CXL driver itself
can decide on a fallback node to hot-add CXL memory that the BIOS did
not setup.

> +                       return -EINVAL;
> +               }
> +               if (numa_add_memblk(node, start, end) < 0) {
> +                       pr_warn("ACPI NUMA: Failed to add memblk for CFMWS node %d [mem %#llx-%#llx]\n",
> +                               node, start, end);

I think this error message is most likely to trigger in the case that
MAX_NUMNODES is exhausted, so the address range information should be
reported in the acpi_map_pxm_to_node() failure case.

> +               }
> +               pxm++;
> +next:
> +               cur += c->length;
> +       }
> +       return 0;
> +}
> +
>  static int __init acpi_parse_slit(struct acpi_table_header *table)
>  {
>         struct acpi_table_slit *slit = (struct acpi_table_slit *)table;
> @@ -478,6 +533,9 @@ int __init acpi_numa_init(void)
>         /* SLIT: System Locality Information Table */
>         acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit);
>
> +       /* CEDT: CXL Early Discovery Table */

It might be worth a comment here that this *must* come after SRAT
parsing so that the CXL ranges that the BIOS already incorporated into
the system memory map can be skipped.

> +       acpi_table_parse(ACPI_SIG_CEDT, acpi_parse_cfmws);
> +
>         if (cnt < 0)
>                 return cnt;
>         else if (!parsed_numa_memblks)
> diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
> index dadc7f64b9ff..3798841c3418 100644
> --- a/drivers/cxl/acpi.c
> +++ b/drivers/cxl/acpi.c
> @@ -144,9 +144,11 @@ static void cxl_add_cfmws_decoders(struct device *dev,
>                                 cfmws->window_size - 1);
>                         goto next;
>                 }
> -               dev_dbg(dev, "add: %s range %#llx-%#llx\n",
> -                       dev_name(&cxld->dev), cfmws->base_hpa,
> -                       cfmws->base_hpa + cfmws->window_size - 1);
> +               dev_dbg(dev, "add:%s node:%d range:%#llx-%#llx restrict:%#x\n",
> +                       dev_name(&cxld->dev),
> +                       phys_to_target_node(cxld->range.start),
> +                       cxld->range.start, cxld->range.end,
> +                       cfmws->restrictions);
>  next:
>                 cur += c->length;
>         }
> --
> 2.31.1
>
Dan Williams Oct. 14, 2021, 12:42 a.m. UTC | #7
On Mon, Oct 11, 2021 at 2:52 PM Alison Schofield
<alison.schofield@intel.com> wrote:
>
> Thanks for the review Ira!
>
> On Mon, Oct 11, 2021 at 10:13:16AM -0700, Ira Weiny wrote:
> > On Fri, Oct 08, 2021 at 06:53:39PM -0700, Schofield, Alison wrote:
> > > From: Alison Schofield <alison.schofield@intel.com>
> > >
> > > During NUMA init, CXL memory defined in the SRAT Memory Affinity
> > > subtable may be assigned to a NUMA node. Since there is no
> > > requirement that the SRAT be comprehensive for CXL memory another
> > > mechanism is needed to assign NUMA nodes to CXL memory not identified
> > > in the SRAT.
> > >
> > > Use the CXL Fixed Memory Window Structure's (CFMWS) of the ACPI CXL
> > > Early Discovery Table (CEDT) to find all CXL memory ranges. Create a
> > > NUMA node for each range that is not already assigned to a NUMA node.
> > > Add a memblk attaching its host physical address range to the node.
> > >
> > > Note that these ranges may not actually map any memory at boot time.
> > > They may describe persistent capacity or may be present to enable
> > > hot-plug.
> > >
> > > Consumers can use phys_to_target_node() to discover the NUMA node.
> > >
> > > Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> > > ---
> > >  drivers/acpi/numa/srat.c | 58 ++++++++++++++++++++++++++++++++++++++++
> > >  drivers/cxl/acpi.c       |  8 +++---
> > >  2 files changed, 63 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c
> > > index b8795fc49097..568e033e6c3f 100644
> > > --- a/drivers/acpi/numa/srat.c
> > > +++ b/drivers/acpi/numa/srat.c
> > > @@ -300,6 +300,61 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
> > >  }
> > >  #endif /* defined(CONFIG_X86) || defined (CONFIG_ARM64) */
> > >
> > > +/* Add a NUMA node and memblk for each node-less CFMWS */
> >
> > IMO this comment does not make sense for a function called 'parse cfmws'.  I
> > would just drop it.
> >
>
> Agree. Dropping comment.
>
> You made me stare at this func name more, and it can be better.
> I'll try this: acpi_cxl_cfmws_init()
> to be more like the others in this file, doing similar work.
>
> > > +static int __init acpi_parse_cfmws(struct acpi_table_header *acpi_cedt)
> > > +{
>
> snip
>
>
> > > +           if (numa_add_memblk(node, start, end) < 0) {
> > > +                   pr_warn("ACPI NUMA: Failed to add memblk for CFMWS node %d [mem %#llx-%#llx]\n",
> > > +                           node, start, end);
> >
> > Is there anything which could go wrong if the block is not added to the numa
> > node?
> >
> > Ira
> >
>
> I intended, but failed, to mimic the adding of nodes & memblocks based
> on the SRAT Memory Affinity table. There, a failure from numa_add_memblk()
> causes the entire SRAT parsing to fail, and IIUC entire acpi_numa init fails.
>
> FYI: numa_add_memblk() only completely fails (-EINVAL) if we've used up all
> the NR_NODE_MEMBLKS (which is set to MAX_NUMNODE*2)

If you're bumping up against that limit then this would have already
failed above at the acpi_map_pxm_to_node() call, right?

> My first guess would be to follow the pattern and fail the entire
> acpi_numa init. ie make acpi_numa = -1; I'll pursue that.

I think CFMWS ranges for future hotplug are in a different class than
memory currently present and marked online by the SRAT. So as long as
this implementation can determine that the range is for future
hotplug, which I believe the phys_to_target_node() pre-check handles,
then I think it is safe to note the failure and continue. With the
expectation that the CXL driver needs to handle the NUMA_NO_NODE case
at memory hot-add time.

> Now, back to your specific question: "Is there anything which could
> go wrong if the block is not added to the numa node?"
>
> numa_add_memblk(_to) can actually return success without adding a memblock.
> Like this:
> /* whine about and ignore invalid blks */
> if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
>         pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
>                 nid, start, end - 1);
>         return 0;
> }
>
> So, I'm going to make an assumption that it can be handled, and see if
> someone else chimes in here with more knowledge of why we can quietly
> ignore.

I wouldn't say quietly ignore as the failure here means that the
system cannot accurately place CXL memory for best performance.
There's already so many other ways that CXL memory dynamic setup can
go sideways that the lack of a dedicated NUMA node is not necessarily
fatal... in my opinion. It is a judgment call, but I think Robustness
Principle nudges it towards: "continue and try to make the best of
it".
Jonathan Cameron Oct. 15, 2021, 4:59 p.m. UTC | #8
On Fri, 8 Oct 2021 18:53:39 -0700
<alison.schofield@intel.com> wrote:

> From: Alison Schofield <alison.schofield@intel.com>
> 
> During NUMA init, CXL memory defined in the SRAT Memory Affinity
> subtable may be assigned to a NUMA node. Since there is no
> requirement that the SRAT be comprehensive for CXL memory another
> mechanism is needed to assign NUMA nodes to CXL memory not identified
> in the SRAT.
> 
> Use the CXL Fixed Memory Window Structure's (CFMWS) of the ACPI CXL
> Early Discovery Table (CEDT) to find all CXL memory ranges. Create a
> NUMA node for each range that is not already assigned to a NUMA node.
> Add a memblk attaching its host physical address range to the node.
> 
> Note that these ranges may not actually map any memory at boot time.
> They may describe persistent capacity or may be present to enable
> hot-plug.
> 
> Consumers can use phys_to_target_node() to discover the NUMA node.
> 
> Signed-off-by: Alison Schofield <alison.schofield@intel.com>
Hi Alison,

I'm not sure that a CFMWS entry should map to a single NUMA node...

Each entry corresponds to a contiguous HPA range into which CXL devices
below a set of ports (if interleaved) or one port should be mapped.

That could be multiple devices, each with it's own performance characteristics,
or potentially a mix of persistent and volatile memory on a system with limited
qtg groups.

Maybe it's the best we can do though given information available
before any devices are present.

Jonathan

> ---
>  drivers/acpi/numa/srat.c | 58 ++++++++++++++++++++++++++++++++++++++++
>  drivers/cxl/acpi.c       |  8 +++---
>  2 files changed, 63 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c
> index b8795fc49097..568e033e6c3f 100644
> --- a/drivers/acpi/numa/srat.c
> +++ b/drivers/acpi/numa/srat.c
> @@ -300,6 +300,61 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
>  }
>  #endif /* defined(CONFIG_X86) || defined (CONFIG_ARM64) */
>  
> +/* Add a NUMA node and memblk for each node-less CFMWS */
> +static int __init acpi_parse_cfmws(struct acpi_table_header *acpi_cedt)
> +{
> +	struct acpi_cedt_cfmws *cfmws;
> +	acpi_size len, cur = 0;
> +	void *cedt_subtable;
> +	int i, pxm, node;
> +	u64 start, end;
> +
> +	/* Use fake PXM values starting after the max PXM found in the SRAT */
> +	for (i = 0; i < MAX_PXM_DOMAINS - 1; i++)
> +		if (node_to_pxm_map[i] > pxm)
> +			pxm = node_to_pxm_map[i];
> +	pxm++;
> +
> +	len = acpi_cedt->length - sizeof(*acpi_cedt);
> +	cedt_subtable = acpi_cedt + 1;
> +
> +	while (cur < len) {
> +		struct acpi_cedt_header *c = cedt_subtable + cur;
> +
> +		if (c->type != ACPI_CEDT_TYPE_CFMWS)
> +			goto next;
> +
> +		cfmws = cedt_subtable + cur;
> +		if (cfmws->header.length < sizeof(*cfmws)) {
> +			pr_warn_once("CFMWS entry skipped:invalid length:%u\n",
> +				     cfmws->header.length);
> +			goto next;
> +		}
> +
> +		start = cfmws->base_hpa;
> +		end = cfmws->base_hpa + cfmws->window_size;
> +
> +		/* Skip if the HPA is already assigned to a NUMA node */
> +		node = phys_to_target_node(start);
> +		if (node != NUMA_NO_NODE)
> +			goto next;
> +
> +		node = acpi_map_pxm_to_node(pxm);
> +		if (node == NUMA_NO_NODE) {
> +			pr_err("ACPI NUMA: Too many proximity domains.\n");
> +			return -EINVAL;
> +		}
> +		if (numa_add_memblk(node, start, end) < 0) {
> +			pr_warn("ACPI NUMA: Failed to add memblk for CFMWS node %d [mem %#llx-%#llx]\n",
> +				node, start, end);
> +		}
> +		pxm++;
> +next:
> +		cur += c->length;
> +	}
> +	return 0;
> +}
> +
>  static int __init acpi_parse_slit(struct acpi_table_header *table)
>  {
>  	struct acpi_table_slit *slit = (struct acpi_table_slit *)table;
> @@ -478,6 +533,9 @@ int __init acpi_numa_init(void)
>  	/* SLIT: System Locality Information Table */
>  	acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit);
>  
> +	/* CEDT: CXL Early Discovery Table */
> +	acpi_table_parse(ACPI_SIG_CEDT, acpi_parse_cfmws);
> +
>  	if (cnt < 0)
>  		return cnt;
>  	else if (!parsed_numa_memblks)
> diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
> index dadc7f64b9ff..3798841c3418 100644
> --- a/drivers/cxl/acpi.c
> +++ b/drivers/cxl/acpi.c
> @@ -144,9 +144,11 @@ static void cxl_add_cfmws_decoders(struct device *dev,
>  				cfmws->window_size - 1);
>  			goto next;
>  		}
> -		dev_dbg(dev, "add: %s range %#llx-%#llx\n",
> -			dev_name(&cxld->dev), cfmws->base_hpa,
> -			cfmws->base_hpa + cfmws->window_size - 1);
> +		dev_dbg(dev, "add:%s node:%d range:%#llx-%#llx restrict:%#x\n",
> +			dev_name(&cxld->dev),
> +			phys_to_target_node(cxld->range.start),
> +			cxld->range.start, cxld->range.end,
> +			cfmws->restrictions);
>  next:
>  		cur += c->length;
>  	}
Dan Williams Oct. 15, 2021, 6:58 p.m. UTC | #9
On Fri, Oct 15, 2021 at 10:00 AM Jonathan Cameron
<Jonathan.Cameron@huawei.com> wrote:
>
> On Fri, 8 Oct 2021 18:53:39 -0700
> <alison.schofield@intel.com> wrote:
>
> > From: Alison Schofield <alison.schofield@intel.com>
> >
> > During NUMA init, CXL memory defined in the SRAT Memory Affinity
> > subtable may be assigned to a NUMA node. Since there is no
> > requirement that the SRAT be comprehensive for CXL memory another
> > mechanism is needed to assign NUMA nodes to CXL memory not identified
> > in the SRAT.
> >
> > Use the CXL Fixed Memory Window Structure's (CFMWS) of the ACPI CXL
> > Early Discovery Table (CEDT) to find all CXL memory ranges. Create a
> > NUMA node for each range that is not already assigned to a NUMA node.
> > Add a memblk attaching its host physical address range to the node.
> >
> > Note that these ranges may not actually map any memory at boot time.
> > They may describe persistent capacity or may be present to enable
> > hot-plug.
> >
> > Consumers can use phys_to_target_node() to discover the NUMA node.
> >
> > Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> Hi Alison,
>
> I'm not sure that a CFMWS entry should map to a single NUMA node...
>
> Each entry corresponds to a contiguous HPA range into which CXL devices
> below a set of ports (if interleaved) or one port should be mapped.
>
> That could be multiple devices, each with it's own performance characteristics,
> or potentially a mix of persistent and volatile memory on a system with limited
> qtg groups.
>
> Maybe it's the best we can do though given information available
> before any devices are present.
>

Regardless of the performance of the individual devices they can only
map to one of the available CFMWS entries. So the maximum number of
degrees of freedom is one node per CFMWS. Now if you have only one
entry to pick from, but have interleave sets with widely different
performance characteristics to online it becomes a policy decision
about whether to force map those interleave sets into the same node,
and that policy can be maintained outside the kernel.

The alternative is to rework NUMA nodes to be something that can be
declared dynamically as currently there are assumptions throughout the
kernel that num_possible_nodes() is statically determined early in
boot. I am not seeing strong evidence that complexity needs to be
tackled in the near term, and "NUMA-node per CFMWS" should (famous
last words) serve CXL needs for the foreseeable future.
Jonathan Cameron Oct. 18, 2021, 9:25 a.m. UTC | #10
On Fri, 15 Oct 2021 11:58:36 -0700
Dan Williams <dan.j.williams@intel.com> wrote:

> On Fri, Oct 15, 2021 at 10:00 AM Jonathan Cameron
> <Jonathan.Cameron@huawei.com> wrote:
> >
> > On Fri, 8 Oct 2021 18:53:39 -0700
> > <alison.schofield@intel.com> wrote:
> >  
> > > From: Alison Schofield <alison.schofield@intel.com>
> > >
> > > During NUMA init, CXL memory defined in the SRAT Memory Affinity
> > > subtable may be assigned to a NUMA node. Since there is no
> > > requirement that the SRAT be comprehensive for CXL memory another
> > > mechanism is needed to assign NUMA nodes to CXL memory not identified
> > > in the SRAT.
> > >
> > > Use the CXL Fixed Memory Window Structure's (CFMWS) of the ACPI CXL
> > > Early Discovery Table (CEDT) to find all CXL memory ranges. Create a
> > > NUMA node for each range that is not already assigned to a NUMA node.
> > > Add a memblk attaching its host physical address range to the node.
> > >
> > > Note that these ranges may not actually map any memory at boot time.
> > > They may describe persistent capacity or may be present to enable
> > > hot-plug.
> > >
> > > Consumers can use phys_to_target_node() to discover the NUMA node.
> > >
> > > Signed-off-by: Alison Schofield <alison.schofield@intel.com>  
> > Hi Alison,
> >
> > I'm not sure that a CFMWS entry should map to a single NUMA node...
> >
> > Each entry corresponds to a contiguous HPA range into which CXL devices
> > below a set of ports (if interleaved) or one port should be mapped.
> >
> > That could be multiple devices, each with it's own performance characteristics,
> > or potentially a mix of persistent and volatile memory on a system with limited
> > qtg groups.
> >
> > Maybe it's the best we can do though given information available
> > before any devices are present.
> >  
> 
> Regardless of the performance of the individual devices they can only
> map to one of the available CFMWS entries. So the maximum number of
> degrees of freedom is one node per CFMWS. Now if you have only one
> entry to pick from, but have interleave sets with widely different
> performance characteristics to online it becomes a policy decision
> about whether to force map those interleave sets into the same node,
> and that policy can be maintained outside the kernel.
> 
> The alternative is to rework NUMA nodes to be something that can be
> declared dynamically as currently there are assumptions throughout the
> kernel that num_possible_nodes() is statically determined early in
> boot. I am not seeing strong evidence that complexity needs to be
> tackled in the near term, and "NUMA-node per CFMWS" should (famous
> last words) serve CXL needs for the foreseeable future.

I'm less optimistic we won't end up revisiting this in the medium
term but can tackle that when we have better visibility of what
people are actually building.

Jonathan
Dan Williams Oct. 18, 2021, 6:15 p.m. UTC | #11
On Mon, Oct 18, 2021 at 2:25 AM Jonathan Cameron
<Jonathan.Cameron@huawei.com> wrote:
>
> On Fri, 15 Oct 2021 11:58:36 -0700
> Dan Williams <dan.j.williams@intel.com> wrote:
>
> > On Fri, Oct 15, 2021 at 10:00 AM Jonathan Cameron
> > <Jonathan.Cameron@huawei.com> wrote:
> > >
> > > On Fri, 8 Oct 2021 18:53:39 -0700
> > > <alison.schofield@intel.com> wrote:
> > >
> > > > From: Alison Schofield <alison.schofield@intel.com>
> > > >
> > > > During NUMA init, CXL memory defined in the SRAT Memory Affinity
> > > > subtable may be assigned to a NUMA node. Since there is no
> > > > requirement that the SRAT be comprehensive for CXL memory another
> > > > mechanism is needed to assign NUMA nodes to CXL memory not identified
> > > > in the SRAT.
> > > >
> > > > Use the CXL Fixed Memory Window Structure's (CFMWS) of the ACPI CXL
> > > > Early Discovery Table (CEDT) to find all CXL memory ranges. Create a
> > > > NUMA node for each range that is not already assigned to a NUMA node.
> > > > Add a memblk attaching its host physical address range to the node.
> > > >
> > > > Note that these ranges may not actually map any memory at boot time.
> > > > They may describe persistent capacity or may be present to enable
> > > > hot-plug.
> > > >
> > > > Consumers can use phys_to_target_node() to discover the NUMA node.
> > > >
> > > > Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> > > Hi Alison,
> > >
> > > I'm not sure that a CFMWS entry should map to a single NUMA node...
> > >
> > > Each entry corresponds to a contiguous HPA range into which CXL devices
> > > below a set of ports (if interleaved) or one port should be mapped.
> > >
> > > That could be multiple devices, each with it's own performance characteristics,
> > > or potentially a mix of persistent and volatile memory on a system with limited
> > > qtg groups.
> > >
> > > Maybe it's the best we can do though given information available
> > > before any devices are present.
> > >
> >
> > Regardless of the performance of the individual devices they can only
> > map to one of the available CFMWS entries. So the maximum number of
> > degrees of freedom is one node per CFMWS. Now if you have only one
> > entry to pick from, but have interleave sets with widely different
> > performance characteristics to online it becomes a policy decision
> > about whether to force map those interleave sets into the same node,
> > and that policy can be maintained outside the kernel.
> >
> > The alternative is to rework NUMA nodes to be something that can be
> > declared dynamically as currently there are assumptions throughout the
> > kernel that num_possible_nodes() is statically determined early in
> > boot. I am not seeing strong evidence that complexity needs to be
> > tackled in the near term, and "NUMA-node per CFMWS" should (famous
> > last words) serve CXL needs for the foreseeable future.
>
> I'm less optimistic we won't end up revisiting this in the medium
> term but can tackle that when we have better visibility of what
> people are actually building.

Agree. When we were game planning this patch internally the 2 options
were, build full support for defining new NUMA nodes after boot, or
just extend the boot-time NUMA node possibilities minimally by the
declared degrees of freedom in the CFMWS. The latter path was taken
because it gets us "80%" of what CXL needs without precluding going
the former path later if that remaining "20% proves critical to add
finer grained dynamic support.
diff mbox series

Patch

diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c
index b8795fc49097..568e033e6c3f 100644
--- a/drivers/acpi/numa/srat.c
+++ b/drivers/acpi/numa/srat.c
@@ -300,6 +300,61 @@  acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 }
 #endif /* defined(CONFIG_X86) || defined (CONFIG_ARM64) */
 
+/* Add a NUMA node and memblk for each node-less CFMWS */
+static int __init acpi_parse_cfmws(struct acpi_table_header *acpi_cedt)
+{
+	struct acpi_cedt_cfmws *cfmws;
+	acpi_size len, cur = 0;
+	void *cedt_subtable;
+	int i, pxm, node;
+	u64 start, end;
+
+	/* Use fake PXM values starting after the max PXM found in the SRAT */
+	for (i = 0; i < MAX_PXM_DOMAINS - 1; i++)
+		if (node_to_pxm_map[i] > pxm)
+			pxm = node_to_pxm_map[i];
+	pxm++;
+
+	len = acpi_cedt->length - sizeof(*acpi_cedt);
+	cedt_subtable = acpi_cedt + 1;
+
+	while (cur < len) {
+		struct acpi_cedt_header *c = cedt_subtable + cur;
+
+		if (c->type != ACPI_CEDT_TYPE_CFMWS)
+			goto next;
+
+		cfmws = cedt_subtable + cur;
+		if (cfmws->header.length < sizeof(*cfmws)) {
+			pr_warn_once("CFMWS entry skipped:invalid length:%u\n",
+				     cfmws->header.length);
+			goto next;
+		}
+
+		start = cfmws->base_hpa;
+		end = cfmws->base_hpa + cfmws->window_size;
+
+		/* Skip if the HPA is already assigned to a NUMA node */
+		node = phys_to_target_node(start);
+		if (node != NUMA_NO_NODE)
+			goto next;
+
+		node = acpi_map_pxm_to_node(pxm);
+		if (node == NUMA_NO_NODE) {
+			pr_err("ACPI NUMA: Too many proximity domains.\n");
+			return -EINVAL;
+		}
+		if (numa_add_memblk(node, start, end) < 0) {
+			pr_warn("ACPI NUMA: Failed to add memblk for CFMWS node %d [mem %#llx-%#llx]\n",
+				node, start, end);
+		}
+		pxm++;
+next:
+		cur += c->length;
+	}
+	return 0;
+}
+
 static int __init acpi_parse_slit(struct acpi_table_header *table)
 {
 	struct acpi_table_slit *slit = (struct acpi_table_slit *)table;
@@ -478,6 +533,9 @@  int __init acpi_numa_init(void)
 	/* SLIT: System Locality Information Table */
 	acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit);
 
+	/* CEDT: CXL Early Discovery Table */
+	acpi_table_parse(ACPI_SIG_CEDT, acpi_parse_cfmws);
+
 	if (cnt < 0)
 		return cnt;
 	else if (!parsed_numa_memblks)
diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
index dadc7f64b9ff..3798841c3418 100644
--- a/drivers/cxl/acpi.c
+++ b/drivers/cxl/acpi.c
@@ -144,9 +144,11 @@  static void cxl_add_cfmws_decoders(struct device *dev,
 				cfmws->window_size - 1);
 			goto next;
 		}
-		dev_dbg(dev, "add: %s range %#llx-%#llx\n",
-			dev_name(&cxld->dev), cfmws->base_hpa,
-			cfmws->base_hpa + cfmws->window_size - 1);
+		dev_dbg(dev, "add:%s node:%d range:%#llx-%#llx restrict:%#x\n",
+			dev_name(&cxld->dev),
+			phys_to_target_node(cxld->range.start),
+			cxld->range.start, cxld->range.end,
+			cfmws->restrictions);
 next:
 		cur += c->length;
 	}