
[6/6] ACPI: NUMA: Add a node and memblk for each CFMWS not in SRAT

Message ID 163553711933.2509508.2203471175679990.stgit@dwillia2-desk3.amr.corp.intel.com
State Accepted
Commit fd49f99c180996cef2d707ad71bee4f060dbe367
Series Introduce acpi_table_parse_cedt and extra nodes for CXL.mem

Commit Message

Dan Williams Oct. 29, 2021, 7:51 p.m. UTC
From: Alison Schofield <alison.schofield@intel.com>

During NUMA init, CXL memory defined in the SRAT Memory Affinity
subtable may be assigned to a NUMA node. Since there is no
requirement that the SRAT be comprehensive for CXL memory, another
mechanism is needed to assign NUMA nodes to CXL memory not identified
in the SRAT.

Use the CXL Fixed Memory Window Structure (CFMWS) of the ACPI CXL
Early Discovery Table (CEDT) to find all CXL memory ranges.
Create a NUMA node for each CFMWS that is not already assigned to
a NUMA node. Add a memblk attaching its host physical address
range to the node.

Note that these ranges may not actually map any memory at boot time.
They may describe persistent capacity or may be present to enable
hot-plug.

Consumers can use phys_to_target_node() to discover the NUMA node.

Signed-off-by: Alison Schofield <alison.schofield@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/numa/srat.c |   59 +++++++++++++++++++++++++++++++++++++++++++++-
 drivers/cxl/acpi.c       |    3 ++
 2 files changed, 60 insertions(+), 2 deletions(-)
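
As a consumer-side illustration (a sketch, not part of the patch): once the
CFMWS-derived nodes exist, kernel code holding a host physical address inside
a CXL window can resolve its NUMA node with phys_to_target_node(). Only that
function is from the kernel; the wrapper and its fallback policy below are
assumptions for illustration.

#include <linux/numa.h>
#include <linux/nodemask.h>

/* Illustrative helper: resolve the NUMA node backing a CXL HPA. */
static int cxl_hpa_to_node(u64 hpa)
{
	int node = phys_to_target_node(hpa);

	/* Neither SRAT nor a CFMWS-created node covers this address */
	if (node == NUMA_NO_NODE)
		node = first_online_node;	/* assumed fallback policy */

	return node;
}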

Comments

Jonathan Cameron Nov. 18, 2021, 1:12 p.m. UTC | #1
On Fri, 29 Oct 2021 12:51:59 -0700
Dan Williams <dan.j.williams@intel.com> wrote:

> From: Alison Schofield <alison.schofield@intel.com>
> 
> During NUMA init, CXL memory defined in the SRAT Memory Affinity
> subtable may be assigned to a NUMA node. Since there is no
> requirement that the SRAT be comprehensive for CXL memory another
> mechanism is needed to assign NUMA nodes to CXL memory not identified
> in the SRAT.
> 
> Use the CXL Fixed Memory Window Structure (CFMWS) of the ACPI CXL
> Early Discovery Table (CEDT) to find all CXL memory ranges.
> Create a NUMA node for each CFMWS that is not already assigned to
> a NUMA node. Add a memblk attaching its host physical address
> range to the node.
> 
> Note that these ranges may not actually map any memory at boot time.
> They may describe persistent capacity or may be present to enable
> hot-plug.
> 
> Consumers can use phys_to_target_node() to discover the NUMA node.
> 
> Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Hi,

I was just discussing this with one of our firmware / ACPI experts and he
asked an interesting question. If you want to have CFMWS regions correspond to
new NUMA nodes, why not put them in SRAT as hotpluggable memory, but have none
present in the memory map (whichever route you use to get that)?
We do this for normal memory hotplug, and (per the other discussion on qemu
virtio-mem nodes) apparently qemu does too.

https://lore.kernel.org/all/655c65af-fd7a-8007-37b3-a56c60a0ec5b@redhat.com/

This doesn't solve the question of whether we have enough nodes, but it's
no worse than if we use CFMWS regions, and it fits within the existing ACPI
spec.

The only reason I can immediately think of not to do this is that it might be
a pain to later change over to dynamic NUMA node allocation in a fashion that
then ignores SRAT entries. Probably a solvable problem.

Jonathan
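
For reference, the alternative described above maps onto ACPICA's existing
SRAT types. A minimal sketch of how such an entry looks from the kernel side
(ACPI_SRAT_MEM_ENABLED and ACPI_SRAT_MEM_HOT_PLUGGABLE are real ACPICA flag
definitions; the helper itself is illustrative, not code from this thread):

#include <linux/acpi.h>

/*
 * Illustrative: an SRAT Memory Affinity entry that reserves a proximity
 * domain for hotplug is enabled and hot-pluggable, while the boot memory
 * map simply reports nothing in [base_address, base_address + length).
 */
static bool srat_entry_reserves_hotplug_node(struct acpi_srat_mem_affinity *ma)
{
	return (ma->flags & ACPI_SRAT_MEM_ENABLED) &&
	       (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE);
}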


> ---
>  drivers/acpi/numa/srat.c |   59 +++++++++++++++++++++++++++++++++++++++++++++-
>  drivers/cxl/acpi.c       |    3 ++
>  2 files changed, 60 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c
> index b8795fc49097..66a0142dc78c 100644
> --- a/drivers/acpi/numa/srat.c
> +++ b/drivers/acpi/numa/srat.c
> @@ -298,6 +298,47 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
>  out_err:
>  	return -EINVAL;
>  }
> +
> +static int __init acpi_parse_cfmws(union acpi_subtable_headers *header,
> +				   void *arg, const unsigned long table_end)
> +{
> +	struct acpi_cedt_cfmws *cfmws;
> +	int *fake_pxm = arg;
> +	u64 start, end;
> +	int node;
> +
> +	cfmws = (struct acpi_cedt_cfmws *)header;
> +	start = cfmws->base_hpa;
> +	end = cfmws->base_hpa + cfmws->window_size;
> +
> +	/* Skip if the SRAT already described the NUMA details for this HPA */
> +	node = phys_to_target_node(start);
> +	if (node != NUMA_NO_NODE)
> +		return 0;
> +
> +	node = acpi_map_pxm_to_node(*fake_pxm);
> +
> +	if (node == NUMA_NO_NODE) {
> +		pr_err("ACPI NUMA: Too many proximity domains while processing CFMWS.\n");
> +		return -EINVAL;
> +	}
> +
> +	if (numa_add_memblk(node, start, end) < 0) {
> +		/* CXL driver must handle the NUMA_NO_NODE case */
> +		pr_warn("ACPI NUMA: Failed to add memblk for CFMWS node %d [mem %#llx-%#llx]\n",
> +			node, start, end);
> +	}
> +
> +	/* Set the next available fake_pxm value */
> +	(*fake_pxm)++;
> +	return 0;
> +}
> +#else
> +static int __init acpi_parse_cfmws(union acpi_subtable_headers *header,
> +				   void *arg, const unsigned long table_end)
> +{
> +	return 0;
> +}
>  #endif /* defined(CONFIG_X86) || defined (CONFIG_ARM64) */
>  
>  static int __init acpi_parse_slit(struct acpi_table_header *table)
> @@ -442,7 +483,7 @@ acpi_table_parse_srat(enum acpi_srat_type id,
>  
>  int __init acpi_numa_init(void)
>  {
> -	int cnt = 0;
> +	int i, fake_pxm, cnt = 0;
>  
>  	if (acpi_disabled)
>  		return -EINVAL;
> @@ -478,6 +519,22 @@ int __init acpi_numa_init(void)
>  	/* SLIT: System Locality Information Table */
>  	acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit);
>  
> +	/*
> +	 * CXL Fixed Memory Window Structures (CFMWS) must be parsed
> +	 * after the SRAT. Create NUMA Nodes for CXL memory ranges that
> +	 * are defined in the CFMWS and not already defined in the SRAT.
> +	 * Initialize a fake_pxm as the first available PXM to emulate.
> +	 */
> +
> +	/* fake_pxm is the next unused PXM value after SRAT parsing */
> +	for (i = 0, fake_pxm = -1; i < MAX_NUMNODES - 1; i++) {
> +		if (node_to_pxm_map[i] > fake_pxm)
> +			fake_pxm = node_to_pxm_map[i];
> +	}
> +	fake_pxm++;
> +	acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, acpi_parse_cfmws,
> +			      &fake_pxm);
> +
>  	if (cnt < 0)
>  		return cnt;
>  	else if (!parsed_numa_memblks)
> diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
> index 91e4072e7649..3163167ecc3a 100644
> --- a/drivers/cxl/acpi.c
> +++ b/drivers/cxl/acpi.c
> @@ -125,7 +125,8 @@ static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg,
>  			cfmws->base_hpa + cfmws->window_size - 1);
>  		return 0;
>  	}
> -	dev_dbg(dev, "add: %s range %#llx-%#llx\n", dev_name(&cxld->dev),
> +	dev_dbg(dev, "add: %s node: %d range %#llx-%#llx\n",
> +		dev_name(&cxld->dev), phys_to_target_node(cxld->range.start),
>  		cfmws->base_hpa, cfmws->base_hpa + cfmws->window_size - 1);
>  
>  	return 0;
>
Dan Williams Nov. 18, 2021, 5:14 p.m. UTC | #2
On Thu, Nov 18, 2021 at 5:12 AM Jonathan Cameron
<Jonathan.Cameron@huawei.com> wrote:
>
> On Fri, 29 Oct 2021 12:51:59 -0700
> Dan Williams <dan.j.williams@intel.com> wrote:
>
> > From: Alison Schofield <alison.schofield@intel.com>
> >
> > During NUMA init, CXL memory defined in the SRAT Memory Affinity
> > subtable may be assigned to a NUMA node. Since there is no
> > requirement that the SRAT be comprehensive for CXL memory another
> > mechanism is needed to assign NUMA nodes to CXL memory not identified
> > in the SRAT.
> >
> > Use the CXL Fixed Memory Window Structure (CFMWS) of the ACPI CXL
> > Early Discovery Table (CEDT) to find all CXL memory ranges.
> > Create a NUMA node for each CFMWS that is not already assigned to
> > a NUMA node. Add a memblk attaching its host physical address
> > range to the node.
> >
> > Note that these ranges may not actually map any memory at boot time.
> > They may describe persistent capacity or may be present to enable
> > hot-plug.
> >
> > Consumers can use phys_to_target_node() to discover the NUMA node.
> >
> > Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> > Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> Hi,
>
> I was just discussing this with one of our firmware / ACPI experts and he asked
> an interesting question.   If you want to have CFMWS regions correspond to
> new NUMA nodes, why not put them in SRAT as hotpluggable memory, but have none
> present in the memory map (whichever route you use to get that)?
> We do this for normal memory hotplug as (via the other discussion on qemu virtio-mem
> nodes) apparently does qemu.
>
> https://lore.kernel.org/all/655c65af-fd7a-8007-37b3-a56c60a0ec5b@redhat.com/
>
> This doesn't solve the question of whether we have enough nodes, but it's
> not worse than if we use CFMWS regions and fits within existing ACPI spec.
>
> The only reason I can immediately think of to not do this, is that it might be
> a pain to later change over to dynamic numa node allocation in a fashion that
> then ignores SRAT entries.  Probably a solvable problem.

Interesting, yes, that works for expanding the NUMA node number space.
However, if you populate SRAT, what do you put in the corresponding
HMAT entries? In the case of dynamic CXL regions the driver is going
to generate the equivalent of the corresponding HMAT data based on
what devices it decides to place in that range. I actually do not know
what happens with HMAT today for memory hotplug, but I suspect there
are fewer degrees of freedom in what might populate those ranges than
what CXL allows, and there is a chance to pre-populate the HMAT for
future hotplug.

All that said, if an ACPI platform did do that population, it would not
collide with the scheme proposed in this patch, because this patch checks
SRAT for the range before padding the proximity domain number space for
CFMWS entries.
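
As a worked example of the proximity-domain padding described above (a
standalone sketch; the node_to_pxm_map contents are assumed, and the scan
mirrors the patch's loop in acpi_numa_init()):

#include <stdio.h>

int main(void)
{
	/* Assume SRAT parsing mapped nodes to PXMs 0, 1 and 4 */
	int node_to_pxm_map[] = { 0, 1, 4 };
	int i, fake_pxm = -1;

	for (i = 0; i < 3; i++)
		if (node_to_pxm_map[i] > fake_pxm)
			fake_pxm = node_to_pxm_map[i];
	fake_pxm++;	/* first unused PXM */

	/* Prints 5: the first CFMWS-only window would get proximity domain 5 */
	printf("first CFMWS proximity domain: %d\n", fake_pxm);
	return 0;
}
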
Jonathan Cameron Nov. 18, 2021, 5:53 p.m. UTC | #3
On Thu, 18 Nov 2021 09:14:07 -0800
Dan Williams <dan.j.williams@intel.com> wrote:

> On Thu, Nov 18, 2021 at 5:12 AM Jonathan Cameron
> <Jonathan.Cameron@huawei.com> wrote:
> >
> > On Fri, 29 Oct 2021 12:51:59 -0700
> > Dan Williams <dan.j.williams@intel.com> wrote:
> >  
> > > From: Alison Schofield <alison.schofield@intel.com>
> > >
> > > During NUMA init, CXL memory defined in the SRAT Memory Affinity
> > > subtable may be assigned to a NUMA node. Since there is no
> > > requirement that the SRAT be comprehensive for CXL memory another
> > > mechanism is needed to assign NUMA nodes to CXL memory not identified
> > > in the SRAT.
> > >
> > > Use the CXL Fixed Memory Window Structure (CFMWS) of the ACPI CXL
> > > Early Discovery Table (CEDT) to find all CXL memory ranges.
> > > Create a NUMA node for each CFMWS that is not already assigned to
> > > a NUMA node. Add a memblk attaching its host physical address
> > > range to the node.
> > >
> > > Note that these ranges may not actually map any memory at boot time.
> > > They may describe persistent capacity or may be present to enable
> > > hot-plug.
> > >
> > > Consumers can use phys_to_target_node() to discover the NUMA node.
> > >
> > > Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> > > Signed-off-by: Dan Williams <dan.j.williams@intel.com>  
> > Hi,
> >
> > I was just discussing this with one of our firmware / ACPI experts and he asked
> > an interesting question.   If you want to have CFMWS regions correspond to
> > new NUMA nodes, why not put them in SRAT as hotpluggable memory, but have none
> > present in the memory map (whichever route you use to get that)?
> > We do this for normal memory hotplug as (via the other discussion on qemu virtio-mem
> > nodes) apparently does qemu.
> >
> > https://lore.kernel.org/all/655c65af-fd7a-8007-37b3-a56c60a0ec5b@redhat.com/
> >
> > This doesn't solve the question of whether we have enough nodes, but it's
> > not worse than if we use CFMWS regions and fits within existing ACPI spec.
> >
> > The only reason I can immediately think of to not do this, is that it might be
> > a pain to later change over to dynamic numa node allocation in a fashion that
> > then ignores SRAT entries.  Probably a solvable problem.  
> 
> Interesting, yes, that works for expanding the NUMA node number space.
> However, if you populate SRAT what do you put in the corresponding
> HMAT entries? In the case of dynamic CXL regions the driver is going
> to generate the equivalent of the corresponding HMAT data based on
> what devices it decides to place in that range. I actually do not know
> what happens with HMAT today for memory hotplug, but I suspect there
> are less degrees of freedom of what might populate those ranges than
> what CXL allows, and there is a chance to pre-populate the HMAT for
> future hotplug.

So... There are two answers to that question as I understand it.

1) What Linux does is nothing.  You get whatever was in HMAT to start with.
   Worth noting that HMAT doesn't need to be in any sense 'complete' so it
   is possible there was nothing there with a target in this NUMA node.

2) What ACPI intends to happen if anyone implements it. There is an event
notification for this:

"Heterogeneous Memory Attributes Update. Dynamic reconfiguration of
the system may cause existing latency, bandwidth or memory side caching
attribute to change. The platform software issues the Heterogeneous
Memory Attributes Update notification to a point on a device tree to
indicate to OSPM that it needs to invoke the _HMA objects associated
with the Heterogeneous Memory Attributes on the device tree starting
from the point notified."

So call an AML method in the DSDT for whichever device has a notification
event. A similar dance exists for NFIT updates, and Linux actually implements
the handling for that one.

> 
> All that said, if an ACPI platform did do that population it would not
> collide with the scheme proposed in this patch because this is
> checking SRAT for the range before padding the proximity domain number
> space for CFMWS entries.

Good point and that probably answers our concern.

Jonathan
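
A sketch of how that notification could be consumed, using the generic ACPICA
notify API. acpi_install_notify_handler() is a real interface; the 0x0E event
value for "Heterogeneous Memory Attributes Update" is an assumption to verify
against the ACPI spec revision in use, and mainline Linux has no such handler
today:

#include <linux/acpi.h>

#define HMA_UPDATE_NOTIFY	0x0E	/* assumed spec-defined value */

static void hma_notify(acpi_handle handle, u32 event, void *context)
{
	if (event != HMA_UPDATE_NOTIFY)
		return;
	/* Evaluate _HMA here and re-parse the returned HMAT-format buffer */
}

static acpi_status hma_register(acpi_handle handle)
{
	return acpi_install_notify_handler(handle, ACPI_SYSTEM_NOTIFY,
					   hma_notify, NULL);
}
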
Dan Williams Nov. 18, 2021, 6:10 p.m. UTC | #4
On Thu, Nov 18, 2021 at 9:54 AM Jonathan Cameron
<Jonathan.Cameron@huawei.com> wrote:
>
> On Thu, 18 Nov 2021 09:14:07 -0800
> Dan Williams <dan.j.williams@intel.com> wrote:
>
> > On Thu, Nov 18, 2021 at 5:12 AM Jonathan Cameron
> > <Jonathan.Cameron@huawei.com> wrote:
> > >
> > > On Fri, 29 Oct 2021 12:51:59 -0700
> > > Dan Williams <dan.j.williams@intel.com> wrote:
> > >
> > > > From: Alison Schofield <alison.schofield@intel.com>
> > > >
> > > > During NUMA init, CXL memory defined in the SRAT Memory Affinity
> > > > subtable may be assigned to a NUMA node. Since there is no
> > > > requirement that the SRAT be comprehensive for CXL memory another
> > > > mechanism is needed to assign NUMA nodes to CXL memory not identified
> > > > in the SRAT.
> > > >
> > > > Use the CXL Fixed Memory Window Structure (CFMWS) of the ACPI CXL
> > > > Early Discovery Table (CEDT) to find all CXL memory ranges.
> > > > Create a NUMA node for each CFMWS that is not already assigned to
> > > > a NUMA node. Add a memblk attaching its host physical address
> > > > range to the node.
> > > >
> > > > Note that these ranges may not actually map any memory at boot time.
> > > > They may describe persistent capacity or may be present to enable
> > > > hot-plug.
> > > >
> > > > Consumers can use phys_to_target_node() to discover the NUMA node.
> > > >
> > > > Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> > > > Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> > > Hi,
> > >
> > > I was just discussing this with one of our firmware / ACPI experts and he asked
> > > an interesting question.   If you want to have CFMWS regions correspond to
> > > new NUMA nodes, why not put them in SRAT as hotpluggable memory, but have none
> > > present in the memory map (whichever route you use to get that)?
> > > We do this for normal memory hotplug as (via the other discussion on qemu virtio-mem
> > > nodes) apparently does qemu.
> > >
> > > https://lore.kernel.org/all/655c65af-fd7a-8007-37b3-a56c60a0ec5b@redhat.com/
> > >
> > > This doesn't solve the question of whether we have enough nodes, but it's
> > > not worse than if we use CFMWS regions and fits within existing ACPI spec.
> > >
> > > The only reason I can immediately think of to not do this, is that it might be
> > > a pain to later change over to dynamic numa node allocation in a fashion that
> > > then ignores SRAT entries.  Probably a solvable problem.
> >
> > Interesting, yes, that works for expanding the NUMA node number space.
> > However, if you populate SRAT what do you put in the corresponding
> > HMAT entries? In the case of dynamic CXL regions the driver is going
> > to generate the equivalent of the corresponding HMAT data based on
> > what devices it decides to place in that range. I actually do not know
> > what happens with HMAT today for memory hotplug, but I suspect there
> > are less degrees of freedom of what might populate those ranges than
> > what CXL allows, and there is a chance to pre-populate the HMAT for
> > future hotplug.
>
> So... There are two answers to that question as I understand it.
>
> 1) What Linux does is nothing.  You get whatever was in HMAT to start with.
>    Worth noting that HMAT doesn't need to be in any sense 'complete' so it
>    is possible there was nothing there with a target in this NUMA node.
>
> 2) What ACPI intends to happen if anyone implements it.  There is an event
> notification for this..
>
> "Heterogeneous Memory Attributes Update. Dynamic reconfiguration of
> the system may cause existing latency, bandwidth or memory side caching
> attribute to change. The platform software issues the Heterogeneous
> Memory Attributes Update notification to a point on a device tree to
> indicate to OSPM that it needs to invoke the _HMA objects associated
> with the Heterogeneous Memory Attributes on the device tree starting
> from the point notified."
>
> So call an AML method in DSDT for which ever device has a notification event.
> A similar dance is actually implemented for NFIT updates and Linux
> actually implements that handling for that one.

Oh, yes, I know; I helped implement it, and I think it is objectively a
terrible interface because you have no idea what changed or is allowed
to change. _HMA is slightly less problematic because it can't invent
new ranges, but I have low expectations that the BIOS would be able to
coordinate properly with an OS that has done dynamic re-configuration
of the CXL topology. The OS could tell the BIOS what to put in the
HMAT, but it might as well skip that step and just have the OS populate
its parsed copy of the HMAT data. I like your "just don't publish HMAT
entries for these ranges" observation better.
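
A sketch of the "OS populates its own copy" idea: once the CXL driver has
worked out effective latency and bandwidth for a region, it can publish them
through the existing node interface rather than round-tripping through an
_HMA update. node_set_perf_attrs() and struct node_hmem_attrs are real kernel
interfaces; the wrapper and the numbers are assumptions for illustration.

#include <linux/node.h>

/* Illustrative: publish driver-computed performance data for a CXL node. */
static void cxl_publish_node_perf(unsigned int nid)
{
	struct node_hmem_attrs attrs = {
		.read_latency	 = 250,		/* ns, assumed */
		.write_latency	 = 250,
		.read_bandwidth	 = 16000,	/* MiB/s, assumed */
		.write_bandwidth = 16000,
	};

	node_set_perf_attrs(nid, &attrs, 0 /* access class 0 */);
}
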
Jonathan Cameron Nov. 18, 2021, 6:18 p.m. UTC | #5
On Thu, 18 Nov 2021 10:10:18 -0800
Dan Williams <dan.j.williams@intel.com> wrote:

> On Thu, Nov 18, 2021 at 9:54 AM Jonathan Cameron
> <Jonathan.Cameron@huawei.com> wrote:
> >
> > On Thu, 18 Nov 2021 09:14:07 -0800
> > Dan Williams <dan.j.williams@intel.com> wrote:
> >  
> > > On Thu, Nov 18, 2021 at 5:12 AM Jonathan Cameron
> > > <Jonathan.Cameron@huawei.com> wrote:  
> > > >
> > > > On Fri, 29 Oct 2021 12:51:59 -0700
> > > > Dan Williams <dan.j.williams@intel.com> wrote:
> > > >  
> > > > > From: Alison Schofield <alison.schofield@intel.com>
> > > > >
> > > > > During NUMA init, CXL memory defined in the SRAT Memory Affinity
> > > > > subtable may be assigned to a NUMA node. Since there is no
> > > > > requirement that the SRAT be comprehensive for CXL memory another
> > > > > mechanism is needed to assign NUMA nodes to CXL memory not identified
> > > > > in the SRAT.
> > > > >
> > > > > Use the CXL Fixed Memory Window Structure (CFMWS) of the ACPI CXL
> > > > > Early Discovery Table (CEDT) to find all CXL memory ranges.
> > > > > Create a NUMA node for each CFMWS that is not already assigned to
> > > > > a NUMA node. Add a memblk attaching its host physical address
> > > > > range to the node.
> > > > >
> > > > > Note that these ranges may not actually map any memory at boot time.
> > > > > They may describe persistent capacity or may be present to enable
> > > > > hot-plug.
> > > > >
> > > > > Consumers can use phys_to_target_node() to discover the NUMA node.
> > > > >
> > > > > Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> > > > > Signed-off-by: Dan Williams <dan.j.williams@intel.com>  
> > > > Hi,
> > > >
> > > > I was just discussing this with one of our firmware / ACPI experts and he asked
> > > > an interesting question.   If you want to have CFMWS regions correspond to
> > > > new NUMA nodes, why not put them in SRAT as hotpluggable memory, but have none
> > > > present in the memory map (whichever route you use to get that)?
> > > > We do this for normal memory hotplug as (via the other discussion on qemu virtio-mem
> > > > nodes) apparently does qemu.
> > > >
> > > > https://lore.kernel.org/all/655c65af-fd7a-8007-37b3-a56c60a0ec5b@redhat.com/
> > > >
> > > > This doesn't solve the question of whether we have enough nodes, but it's
> > > > not worse than if we use CFMWS regions and fits within existing ACPI spec.
> > > >
> > > > The only reason I can immediately think of to not do this, is that it might be
> > > > a pain to later change over to dynamic numa node allocation in a fashion that
> > > > then ignores SRAT entries.  Probably a solvable problem.  
> > >
> > > Interesting, yes, that works for expanding the NUMA node number space.
> > > However, if you populate SRAT what do you put in the corresponding
> > > HMAT entries? In the case of dynamic CXL regions the driver is going
> > > to generate the equivalent of the corresponding HMAT data based on
> > > what devices it decides to place in that range. I actually do not know
> > > what happens with HMAT today for memory hotplug, but I suspect there
> > > are less degrees of freedom of what might populate those ranges than
> > > what CXL allows, and there is a chance to pre-populate the HMAT for
> > > future hotplug.  
> >
> > So... There are two answers to that question as I understand it.
> >
> > 1) What Linux does is nothing.  You get whatever was in HMAT to start with.
> >    Worth noting that HMAT doesn't need to be in any sense 'complete' so it
> >    is possible there was nothing there with a target in this NUMA node.
> >
> > 2) What ACPI intends to happen if anyone implements it.  There is an event
> > notification for this..
> >
> > "Heterogeneous Memory Attributes Update. Dynamic reconfiguration of
> > the system may cause existing latency, bandwidth or memory side caching
> > attribute to change. The platform software issues the Heterogeneous
> > Memory Attributes Update notification to a point on a device tree to
> > indicate to OSPM that it needs to invoke the _HMA objects associated
> > with the Heterogeneous Memory Attributes on the device tree starting
> > from the point notified."
> >
> > So call an AML method in DSDT for which ever device has a notification event.
> > A similar dance is actually implemented for NFIT updates and Linux
> > actually implements that handling for that one.  
> 
> Oh, yes, I know I helped implement it, and I think it is objectively a
> terrible interface because you have no idea what changed or is allowed
> to change. _HMA is slightly less problematic because it can't invent
> new ranges, but I have low expectations that the BIOS would be able to
> coordinate properly with the OS that has done the dynamic
> re-configuration of the CXL topology. The OS could tell the BIOS what
> to put in the HMAT, but might as well skip that step and just have the
> OS populate its parsed copy of the HMAT data. I like your "just don't
> publish HMAT for entries for these ranges" observation better.

I absolutely agree and wouldn't suggest using _HMA for CXL devices.
The OS can do a better job (I'm sure the firmware folks will disagree).
Nothing says we have to take any notice even if there is such an update :)

It's more reasonable for DIMM hotplug, where some level of firmware is heavily
involved already, or possibly for virtualization, if it didn't know the right
answer at boot for some reason. Then a hotplug controller / management
controller can fill in the details.

Jonathan

Patch

diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c
index b8795fc49097..66a0142dc78c 100644
--- a/drivers/acpi/numa/srat.c
+++ b/drivers/acpi/numa/srat.c
@@ -298,6 +298,47 @@  acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 out_err:
 	return -EINVAL;
 }
+
+static int __init acpi_parse_cfmws(union acpi_subtable_headers *header,
+				   void *arg, const unsigned long table_end)
+{
+	struct acpi_cedt_cfmws *cfmws;
+	int *fake_pxm = arg;
+	u64 start, end;
+	int node;
+
+	cfmws = (struct acpi_cedt_cfmws *)header;
+	start = cfmws->base_hpa;
+	end = cfmws->base_hpa + cfmws->window_size;
+
+	/* Skip if the SRAT already described the NUMA details for this HPA */
+	node = phys_to_target_node(start);
+	if (node != NUMA_NO_NODE)
+		return 0;
+
+	node = acpi_map_pxm_to_node(*fake_pxm);
+
+	if (node == NUMA_NO_NODE) {
+		pr_err("ACPI NUMA: Too many proximity domains while processing CFMWS.\n");
+		return -EINVAL;
+	}
+
+	if (numa_add_memblk(node, start, end) < 0) {
+		/* CXL driver must handle the NUMA_NO_NODE case */
+		pr_warn("ACPI NUMA: Failed to add memblk for CFMWS node %d [mem %#llx-%#llx]\n",
+			node, start, end);
+	}
+
+	/* Set the next available fake_pxm value */
+	(*fake_pxm)++;
+	return 0;
+}
+#else
+static int __init acpi_parse_cfmws(union acpi_subtable_headers *header,
+				   void *arg, const unsigned long table_end)
+{
+	return 0;
+}
 #endif /* defined(CONFIG_X86) || defined (CONFIG_ARM64) */
 
 static int __init acpi_parse_slit(struct acpi_table_header *table)
@@ -442,7 +483,7 @@  acpi_table_parse_srat(enum acpi_srat_type id,
 
 int __init acpi_numa_init(void)
 {
-	int cnt = 0;
+	int i, fake_pxm, cnt = 0;
 
 	if (acpi_disabled)
 		return -EINVAL;
@@ -478,6 +519,22 @@  int __init acpi_numa_init(void)
 	/* SLIT: System Locality Information Table */
 	acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit);
 
+	/*
+	 * CXL Fixed Memory Window Structures (CFMWS) must be parsed
+	 * after the SRAT. Create NUMA Nodes for CXL memory ranges that
+	 * are defined in the CFMWS and not already defined in the SRAT.
+	 * Initialize a fake_pxm as the first available PXM to emulate.
+	 */
+
+	/* fake_pxm is the next unused PXM value after SRAT parsing */
+	for (i = 0, fake_pxm = -1; i < MAX_NUMNODES - 1; i++) {
+		if (node_to_pxm_map[i] > fake_pxm)
+			fake_pxm = node_to_pxm_map[i];
+	}
+	fake_pxm++;
+	acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, acpi_parse_cfmws,
+			      &fake_pxm);
+
 	if (cnt < 0)
 		return cnt;
 	else if (!parsed_numa_memblks)
diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
index 91e4072e7649..3163167ecc3a 100644
--- a/drivers/cxl/acpi.c
+++ b/drivers/cxl/acpi.c
@@ -125,7 +125,8 @@  static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg,
 			cfmws->base_hpa + cfmws->window_size - 1);
 		return 0;
 	}
-	dev_dbg(dev, "add: %s range %#llx-%#llx\n", dev_name(&cxld->dev),
+	dev_dbg(dev, "add: %s node: %d range %#llx-%#llx\n",
+		dev_name(&cxld->dev), phys_to_target_node(cxld->range.start),
 		cfmws->base_hpa, cfmws->base_hpa + cfmws->window_size - 1);
 
 	return 0;