Message ID | 20211019050908.449231-1-alison.schofield@intel.com |
---|---|
State | New, archived |
Headers | show |
Series | [v3] ACPI: NUMA: Add a node and memblk for each CFMWS not in SRAT | expand |
On Tue, Oct 19, 2021 at 7:01 AM <alison.schofield@intel.com> wrote: > > From: Alison Schofield <alison.schofield@intel.com> > > During NUMA init, CXL memory defined in the SRAT Memory Affinity > subtable may be assigned to a NUMA node. Since there is no > requirement that the SRAT be comprehensive for CXL memory another > mechanism is needed to assign NUMA nodes to CXL memory not identified > in the SRAT. > > Use the CXL Fixed Memory Window Structure (CFMWS) of the ACPI CXL > Early Discovery Table (CEDT) to find all CXL memory ranges. > Create a NUMA node for each CFMWS that is not already assigned to > a NUMA node. Add a memblk attaching its host physical address > range to the node. > > Note that these ranges may not actually map any memory at boot time. > They may describe persistent capacity or may be present to enable > hot-plug. > > Consumers can use phys_to_target_node() to discover the NUMA node. > > Signed-off-by: Alison Schofield <alison.schofield@intel.com> Dan, this seems to fall into your territory. > --- > Changes in v3: > - Initial variable pxm (Ben) > > Changes in v2: > - Use MAX_NUMNODES as max value when searching node_to_pxm_map() (0-day) > - Add braces around single statement for loop (coding style) > - Rename acpi_parse_cfmws() to acpi_cxl_cfmws_init to be more like other > functions in this file doing similar work. > - Comments: remove superflous and state importance of the init order, > CFMWS after SRAT, (Ira, Dan) > - Add prototype for numa_add_memblk() (0-day) > > drivers/acpi/numa/srat.c | 70 ++++++++++++++++++++++++++++++++++++++++ > drivers/cxl/acpi.c | 8 +++-- > include/linux/acpi.h | 1 + > 3 files changed, 76 insertions(+), 3 deletions(-) > > diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c > index b8795fc49097..daaaef58f1e6 100644 > --- a/drivers/acpi/numa/srat.c > +++ b/drivers/acpi/numa/srat.c > @@ -300,6 +300,67 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) > } > #endif /* defined(CONFIG_X86) || defined (CONFIG_ARM64) */ > > +static int __init acpi_cxl_cfmws_init(struct acpi_table_header *acpi_cedt) > +{ > + struct acpi_cedt_cfmws *cfmws; > + acpi_size len, cur = 0; > + int i, node, pxm = 0; > + void *cedt_subtable; > + u64 start, end; > + > + /* Find the max PXM defined in the SRAT */ > + for (i = 0; i < MAX_NUMNODES - 1; i++) { > + if (node_to_pxm_map[i] > pxm) > + pxm = node_to_pxm_map[i]; > + } > + /* Start assigning fake PXM values after the SRAT max PXM */ > + pxm++; > + > + len = acpi_cedt->length - sizeof(*acpi_cedt); > + cedt_subtable = acpi_cedt + 1; > + > + while (cur < len) { > + struct acpi_cedt_header *c = cedt_subtable + cur; > + > + if (c->type != ACPI_CEDT_TYPE_CFMWS) > + goto next; > + > + cfmws = cedt_subtable + cur; > + if (cfmws->header.length < sizeof(*cfmws)) { > + pr_warn_once("CFMWS entry skipped:invalid length:%u\n", > + cfmws->header.length); > + goto next; > + } > + > + start = cfmws->base_hpa; > + end = cfmws->base_hpa + cfmws->window_size; > + > + /* > + * Skip if the SRAT already described > + * the NUMA details for this HPA. > + */ > + node = phys_to_target_node(start); > + if (node != NUMA_NO_NODE) > + goto next; > + > + node = acpi_map_pxm_to_node(pxm); > + if (node == NUMA_NO_NODE) { > + pr_err("ACPI NUMA: Too many proximity domains.\n"); > + return -EINVAL; > + } > + > + if (numa_add_memblk(node, start, end) < 0) { > + /* CXL driver must handle the NUMA_NO_NODE case */ > + pr_warn("ACPI NUMA: Failed to add memblk for CFMWS node %d [mem %#llx-%#llx]\n", > + node, start, end); > + } > + pxm++; > +next: > + cur += c->length; > + } > + return 0; > +} > + > static int __init acpi_parse_slit(struct acpi_table_header *table) > { > struct acpi_table_slit *slit = (struct acpi_table_slit *)table; > @@ -478,6 +539,15 @@ int __init acpi_numa_init(void) > /* SLIT: System Locality Information Table */ > acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit); > > + /* > + * CEDT: CXL Fixed Memory Window Structures (CFMWS) > + * must be parsed after the SRAT. It creates NUMA > + * Nodes for CXL memory ranges not already defined > + * in the SRAT and it assigns PXMs after the max PXM > + * defined in the SRAT. > + */ > + acpi_table_parse(ACPI_SIG_CEDT, acpi_cxl_cfmws_init); acpi_table_parse() creates a memory mapping for accessing the table. so it usually is good to call acpi_put_table() when you are done with it to let that mapping go away. > + > if (cnt < 0) > return cnt; > else if (!parsed_numa_memblks) > diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c > index 10120e4e0b9f..ccf73f0a59a7 100644 > --- a/drivers/cxl/acpi.c > +++ b/drivers/cxl/acpi.c > @@ -122,9 +122,11 @@ static void cxl_add_cfmws_decoders(struct device *dev, > cfmws->base_hpa, cfmws->base_hpa + > cfmws->window_size - 1); > } else { > - dev_dbg(dev, "add: %s range %#llx-%#llx\n", > - dev_name(&cxld->dev), cfmws->base_hpa, > - cfmws->base_hpa + cfmws->window_size - 1); > + dev_dbg(dev, "add: %s node: %d range %#llx-%#llx\n", > + dev_name(&cxld->dev), > + phys_to_target_node(cxld->range.start), > + cfmws->base_hpa, > + cfmws->base_hpa + cfmws->window_size - 1); > } > cur += c->length; > } > diff --git a/include/linux/acpi.h b/include/linux/acpi.h > index 974d497a897d..f837fd715440 100644 > --- a/include/linux/acpi.h > +++ b/include/linux/acpi.h > @@ -426,6 +426,7 @@ extern bool acpi_osi_is_win8(void); > #ifdef CONFIG_ACPI_NUMA > int acpi_map_pxm_to_node(int pxm); > int acpi_get_node(acpi_handle handle); > +int __init numa_add_memblk(int nodeid, u64 start, u64 end); > > /** > * pxm_to_online_node - Map proximity ID to online node > > base-commit: 64570fbc14f8d7cb3fe3995f20e26bc25ce4b2cc > prerequisite-patch-id: f09c67c6b3801f511521fd29c1cc01f5c5b1e9de > --
On Wed, Oct 20, 2021 at 5:26 PM Rafael J. Wysocki <rafael@kernel.org> wrote: > > On Tue, Oct 19, 2021 at 7:01 AM <alison.schofield@intel.com> wrote: > > > > From: Alison Schofield <alison.schofield@intel.com> > > > > During NUMA init, CXL memory defined in the SRAT Memory Affinity > > subtable may be assigned to a NUMA node. Since there is no > > requirement that the SRAT be comprehensive for CXL memory another > > mechanism is needed to assign NUMA nodes to CXL memory not identified > > in the SRAT. > > > > Use the CXL Fixed Memory Window Structure (CFMWS) of the ACPI CXL > > Early Discovery Table (CEDT) to find all CXL memory ranges. > > Create a NUMA node for each CFMWS that is not already assigned to > > a NUMA node. Add a memblk attaching its host physical address > > range to the node. > > > > Note that these ranges may not actually map any memory at boot time. > > They may describe persistent capacity or may be present to enable > > hot-plug. > > > > Consumers can use phys_to_target_node() to discover the NUMA node. > > > > Signed-off-by: Alison Schofield <alison.schofield@intel.com> > > Dan, this seems to fall into your territory. > > > --- > > Changes in v3: > > - Initial variable pxm (Ben) > > > > Changes in v2: > > - Use MAX_NUMNODES as max value when searching node_to_pxm_map() (0-day) > > - Add braces around single statement for loop (coding style) > > - Rename acpi_parse_cfmws() to acpi_cxl_cfmws_init to be more like other > > functions in this file doing similar work. > > - Comments: remove superflous and state importance of the init order, > > CFMWS after SRAT, (Ira, Dan) > > - Add prototype for numa_add_memblk() (0-day) > > > > drivers/acpi/numa/srat.c | 70 ++++++++++++++++++++++++++++++++++++++++ > > drivers/cxl/acpi.c | 8 +++-- > > include/linux/acpi.h | 1 + > > 3 files changed, 76 insertions(+), 3 deletions(-) > > > > diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c > > index b8795fc49097..daaaef58f1e6 100644 > > --- a/drivers/acpi/numa/srat.c > > +++ b/drivers/acpi/numa/srat.c > > @@ -300,6 +300,67 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) > > } > > #endif /* defined(CONFIG_X86) || defined (CONFIG_ARM64) */ > > > > +static int __init acpi_cxl_cfmws_init(struct acpi_table_header *acpi_cedt) > > +{ > > + struct acpi_cedt_cfmws *cfmws; > > + acpi_size len, cur = 0; > > + int i, node, pxm = 0; > > + void *cedt_subtable; > > + u64 start, end; > > + > > + /* Find the max PXM defined in the SRAT */ > > + for (i = 0; i < MAX_NUMNODES - 1; i++) { > > + if (node_to_pxm_map[i] > pxm) > > + pxm = node_to_pxm_map[i]; > > + } > > + /* Start assigning fake PXM values after the SRAT max PXM */ > > + pxm++; > > + > > + len = acpi_cedt->length - sizeof(*acpi_cedt); > > + cedt_subtable = acpi_cedt + 1; > > + > > + while (cur < len) { > > + struct acpi_cedt_header *c = cedt_subtable + cur; > > + > > + if (c->type != ACPI_CEDT_TYPE_CFMWS) > > + goto next; > > + > > + cfmws = cedt_subtable + cur; > > + if (cfmws->header.length < sizeof(*cfmws)) { > > + pr_warn_once("CFMWS entry skipped:invalid length:%u\n", > > + cfmws->header.length); > > + goto next; > > + } > > + > > + start = cfmws->base_hpa; > > + end = cfmws->base_hpa + cfmws->window_size; > > + > > + /* > > + * Skip if the SRAT already described > > + * the NUMA details for this HPA. > > + */ > > + node = phys_to_target_node(start); > > + if (node != NUMA_NO_NODE) > > + goto next; > > + > > + node = acpi_map_pxm_to_node(pxm); > > + if (node == NUMA_NO_NODE) { > > + pr_err("ACPI NUMA: Too many proximity domains.\n"); > > + return -EINVAL; > > + } > > + > > + if (numa_add_memblk(node, start, end) < 0) { > > + /* CXL driver must handle the NUMA_NO_NODE case */ > > + pr_warn("ACPI NUMA: Failed to add memblk for CFMWS node %d [mem %#llx-%#llx]\n", > > + node, start, end); > > + } > > + pxm++; > > +next: > > + cur += c->length; > > + } > > + return 0; > > +} > > + > > static int __init acpi_parse_slit(struct acpi_table_header *table) > > { > > struct acpi_table_slit *slit = (struct acpi_table_slit *)table; > > @@ -478,6 +539,15 @@ int __init acpi_numa_init(void) > > /* SLIT: System Locality Information Table */ > > acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit); > > > > + /* > > + * CEDT: CXL Fixed Memory Window Structures (CFMWS) > > + * must be parsed after the SRAT. It creates NUMA > > + * Nodes for CXL memory ranges not already defined > > + * in the SRAT and it assigns PXMs after the max PXM > > + * defined in the SRAT. > > + */ > > + acpi_table_parse(ACPI_SIG_CEDT, acpi_cxl_cfmws_init); > > acpi_table_parse() creates a memory mapping for accessing the table. > so it usually is good to call acpi_put_table() when you are done with > it to let that mapping go away. Sorry, this isn't right. acpi_table_parse() calls acpi_put_table() by itself, so scratch the above.
Hi Alison, > -----Original Message----- > From: alison.schofield@intel.com <alison.schofield@intel.com> > Sent: Tuesday, October 19, 2021 12:09 AM > To: Rafael J. Wysocki <rafael@kernel.org>; Len Brown <lenb@kernel.org>; > Vishal Verma <vishal.l.verma@intel.com>; Ira Weiny <ira.weiny@intel.com>; > Ben Widawsky <ben.widawsky@intel.com>; Dan Williams > <dan.j.williams@intel.com> > Cc: Alison Schofield <alison.schofield@intel.com>; linux-cxl@vger.kernel.org; > linux-acpi@vger.kernel.org > Subject: [PATCH v3] ACPI: NUMA: Add a node and memblk for each CFMWS > not in SRAT > > From: Alison Schofield <alison.schofield@intel.com> > > During NUMA init, CXL memory defined in the SRAT Memory Affinity > subtable may be assigned to a NUMA node. Since there is no requirement > that the SRAT be comprehensive for CXL memory another mechanism is > needed to assign NUMA nodes to CXL memory not identified in the SRAT. > > Use the CXL Fixed Memory Window Structure (CFMWS) of the ACPI CXL Early > Discovery Table (CEDT) to find all CXL memory ranges. > Create a NUMA node for each CFMWS that is not already assigned to a > NUMA node. Add a memblk attaching its host physical address range to the > node. > > Note that these ranges may not actually map any memory at boot time. > They may describe persistent capacity or may be present to enable hot-plug. > > Consumers can use phys_to_target_node() to discover the NUMA node. Does this patch work for CXL type 2 memory which is not in SRAT? A type 2 driver can find its HDM BASE physical address from its CXL registers and figure out its NUMA node id by calling phys_to_target_node? Or is type 2 HDM currently being skipped altogether? Thanks > > Signed-off-by: Alison Schofield <alison.schofield@intel.com> > --- > Changes in v3: > - Initial variable pxm (Ben) > > Changes in v2: > - Use MAX_NUMNODES as max value when searching node_to_pxm_map() > (0-day) > - Add braces around single statement for loop (coding style) > - Rename acpi_parse_cfmws() to acpi_cxl_cfmws_init to be more like other > functions in this file doing similar work. > - Comments: remove superflous and state importance of the init order, > CFMWS after SRAT, (Ira, Dan) > - Add prototype for numa_add_memblk() (0-day) > > drivers/acpi/numa/srat.c | 70 > ++++++++++++++++++++++++++++++++++++++++ > drivers/cxl/acpi.c | 8 +++-- > include/linux/acpi.h | 1 + > 3 files changed, 76 insertions(+), 3 deletions(-) > > diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index > b8795fc49097..daaaef58f1e6 100644 > --- a/drivers/acpi/numa/srat.c > +++ b/drivers/acpi/numa/srat.c > @@ -300,6 +300,67 @@ acpi_numa_memory_affinity_init(struct > acpi_srat_mem_affinity *ma) } #endif /* defined(CONFIG_X86) || defined > (CONFIG_ARM64) */ > > +static int __init acpi_cxl_cfmws_init(struct acpi_table_header > +*acpi_cedt) { > + struct acpi_cedt_cfmws *cfmws; > + acpi_size len, cur = 0; > + int i, node, pxm = 0; > + void *cedt_subtable; > + u64 start, end; > + > + /* Find the max PXM defined in the SRAT */ > + for (i = 0; i < MAX_NUMNODES - 1; i++) { > + if (node_to_pxm_map[i] > pxm) > + pxm = node_to_pxm_map[i]; > + } > + /* Start assigning fake PXM values after the SRAT max PXM */ > + pxm++; > + > + len = acpi_cedt->length - sizeof(*acpi_cedt); > + cedt_subtable = acpi_cedt + 1; > + > + while (cur < len) { > + struct acpi_cedt_header *c = cedt_subtable + cur; > + > + if (c->type != ACPI_CEDT_TYPE_CFMWS) > + goto next; > + > + cfmws = cedt_subtable + cur; > + if (cfmws->header.length < sizeof(*cfmws)) { > + pr_warn_once("CFMWS entry skipped:invalid length:%u\n", > + cfmws->header.length); > + goto next; > + } > + > + start = cfmws->base_hpa; > + end = cfmws->base_hpa + cfmws->window_size; > + > + /* > + * Skip if the SRAT already described > + * the NUMA details for this HPA. > + */ > + node = phys_to_target_node(start); > + if (node != NUMA_NO_NODE) > + goto next; > + > + node = acpi_map_pxm_to_node(pxm); > + if (node == NUMA_NO_NODE) { > + pr_err("ACPI NUMA: Too many proximity domains.\n"); > + return -EINVAL; > + } > + > + if (numa_add_memblk(node, start, end) < 0) { > + /* CXL driver must handle the NUMA_NO_NODE case */ > + pr_warn("ACPI NUMA: Failed to add memblk for CFMWS node > %d [mem %#llx-%#llx]\n", > + node, start, end); > + } > + pxm++; > +next: > + cur += c->length; > + } > + return 0; > +} > + > static int __init acpi_parse_slit(struct acpi_table_header *table) { > struct acpi_table_slit *slit = (struct acpi_table_slit *)table; @@ -478,6 > +539,15 @@ int __init acpi_numa_init(void) > /* SLIT: System Locality Information Table */ > acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit); > > + /* > + * CEDT: CXL Fixed Memory Window Structures (CFMWS) > + * must be parsed after the SRAT. It creates NUMA > + * Nodes for CXL memory ranges not already defined > + * in the SRAT and it assigns PXMs after the max PXM > + * defined in the SRAT. > + */ > + acpi_table_parse(ACPI_SIG_CEDT, acpi_cxl_cfmws_init); > + > if (cnt < 0) > return cnt; > else if (!parsed_numa_memblks) > diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index > 10120e4e0b9f..ccf73f0a59a7 100644 > --- a/drivers/cxl/acpi.c > +++ b/drivers/cxl/acpi.c > @@ -122,9 +122,11 @@ static void cxl_add_cfmws_decoders(struct device > *dev, > cfmws->base_hpa, cfmws->base_hpa + > cfmws->window_size - 1); > } else { > - dev_dbg(dev, "add: %s range %#llx-%#llx\n", > - dev_name(&cxld->dev), cfmws->base_hpa, > - cfmws->base_hpa + cfmws->window_size - 1); > + dev_dbg(dev, "add: %s node: %d range %#llx-%#llx\n", > + dev_name(&cxld->dev), > + phys_to_target_node(cxld->range.start), > + cfmws->base_hpa, > + cfmws->base_hpa + cfmws->window_size - > + 1); > } > cur += c->length; > } > diff --git a/include/linux/acpi.h b/include/linux/acpi.h index > 974d497a897d..f837fd715440 100644 > --- a/include/linux/acpi.h > +++ b/include/linux/acpi.h > @@ -426,6 +426,7 @@ extern bool acpi_osi_is_win8(void); #ifdef > CONFIG_ACPI_NUMA int acpi_map_pxm_to_node(int pxm); int > acpi_get_node(acpi_handle handle); > +int __init numa_add_memblk(int nodeid, u64 start, u64 end); > > /** > * pxm_to_online_node - Map proximity ID to online node > > base-commit: 64570fbc14f8d7cb3fe3995f20e26bc25ce4b2cc > prerequisite-patch-id: f09c67c6b3801f511521fd29c1cc01f5c5b1e9de > -- > 2.31.1
On Wed, Oct 20, 2021 at 10:03:58PM +0000, Vikram Sethi wrote: > Hi Alison, > > > -----Original Message----- > > From: alison.schofield@intel.com <alison.schofield@intel.com> > > Sent: Tuesday, October 19, 2021 12:09 AM > > To: Rafael J. Wysocki <rafael@kernel.org>; Len Brown <lenb@kernel.org>; > > Vishal Verma <vishal.l.verma@intel.com>; Ira Weiny <ira.weiny@intel.com>; > > Ben Widawsky <ben.widawsky@intel.com>; Dan Williams > > <dan.j.williams@intel.com> > > Cc: Alison Schofield <alison.schofield@intel.com>; linux-cxl@vger.kernel.org; > > linux-acpi@vger.kernel.org > > Subject: [PATCH v3] ACPI: NUMA: Add a node and memblk for each CFMWS > > not in SRAT > > > > From: Alison Schofield <alison.schofield@intel.com> > > > > During NUMA init, CXL memory defined in the SRAT Memory Affinity > > subtable may be assigned to a NUMA node. Since there is no requirement > > that the SRAT be comprehensive for CXL memory another mechanism is > > needed to assign NUMA nodes to CXL memory not identified in the SRAT. > > > > Use the CXL Fixed Memory Window Structure (CFMWS) of the ACPI CXL Early > > Discovery Table (CEDT) to find all CXL memory ranges. > > Create a NUMA node for each CFMWS that is not already assigned to a > > NUMA node. Add a memblk attaching its host physical address range to the > > node. > > > > Note that these ranges may not actually map any memory at boot time. > > They may describe persistent capacity or may be present to enable hot-plug. > > > > Consumers can use phys_to_target_node() to discover the NUMA node. > > Does this patch work for CXL type 2 memory which is not in SRAT? A type 2 driver > can find its HDM BASE physical address from its CXL registers and figure out > its NUMA node id by calling phys_to_target_node? Yes. This adds the nodes for the case where the BIOS doesn't fully describe everything in CFMWS in the SRAT. And, yes, that is how the NUMA node can be discovered. > Or is type 2 HDM currently being skipped altogether? Not sure what you mean by 'being skipped altogether'? The BIOS may describe (all or none or some) of CXL Memory in the SRAT. In the case where BIOS describes it all, NUMA nodes will already exist, and no new nodes will be added here. Alison > > Thanks > > > > Signed-off-by: Alison Schofield <alison.schofield@intel.com> > > --- > > Changes in v3: > > - Initial variable pxm (Ben) > > > > Changes in v2: > > - Use MAX_NUMNODES as max value when searching node_to_pxm_map() > > (0-day) > > - Add braces around single statement for loop (coding style) > > - Rename acpi_parse_cfmws() to acpi_cxl_cfmws_init to be more like other > > functions in this file doing similar work. > > - Comments: remove superflous and state importance of the init order, > > CFMWS after SRAT, (Ira, Dan) > > - Add prototype for numa_add_memblk() (0-day) > > > > drivers/acpi/numa/srat.c | 70 > > ++++++++++++++++++++++++++++++++++++++++ > > drivers/cxl/acpi.c | 8 +++-- > > include/linux/acpi.h | 1 + > > 3 files changed, 76 insertions(+), 3 deletions(-) > > > > diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index > > b8795fc49097..daaaef58f1e6 100644 > > --- a/drivers/acpi/numa/srat.c > > +++ b/drivers/acpi/numa/srat.c > > @@ -300,6 +300,67 @@ acpi_numa_memory_affinity_init(struct > > acpi_srat_mem_affinity *ma) } #endif /* defined(CONFIG_X86) || defined > > (CONFIG_ARM64) */ > > > > +static int __init acpi_cxl_cfmws_init(struct acpi_table_header > > +*acpi_cedt) { > > + struct acpi_cedt_cfmws *cfmws; > > + acpi_size len, cur = 0; > > + int i, node, pxm = 0; > > + void *cedt_subtable; > > + u64 start, end; > > + > > + /* Find the max PXM defined in the SRAT */ > > + for (i = 0; i < MAX_NUMNODES - 1; i++) { > > + if (node_to_pxm_map[i] > pxm) > > + pxm = node_to_pxm_map[i]; > > + } > > + /* Start assigning fake PXM values after the SRAT max PXM */ > > + pxm++; > > + > > + len = acpi_cedt->length - sizeof(*acpi_cedt); > > + cedt_subtable = acpi_cedt + 1; > > + > > + while (cur < len) { > > + struct acpi_cedt_header *c = cedt_subtable + cur; > > + > > + if (c->type != ACPI_CEDT_TYPE_CFMWS) > > + goto next; > > + > > + cfmws = cedt_subtable + cur; > > + if (cfmws->header.length < sizeof(*cfmws)) { > > + pr_warn_once("CFMWS entry skipped:invalid length:%u\n", > > + cfmws->header.length); > > + goto next; > > + } > > + > > + start = cfmws->base_hpa; > > + end = cfmws->base_hpa + cfmws->window_size; > > + > > + /* > > + * Skip if the SRAT already described > > + * the NUMA details for this HPA. > > + */ > > + node = phys_to_target_node(start); > > + if (node != NUMA_NO_NODE) > > + goto next; > > + > > + node = acpi_map_pxm_to_node(pxm); > > + if (node == NUMA_NO_NODE) { > > + pr_err("ACPI NUMA: Too many proximity domains.\n"); > > + return -EINVAL; > > + } > > + > > + if (numa_add_memblk(node, start, end) < 0) { > > + /* CXL driver must handle the NUMA_NO_NODE case */ > > + pr_warn("ACPI NUMA: Failed to add memblk for CFMWS node > > %d [mem %#llx-%#llx]\n", > > + node, start, end); > > + } > > + pxm++; > > +next: > > + cur += c->length; > > + } > > + return 0; > > +} > > + > > static int __init acpi_parse_slit(struct acpi_table_header *table) { > > struct acpi_table_slit *slit = (struct acpi_table_slit *)table; @@ -478,6 > > +539,15 @@ int __init acpi_numa_init(void) > > /* SLIT: System Locality Information Table */ > > acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit); > > > > + /* > > + * CEDT: CXL Fixed Memory Window Structures (CFMWS) > > + * must be parsed after the SRAT. It creates NUMA > > + * Nodes for CXL memory ranges not already defined > > + * in the SRAT and it assigns PXMs after the max PXM > > + * defined in the SRAT. > > + */ > > + acpi_table_parse(ACPI_SIG_CEDT, acpi_cxl_cfmws_init); > > + > > if (cnt < 0) > > return cnt; > > else if (!parsed_numa_memblks) > > diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index > > 10120e4e0b9f..ccf73f0a59a7 100644 > > --- a/drivers/cxl/acpi.c > > +++ b/drivers/cxl/acpi.c > > @@ -122,9 +122,11 @@ static void cxl_add_cfmws_decoders(struct device > > *dev, > > cfmws->base_hpa, cfmws->base_hpa + > > cfmws->window_size - 1); > > } else { > > - dev_dbg(dev, "add: %s range %#llx-%#llx\n", > > - dev_name(&cxld->dev), cfmws->base_hpa, > > - cfmws->base_hpa + cfmws->window_size - 1); > > + dev_dbg(dev, "add: %s node: %d range %#llx-%#llx\n", > > + dev_name(&cxld->dev), > > + phys_to_target_node(cxld->range.start), > > + cfmws->base_hpa, > > + cfmws->base_hpa + cfmws->window_size - > > + 1); > > } > > cur += c->length; > > } > > diff --git a/include/linux/acpi.h b/include/linux/acpi.h index > > 974d497a897d..f837fd715440 100644 > > --- a/include/linux/acpi.h > > +++ b/include/linux/acpi.h > > @@ -426,6 +426,7 @@ extern bool acpi_osi_is_win8(void); #ifdef > > CONFIG_ACPI_NUMA int acpi_map_pxm_to_node(int pxm); int > > acpi_get_node(acpi_handle handle); > > +int __init numa_add_memblk(int nodeid, u64 start, u64 end); > > > > /** > > * pxm_to_online_node - Map proximity ID to online node > > > > base-commit: 64570fbc14f8d7cb3fe3995f20e26bc25ce4b2cc > > prerequisite-patch-id: f09c67c6b3801f511521fd29c1cc01f5c5b1e9de > > -- > > 2.31.1 >
> -----Original Message----- > From: Alison Schofield <alison.schofield@intel.com> > Sent: Wednesday, October 20, 2021 8:00 PM > To: Vikram Sethi <vsethi@nvidia.com> > Cc: Rafael J. Wysocki <rafael@kernel.org>; Len Brown <lenb@kernel.org>; > Vishal Verma <vishal.l.verma@intel.com>; Ira Weiny <ira.weiny@intel.com>; > Ben Widawsky <ben.widawsky@intel.com>; Dan Williams > <dan.j.williams@intel.com>; linux-cxl@vger.kernel.org; linux- > acpi@vger.kernel.org > Subject: Re: [PATCH v3] ACPI: NUMA: Add a node and memblk for each > CFMWS not in SRAT > snip > > > > > > Consumers can use phys_to_target_node() to discover the NUMA node. > > > > Does this patch work for CXL type 2 memory which is not in SRAT? A > > type 2 driver can find its HDM BASE physical address from its CXL > > registers and figure out its NUMA node id by calling phys_to_target_node? > > Yes. This adds the nodes for the case where the BIOS doesn't fully describe > everything in CFMWS in the SRAT. And, yes, that is how the NUMA node can > be discovered. > > > Or is type 2 HDM currently being skipped altogether? > > Not sure what you mean by 'being skipped altogether'? The BIOS may > describe (all or none or some) of CXL Memory in the SRAT. In the case where > BIOS describes it all, NUMA nodes will already exist, and no new nodes will > be added here. > My question about skipping type2 wasn't directly related to your patch, but more of a question about current upstream support for probe/configuration of type 2 accelerator devices memory, irrespective of whether FW shows type 2 memory in SRAT. The desired outcome is that the kernel CXL driver recognizes such type 2 HDM, and assigns it a NUMA node such that the type 2 driver can later add/online this memory, via add_memory_driver_managed which requires a NUMA node ID (which driver can discover after your patch by calling phys_to_target_node). Would the current upstream code for HDM work as described above, and if so, does it rely on CDAT DSEMTS structure showing a specific value for EFI memory type? i.e would it work if that field in DSEMTS was either EFI_CONVENTIONAL_MEMORY with EFI_MEMORY_SP, or EFI_RESERVED_MEMORY? Thanks, Vikram > Alison > > > > > Thanks > > > > > > Signed-off-by: Alison Schofield <alison.schofield@intel.com> > > > --- > > > Changes in v3: > > > - Initial variable pxm (Ben) > > > > > > Changes in v2: > > > - Use MAX_NUMNODES as max value when searching > node_to_pxm_map() > > > (0-day) > > > - Add braces around single statement for loop (coding style) > > > - Rename acpi_parse_cfmws() to acpi_cxl_cfmws_init to be more like > other > > > functions in this file doing similar work. > > > - Comments: remove superflous and state importance of the init order, > > > CFMWS after SRAT, (Ira, Dan) > > > - Add prototype for numa_add_memblk() (0-day) > > > > > > drivers/acpi/numa/srat.c | 70 > > > ++++++++++++++++++++++++++++++++++++++++ > > > drivers/cxl/acpi.c | 8 +++-- > > > include/linux/acpi.h | 1 + > > > 3 files changed, 76 insertions(+), 3 deletions(-) > > > > > > diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c > > > index > > > b8795fc49097..daaaef58f1e6 100644 > > > --- a/drivers/acpi/numa/srat.c > > > +++ b/drivers/acpi/numa/srat.c > > > @@ -300,6 +300,67 @@ acpi_numa_memory_affinity_init(struct > > > acpi_srat_mem_affinity *ma) } #endif /* defined(CONFIG_X86) || > > > defined > > > (CONFIG_ARM64) */ > > > > > > +static int __init acpi_cxl_cfmws_init(struct acpi_table_header > > > +*acpi_cedt) { > > > + struct acpi_cedt_cfmws *cfmws; > > > + acpi_size len, cur = 0; > > > + int i, node, pxm = 0; > > > + void *cedt_subtable; > > > + u64 start, end; > > > + > > > + /* Find the max PXM defined in the SRAT */ > > > + for (i = 0; i < MAX_NUMNODES - 1; i++) { > > > + if (node_to_pxm_map[i] > pxm) > > > + pxm = node_to_pxm_map[i]; > > > + } > > > + /* Start assigning fake PXM values after the SRAT max PXM */ > > > + pxm++; > > > + > > > + len = acpi_cedt->length - sizeof(*acpi_cedt); > > > + cedt_subtable = acpi_cedt + 1; > > > + > > > + while (cur < len) { > > > + struct acpi_cedt_header *c = cedt_subtable + cur; > > > + > > > + if (c->type != ACPI_CEDT_TYPE_CFMWS) > > > + goto next; > > > + > > > + cfmws = cedt_subtable + cur; > > > + if (cfmws->header.length < sizeof(*cfmws)) { > > > + pr_warn_once("CFMWS entry skipped:invalid length:%u\n", > > > + cfmws->header.length); > > > + goto next; > > > + } > > > + > > > + start = cfmws->base_hpa; > > > + end = cfmws->base_hpa + cfmws->window_size; > > > + > > > + /* > > > + * Skip if the SRAT already described > > > + * the NUMA details for this HPA. > > > + */ > > > + node = phys_to_target_node(start); > > > + if (node != NUMA_NO_NODE) > > > + goto next; > > > + > > > + node = acpi_map_pxm_to_node(pxm); > > > + if (node == NUMA_NO_NODE) { > > > + pr_err("ACPI NUMA: Too many proximity domains.\n"); > > > + return -EINVAL; > > > + } > > > + > > > + if (numa_add_memblk(node, start, end) < 0) { > > > + /* CXL driver must handle the NUMA_NO_NODE case */ > > > + pr_warn("ACPI NUMA: Failed to add memblk for > > > + CFMWS node > > > %d [mem %#llx-%#llx]\n", > > > + node, start, end); > > > + } > > > + pxm++; > > > +next: > > > + cur += c->length; > > > + } > > > + return 0; > > > +} > > > + > > > static int __init acpi_parse_slit(struct acpi_table_header *table) { > > > struct acpi_table_slit *slit = (struct acpi_table_slit > > > *)table; @@ -478,6 > > > +539,15 @@ int __init acpi_numa_init(void) > > > /* SLIT: System Locality Information Table */ > > > acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit); > > > > > > + /* > > > + * CEDT: CXL Fixed Memory Window Structures (CFMWS) > > > + * must be parsed after the SRAT. It creates NUMA > > > + * Nodes for CXL memory ranges not already defined > > > + * in the SRAT and it assigns PXMs after the max PXM > > > + * defined in the SRAT. > > > + */ > > > + acpi_table_parse(ACPI_SIG_CEDT, acpi_cxl_cfmws_init); > > > + > > > if (cnt < 0) > > > return cnt; > > > else if (!parsed_numa_memblks) diff --git > > > a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index > > > 10120e4e0b9f..ccf73f0a59a7 100644 > > > --- a/drivers/cxl/acpi.c > > > +++ b/drivers/cxl/acpi.c > > > @@ -122,9 +122,11 @@ static void cxl_add_cfmws_decoders(struct > > > device *dev, > > > cfmws->base_hpa, cfmws->base_hpa + > > > cfmws->window_size - 1); > > > } else { > > > - dev_dbg(dev, "add: %s range %#llx-%#llx\n", > > > - dev_name(&cxld->dev), cfmws->base_hpa, > > > - cfmws->base_hpa + cfmws->window_size - 1); > > > + dev_dbg(dev, "add: %s node: %d range %#llx-%#llx\n", > > > + dev_name(&cxld->dev), > > > + phys_to_target_node(cxld->range.start), > > > + cfmws->base_hpa, > > > + cfmws->base_hpa + cfmws->window_size > > > + - 1); > > > } > > > cur += c->length; > > > } > > > diff --git a/include/linux/acpi.h b/include/linux/acpi.h index > > > 974d497a897d..f837fd715440 100644 > > > --- a/include/linux/acpi.h > > > +++ b/include/linux/acpi.h > > > @@ -426,6 +426,7 @@ extern bool acpi_osi_is_win8(void); #ifdef > > > CONFIG_ACPI_NUMA int acpi_map_pxm_to_node(int pxm); int > > > acpi_get_node(acpi_handle handle); > > > +int __init numa_add_memblk(int nodeid, u64 start, u64 end); > > > > > > /** > > > * pxm_to_online_node - Map proximity ID to online node > > > > > > base-commit: 64570fbc14f8d7cb3fe3995f20e26bc25ce4b2cc > > > prerequisite-patch-id: f09c67c6b3801f511521fd29c1cc01f5c5b1e9de > > > -- > > > 2.31.1 > >
On Thu, Oct 21, 2021 at 8:57 AM Vikram Sethi <vsethi@nvidia.com> wrote: > > > > > -----Original Message----- > > From: Alison Schofield <alison.schofield@intel.com> > > Sent: Wednesday, October 20, 2021 8:00 PM > > To: Vikram Sethi <vsethi@nvidia.com> > > Cc: Rafael J. Wysocki <rafael@kernel.org>; Len Brown <lenb@kernel.org>; > > Vishal Verma <vishal.l.verma@intel.com>; Ira Weiny <ira.weiny@intel.com>; > > Ben Widawsky <ben.widawsky@intel.com>; Dan Williams > > <dan.j.williams@intel.com>; linux-cxl@vger.kernel.org; linux- > > acpi@vger.kernel.org > > Subject: Re: [PATCH v3] ACPI: NUMA: Add a node and memblk for each > > CFMWS not in SRAT > > > snip > > > > > > > > > Consumers can use phys_to_target_node() to discover the NUMA node. > > > > > > Does this patch work for CXL type 2 memory which is not in SRAT? A > > > type 2 driver can find its HDM BASE physical address from its CXL > > > registers and figure out its NUMA node id by calling phys_to_target_node? > > > > Yes. This adds the nodes for the case where the BIOS doesn't fully describe > > everything in CFMWS in the SRAT. And, yes, that is how the NUMA node can > > be discovered. > > > > > Or is type 2 HDM currently being skipped altogether? > > > > Not sure what you mean by 'being skipped altogether'? The BIOS may > > describe (all or none or some) of CXL Memory in the SRAT. In the case where > > BIOS describes it all, NUMA nodes will already exist, and no new nodes will > > be added here. > > > My question about skipping type2 wasn't directly related to your patch, > but more of a question about current upstream support for probe/configuration > of type 2 accelerator devices memory, irrespective of whether FW shows type 2 > memory in SRAT. SRAT only has Type-2 ranges if the platform firmware maps the device's memory into the EFI memory map (includes ACPI SRAT / SLIT / HMAT population). I expect that situation to be negotiated on a case by case basis between Type-2 device vendors and platform firmware vendors. There is no requirement that any CXL memory, type-2 or type-3, is mapped by platform firmware. Per the CDAT specification, platform firmware is capable to map CXL into the EFI memory map at boot, but there is no requirement for it to do so. My expectation is that Linux will need to handle the full gamut of possibilities here, i.e. all / some / none of the CXL Type-3 devices present at boot mapped into the EFI memory map, and all / some / none of the CXL Type-2 devices mapped into the EFI memory map. > The desired outcome is that the kernel CXL driver recognizes such type 2 HDM, > and assigns it a NUMA node such that the type 2 driver Note that there's no driver involved at this point. Alison's patch is just augmenting the ACPI declared NUMA nodes at boot so that the core-mm is not surprised by undeclared NUMA nodes at add_memory_driver_managed() time. > can later add/online this memory, > via add_memory_driver_managed which requires a NUMA node ID (which driver can > discover after your patch by calling phys_to_target_node). Yes, with this patch there are at least enough nodes for add_memory_driver_managed() to have a reasonable answer for a NUMA node for Type-2 memory. However, as Jonathan and I were discussing, this minimum enabling may prove insufficient if, for example, you had one CFMWS entry for all Type-2 memory in the system, but multiple disparate accelerators that want to each do add_memory_driver_managed(). In that scenario all of those accelerators, which might want to have a target-node per target-device, will all share one target-node. That said, unless and until it becomes clear that system architectures require Linux to define multiple nodes per CFMWS, I am happy to kick that can down the road. Also, platform firmware can solve this problem by subdividing Type-2 with multiple QTG ids so that multiple target devices can each be assigned to a different CFMWS entry sandbox, i.e. with more degrees of freedom declared by platform firmware in the CFMWS it relieves pressure on the OS to need a dynamic NUMA node definition capability. > Would the current upstream code for HDM work as described above, Current upstream code that enumerates Type-2 is the cxl_acpi driver that enumerates platform CXL capabilities. > and if so, does it > rely on CDAT DSEMTS structure showing a specific value for EFI memory type? i.e would it > work if that field in DSEMTS was either EFI_CONVENTIONAL_MEMORY with EFI_MEMORY_SP, > or EFI_RESERVED_MEMORY? If platform firmware maps the HDM the expectation is that it will use the CDAT to determine the EFI memory type. If platform firmware declines to map the device and lets Linux map it then that's de-facto "reserved" memory and the driver (generic CXL-Type-3 / or vendor specific CXL-Type-2) gets to do insert_resource() with whatever Linux type it deems appropriate, i.e. EFI is out of the picture in this scenario.
Hi Dan, Thanks for the detailed response. Some comments/questions inline. > -----Original Message----- > From: Dan Williams <dan.j.williams@intel.com> > Sent: Thursday, October 21, 2021 9:01 PM > To: Vikram Sethi <vsethi@nvidia.com> > Cc: Alison Schofield <alison.schofield@intel.com>; Rafael J. Wysocki > <rafael@kernel.org>; Len Brown <lenb@kernel.org>; Vishal Verma > <vishal.l.verma@intel.com>; Ira Weiny <ira.weiny@intel.com>; Ben > Widawsky <ben.widawsky@intel.com>; linux-cxl@vger.kernel.org; linux- > acpi@vger.kernel.org > Subject: Re: [PATCH v3] ACPI: NUMA: Add a node and memblk for each > CFMWS not in SRAT > > On Thu, Oct 21, 2021 at 8:57 AM Vikram Sethi <vsethi@nvidia.com> wrote: > > > > > -----Original Message----- > > > From: Alison Schofield <alison.schofield@intel.com> > > > Sent: Wednesday, October 20, 2021 8:00 PM > > > To: Vikram Sethi <vsethi@nvidia.com> > > > Cc: Rafael J. Wysocki <rafael@kernel.org>; Len Brown > > > <lenb@kernel.org>; Vishal Verma <vishal.l.verma@intel.com>; Ira > > > Weiny <ira.weiny@intel.com>; Ben Widawsky > <ben.widawsky@intel.com>; > > > Dan Williams <dan.j.williams@intel.com>; linux-cxl@vger.kernel.org; > > > linux- acpi@vger.kernel.org > > > Subject: Re: [PATCH v3] ACPI: NUMA: Add a node and memblk for each > > > CFMWS not in SRAT > > > > > snip > > > > > > > > > > > > Consumers can use phys_to_target_node() to discover the NUMA > node. > > > > > > > > Does this patch work for CXL type 2 memory which is not in SRAT? A > > > > type 2 driver can find its HDM BASE physical address from its CXL > > > > registers and figure out its NUMA node id by calling > phys_to_target_node? > > > > > > Yes. This adds the nodes for the case where the BIOS doesn't fully > > > describe everything in CFMWS in the SRAT. And, yes, that is how the > > > NUMA node can be discovered. > > > > > > > Or is type 2 HDM currently being skipped altogether? > > > > > > Not sure what you mean by 'being skipped altogether'? The BIOS may > > > describe (all or none or some) of CXL Memory in the SRAT. In the > > > case where BIOS describes it all, NUMA nodes will already exist, and > > > no new nodes will be added here. > > > > > My question about skipping type2 wasn't directly related to your > > patch, but more of a question about current upstream support for > > probe/configuration of type 2 accelerator devices memory, irrespective > > of whether FW shows type 2 memory in SRAT. > > SRAT only has Type-2 ranges if the platform firmware maps the device's > memory into the EFI memory map (includes ACPI SRAT / SLIT / HMAT > population). I expect that situation to be negotiated on a case by case basis > between Type-2 device vendors and platform firmware vendors. Agreed > There is no requirement that any CXL memory, type-2 or type-3, is mapped by platform > firmware. Per the CDAT specification, platform firmware is capable to map > CXL into the EFI memory map at boot, but there is no requirement for it to do > so. > > My expectation is that Linux will need to handle the full gamut of possibilities > here, i.e. all / some / none of the CXL Type-3 devices present at boot > mapped into the EFI memory map, and all / some / none of the CXL Type-2 > devices mapped into the EFI memory map. > Agreed. IIUC, if FW has initialized HDM base/decoder HPA in device, shown device HDM HPA in EFI Memory Map as reserved based on CDAT, and shown PXM for Device memory in SRAT, kernel can validate no conflicts in FW HDM HPA assignments vs CFMWS, and then map the PXM from SRAT to a node ID. The CFMWS DSM seems unnecessary for this case to get the NUMA assignments, right? Type 2 Driver calls phys_to_target_node() and then add_memory_driver_managed. > > The desired outcome is that the kernel CXL driver recognizes such type > > 2 HDM, and assigns it a NUMA node such that the type 2 driver > > Note that there's no driver involved at this point. Alison's patch is just > augmenting the ACPI declared NUMA nodes at boot so that the core-mm is > not surprised by undeclared NUMA nodes at > add_memory_driver_managed() time. > > > can later add/online this memory, > > via add_memory_driver_managed which requires a NUMA node ID (which > > driver can discover after your patch by calling phys_to_target_node). > > Yes, with this patch there are at least enough nodes for > add_memory_driver_managed() to have a reasonable answer for a NUMA > node for Type-2 memory. However, as Jonathan and I were discussing, this > minimum enabling may prove insufficient if, for example, you had one > CFMWS entry for all Type-2 memory in the system, but multiple disparate > accelerators that want to each do add_memory_driver_managed(). CEDT CFMWS ECN says "The CFMWS structure describes zero or more Host Physical Address (HPA) windows associated with *each CXL Host Bridge*". So are you concerned about multiple type 2 accelerators under the same Host bridge? IIRC CXL 2.0 only allows 1 type 2 device under a host bridge, but perhaps that was under 1 "root port", and you are pointing out that a system that has multiple root ports under 1 CXL host bridge can have multiple CXL type 2 accelerators under it. Future revisions of the spec could always relax current restrictions even under 1 root port, so I do see the problem about multiple type 2 devices under 1 CFMWS window wanting their own NUMA nodes. Apologies if I'm mangling the terminology for host bridge vs root port, but trying to correlate with PCIe terminology of RC/RP. > In that scenario all of those accelerators, which might want to have a target-node > per target-device, will all share one target-node. That said, unless and until it > becomes clear that system architectures require Linux to define multiple > nodes per CFMWS, I am happy to kick that can down the road. Also, platform > firmware can solve this problem by subdividing > Type-2 with multiple QTG ids so that multiple target devices can each be > assigned to a different CFMWS entry sandbox, i.e. with more degrees of > freedom declared by platform firmware in the CFMWS it relieves pressure on > the OS to need a dynamic NUMA node definition capability. > Let's say there are 2 type 2 accelerators with the same latency/bandwidth properties under a given CXL host bridge. When the DSM is executed on this host bridge with the latency/BW as input, IIUC you're saying FW could return 2 possible QTG IDs, let's say 4 and 5? And for that host bridge, FW also create 2 CFMWS windows of HPA for type 2, with one showing QTG ID =4, and the other showing QTG ID=5, with each having at least enough HPA space to map 1 of those devices? > > Would the current upstream code for HDM work as described above, > > Current upstream code that enumerates Type-2 is the cxl_acpi driver that > enumerates platform CXL capabilities. > > > and if so, does it > > rely on CDAT DSEMTS structure showing a specific value for EFI memory > > type? i.e would it work if that field in DSEMTS was either > > EFI_CONVENTIONAL_MEMORY with EFI_MEMORY_SP, or > EFI_RESERVED_MEMORY? > > If platform firmware maps the HDM the expectation is that it will use the > CDAT to determine the EFI memory type. If platform firmware declines to > map the device and lets Linux map it then that's de-facto "reserved" memory > and the driver (generic CXL-Type-3 / or vendor specific CXL-Type-2) gets to > do insert_resource() with whatever Linux type it deems appropriate, i.e. EFI > is out of the picture in this scenario.
On Fri, Oct 22, 2021 at 2:58 PM Vikram Sethi <vsethi@nvidia.com> wrote: > > Hi Dan, > Thanks for the detailed response. Some comments/questions inline. [..] > > My expectation is that Linux will need to handle the full gamut of possibilities > > here, i.e. all / some / none of the CXL Type-3 devices present at boot > > mapped into the EFI memory map, and all / some / none of the CXL Type-2 > > devices mapped into the EFI memory map. > > > Agreed. IIUC, if FW has initialized HDM base/decoder HPA in device, shown > device HDM HPA in EFI Memory Map as reserved based on CDAT, and shown > PXM for Device memory in SRAT, kernel can validate no conflicts in FW HDM HPA > assignments vs CFMWS, and then map the PXM from SRAT to a node ID. The kernel will just trust the ACPI tables. I.e. there won't be any CFMWS + HDM validation before the kernel assigns a NUMA-node for an SRAT PXM. Just like the kernel does not validate DDR memory controller registers before trusting ACPI. > The CFMWS DSM seems unnecessary for this case to get the NUMA assignments, > right? Type 2 Driver calls phys_to_target_node() and then add_memory_driver_managed. Right, the QTG DSM has no production role for the OS if the BIOS has already done the work to map the HDM. It could still be used in a platform validation test case, or for strict consistency checks, but otherwise fall back to trusting EFI-MAP + SRAT + SLIT + HMAT if CFMWS + CDAT + QTG-DSM disagrees. > > > > The desired outcome is that the kernel CXL driver recognizes such type > > > 2 HDM, and assigns it a NUMA node such that the type 2 driver > > > > Note that there's no driver involved at this point. Alison's patch is just > > augmenting the ACPI declared NUMA nodes at boot so that the core-mm is > > not surprised by undeclared NUMA nodes at > > add_memory_driver_managed() time. > > > > > can later add/online this memory, > > > via add_memory_driver_managed which requires a NUMA node ID (which > > > driver can discover after your patch by calling phys_to_target_node). > > > > Yes, with this patch there are at least enough nodes for > > add_memory_driver_managed() to have a reasonable answer for a NUMA > > node for Type-2 memory. However, as Jonathan and I were discussing, this > > minimum enabling may prove insufficient if, for example, you had one > > CFMWS entry for all Type-2 memory in the system, but multiple disparate > > accelerators that want to each do add_memory_driver_managed(). > > CEDT CFMWS ECN says "The CFMWS structure describes zero or more Host Physical > Address (HPA) windows associated with *each CXL Host Bridge*". The next sentence clarifies that a CFMWS entry can interleave multiple host bridges. > So are you concerned about multiple type 2 accelerators under the same Host bridge? The concern is multiple disparate accelerator HDM ranges with only one CFMWS entry that they can map. > IIRC CXL 2.0 only allows 1 type 2 device under a host bridge, but perhaps that was > under 1 "root port", and you are pointing out that a system that has multiple > root ports under 1 CXL host bridge can have multiple CXL type 2 accelerators under it. > Future revisions of the spec could always relax current restrictions even under 1 > root port, so I do see the problem about multiple type 2 devices under 1 CFMWS > window wanting their own NUMA nodes. Apologies if I'm mangling the > terminology for host bridge vs root port, but trying to correlate with PCIe terminology > of RC/RP. CFMWS maps the hostbridge (RC) interleave, the upstream port inside that RC maps RP interleave. Yes, there is nothing that prevents ACPI from describing multiple NUMA nodes within a single CFMWS. However, setting ACPI aside, if you hot-added multiple Type-2 devices where the OS has to do dynamic assignment, per this patch they would only get assigned to the one NUMA node associated with a given CFMWS entry. If the OS wants to do multiple NUMA nodes per CFMWS it either needs to statically allocate more nodes up front, or rework the core-mm to allow dynamic node definition. That design decision is deferred until it's clear that one node per CFMWS (for hot added devices) is determined to be insufficient. > > In that scenario all of those accelerators, which might want to have a target-node > > per target-device, will all share one target-node. That said, unless and until it > > becomes clear that system architectures require Linux to define multiple > > nodes per CFMWS, I am happy to kick that can down the road. Also, platform > > firmware can solve this problem by subdividing > > Type-2 with multiple QTG ids so that multiple target devices can each be > > assigned to a different CFMWS entry sandbox, i.e. with more degrees of > > freedom declared by platform firmware in the CFMWS it relieves pressure on > > the OS to need a dynamic NUMA node definition capability. > > > Let's say there are 2 type 2 accelerators with the same latency/bandwidth > properties under a given CXL host bridge. When the DSM is executed on this host > bridge with the latency/BW as input, IIUC you're saying FW could return 2 possible > QTG IDs, let's say 4 and 5? And for that host bridge, FW also create 2 CFMWS > windows of HPA for type 2, with one showing QTG ID =4, and the other showing > QTG ID=5, with each having at least enough HPA space to map 1 of those devices? The DSM just translates a performance profile into a QTG id, so if 2 accelerators share the same performance it does not matter where they are plugged in they will map to the same QTG id. However, they can still map to different CFMWS entries if they are plugged into different hostbridges. If they share the same hostbridge then they will share the same CFMWS entry.
On Wed, Oct 20, 2021 at 8:27 AM Rafael J. Wysocki <rafael@kernel.org> wrote: > > On Tue, Oct 19, 2021 at 7:01 AM <alison.schofield@intel.com> wrote: > > > > From: Alison Schofield <alison.schofield@intel.com> > > > > During NUMA init, CXL memory defined in the SRAT Memory Affinity > > subtable may be assigned to a NUMA node. Since there is no > > requirement that the SRAT be comprehensive for CXL memory another > > mechanism is needed to assign NUMA nodes to CXL memory not identified > > in the SRAT. > > > > Use the CXL Fixed Memory Window Structure (CFMWS) of the ACPI CXL > > Early Discovery Table (CEDT) to find all CXL memory ranges. > > Create a NUMA node for each CFMWS that is not already assigned to > > a NUMA node. Add a memblk attaching its host physical address > > range to the node. > > > > Note that these ranges may not actually map any memory at boot time. > > They may describe persistent capacity or may be present to enable > > hot-plug. > > > > Consumers can use phys_to_target_node() to discover the NUMA node. > > > > Signed-off-by: Alison Schofield <alison.schofield@intel.com> > > Dan, this seems to fall into your territory. Rafael, Sure, I'm happy to take this through the CXL tree with your ack.
On Mon, Oct 18, 2021 at 10:01 PM <alison.schofield@intel.com> wrote: > > From: Alison Schofield <alison.schofield@intel.com> > > During NUMA init, CXL memory defined in the SRAT Memory Affinity > subtable may be assigned to a NUMA node. Since there is no > requirement that the SRAT be comprehensive for CXL memory another > mechanism is needed to assign NUMA nodes to CXL memory not identified > in the SRAT. > > Use the CXL Fixed Memory Window Structure (CFMWS) of the ACPI CXL > Early Discovery Table (CEDT) to find all CXL memory ranges. > Create a NUMA node for each CFMWS that is not already assigned to > a NUMA node. Add a memblk attaching its host physical address > range to the node. > > Note that these ranges may not actually map any memory at boot time. > They may describe persistent capacity or may be present to enable > hot-plug. > > Consumers can use phys_to_target_node() to discover the NUMA node. > > Signed-off-by: Alison Schofield <alison.schofield@intel.com> > --- > Changes in v3: > - Initial variable pxm (Ben) > > Changes in v2: > - Use MAX_NUMNODES as max value when searching node_to_pxm_map() (0-day) > - Add braces around single statement for loop (coding style) > - Rename acpi_parse_cfmws() to acpi_cxl_cfmws_init to be more like other > functions in this file doing similar work. > - Comments: remove superflous and state importance of the init order, > CFMWS after SRAT, (Ira, Dan) > - Add prototype for numa_add_memblk() (0-day) > > drivers/acpi/numa/srat.c | 70 ++++++++++++++++++++++++++++++++++++++++ > drivers/cxl/acpi.c | 8 +++-- > include/linux/acpi.h | 1 + > 3 files changed, 76 insertions(+), 3 deletions(-) > > diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c > index b8795fc49097..daaaef58f1e6 100644 > --- a/drivers/acpi/numa/srat.c > +++ b/drivers/acpi/numa/srat.c > @@ -300,6 +300,67 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) > } > #endif /* defined(CONFIG_X86) || defined (CONFIG_ARM64) */ > > +static int __init acpi_cxl_cfmws_init(struct acpi_table_header *acpi_cedt) > +{ > + struct acpi_cedt_cfmws *cfmws; > + acpi_size len, cur = 0; > + int i, node, pxm = 0; Shouldn't this be -1, on the idea that the first numa node to assign if none are set is zero? I don't think the way you have it is a problem in practice because SRAT should always be there in a NUMA system. However, the first CFMWS pxm should start after the last SRAT entry, or 0 if no SRAT entries. > + void *cedt_subtable; > + u64 start, end; > + > + /* Find the max PXM defined in the SRAT */ > + for (i = 0; i < MAX_NUMNODES - 1; i++) { How about: for (i = 0, pxm = -1; i < MAX_NUMNODES -1; i++) ...just to keep the initialization close to the use, but that's just a personal style preference. > + if (node_to_pxm_map[i] > pxm) > + pxm = node_to_pxm_map[i]; > + } > + /* Start assigning fake PXM values after the SRAT max PXM */ > + pxm++; > + > + len = acpi_cedt->length - sizeof(*acpi_cedt); > + cedt_subtable = acpi_cedt + 1; > + > + while (cur < len) { Similarly to above I wonder if this would be cleaner as a for loop then you could use typical "continue" statements rather than goto. I took a stab at creating a for_each_cedt() helper which ended up a decent cleanup for drivers/cxl/ drivers/cxl/acpi.c | 48 +++++++++++++++--------------------------------- 1 file changed, 15 insertions(+), 33 deletions(-) ...however, I just realized this NUMA code is running at init time, so it can just use the acpi_table_parse_entries_array() helper to walk the CFMWS like the othe subtable walkers in acpi_numa_init(). You would need to update the subtable helpers (acpi_get_subtable_type() et al) to recognize the CEDT case. [ Side note for the implications of acpi_table_parse_entries_array() for drivers/cxl/acpi.c ] Rafael, both the NFIT driver and now the CXL ACPI driver have open coded subtable parsing. Any philosophical reason to keep the subtable parsing code as __init? It can still be __init and thrown away if those drivers are not build-time enabled. > + struct acpi_cedt_header *c = cedt_subtable + cur; > + > + if (c->type != ACPI_CEDT_TYPE_CFMWS) > + goto next; > + > + cfmws = cedt_subtable + cur; > + if (cfmws->header.length < sizeof(*cfmws)) { > + pr_warn_once("CFMWS entry skipped:invalid length:%u\n", > + cfmws->header.length); > + goto next; > + } > + > + start = cfmws->base_hpa; > + end = cfmws->base_hpa + cfmws->window_size; > + > + /* > + * Skip if the SRAT already described > + * the NUMA details for this HPA. > + */ > + node = phys_to_target_node(start); > + if (node != NUMA_NO_NODE) > + goto next; > + > + node = acpi_map_pxm_to_node(pxm); > + if (node == NUMA_NO_NODE) { > + pr_err("ACPI NUMA: Too many proximity domains.\n"); I would add "while processing CFMWS" to make it clear that the BIOS technically did not declare too many PXMs; it was the Linux heuristic for opportunistically emulating more PXMs. > + return -EINVAL; > + } > + > + if (numa_add_memblk(node, start, end) < 0) { > + /* CXL driver must handle the NUMA_NO_NODE case */ > + pr_warn("ACPI NUMA: Failed to add memblk for CFMWS node %d [mem %#llx-%#llx]\n", > + node, start, end); > + } > + pxm++; > +next: > + cur += c->length; > + } > + return 0; > +} > + > static int __init acpi_parse_slit(struct acpi_table_header *table) > { > struct acpi_table_slit *slit = (struct acpi_table_slit *)table; > @@ -478,6 +539,15 @@ int __init acpi_numa_init(void) > /* SLIT: System Locality Information Table */ > acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit); > > + /* > + * CEDT: CXL Fixed Memory Window Structures (CFMWS) > + * must be parsed after the SRAT. It creates NUMA > + * Nodes for CXL memory ranges not already defined > + * in the SRAT and it assigns PXMs after the max PXM > + * defined in the SRAT. > + */ > + acpi_table_parse(ACPI_SIG_CEDT, acpi_cxl_cfmws_init); > + > if (cnt < 0) > return cnt; > else if (!parsed_numa_memblks) > diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c > index 10120e4e0b9f..ccf73f0a59a7 100644 > --- a/drivers/cxl/acpi.c > +++ b/drivers/cxl/acpi.c > @@ -122,9 +122,11 @@ static void cxl_add_cfmws_decoders(struct device *dev, > cfmws->base_hpa, cfmws->base_hpa + > cfmws->window_size - 1); > } else { > - dev_dbg(dev, "add: %s range %#llx-%#llx\n", > - dev_name(&cxld->dev), cfmws->base_hpa, > - cfmws->base_hpa + cfmws->window_size - 1); > + dev_dbg(dev, "add: %s node: %d range %#llx-%#llx\n", > + dev_name(&cxld->dev), > + phys_to_target_node(cxld->range.start), > + cfmws->base_hpa, > + cfmws->base_hpa + cfmws->window_size - 1); > } > cur += c->length; > } > diff --git a/include/linux/acpi.h b/include/linux/acpi.h > index 974d497a897d..f837fd715440 100644 > --- a/include/linux/acpi.h > +++ b/include/linux/acpi.h > @@ -426,6 +426,7 @@ extern bool acpi_osi_is_win8(void); > #ifdef CONFIG_ACPI_NUMA > int acpi_map_pxm_to_node(int pxm); > int acpi_get_node(acpi_handle handle); > +int __init numa_add_memblk(int nodeid, u64 start, u64 end); This doesn't belong here. There is already a declaration for this in arch/x86/include/asm/numa.h. I think what you are missing is that your new code needs to be within the same ifdef guards as the other helpers in srat.c that call numa_add_memblk(). See the line that has: #if defined(CONFIG_X86) || defined(CONFIG_ARM64) || defined(CONFIG_LOONGARCH) ...above acpi_numa_slit_init()
On Tue, Oct 26, 2021 at 3:09 AM Dan Williams <dan.j.williams@intel.com> wrote: > > On Wed, Oct 20, 2021 at 8:27 AM Rafael J. Wysocki <rafael@kernel.org> wrote: > > > > On Tue, Oct 19, 2021 at 7:01 AM <alison.schofield@intel.com> wrote: > > > > > > From: Alison Schofield <alison.schofield@intel.com> > > > > > > During NUMA init, CXL memory defined in the SRAT Memory Affinity > > > subtable may be assigned to a NUMA node. Since there is no > > > requirement that the SRAT be comprehensive for CXL memory another > > > mechanism is needed to assign NUMA nodes to CXL memory not identified > > > in the SRAT. > > > > > > Use the CXL Fixed Memory Window Structure (CFMWS) of the ACPI CXL > > > Early Discovery Table (CEDT) to find all CXL memory ranges. > > > Create a NUMA node for each CFMWS that is not already assigned to > > > a NUMA node. Add a memblk attaching its host physical address > > > range to the node. > > > > > > Note that these ranges may not actually map any memory at boot time. > > > They may describe persistent capacity or may be present to enable > > > hot-plug. > > > > > > Consumers can use phys_to_target_node() to discover the NUMA node. > > > > > > Signed-off-by: Alison Schofield <alison.schofield@intel.com> > > > > Dan, this seems to fall into your territory. > > Rafael, > > Sure, I'm happy to take this through the CXL tree with your ack. So please feel free to add my ACK to this patch, thanks!
On Mon, Oct 25, 2021 at 07:47:32PM -0700, Dan Williams wrote: > On Mon, Oct 18, 2021 at 10:01 PM <alison.schofield@intel.com> wrote: > > > > From: Alison Schofield <alison.schofield@intel.com> > > > > During NUMA init, CXL memory defined in the SRAT Memory Affinity > > subtable may be assigned to a NUMA node. Since there is no > > requirement that the SRAT be comprehensive for CXL memory another > > mechanism is needed to assign NUMA nodes to CXL memory not identified > > in the SRAT. > > > > Use the CXL Fixed Memory Window Structure (CFMWS) of the ACPI CXL > > Early Discovery Table (CEDT) to find all CXL memory ranges. > > Create a NUMA node for each CFMWS that is not already assigned to > > a NUMA node. Add a memblk attaching its host physical address > > range to the node. > > > > Note that these ranges may not actually map any memory at boot time. > > They may describe persistent capacity or may be present to enable > > hot-plug. > > > > Consumers can use phys_to_target_node() to discover the NUMA node. > > > > Signed-off-by: Alison Schofield <alison.schofield@intel.com> > > --- The next version of this patch is now included in this patchset that adds helpers for parsing the CEDT subtables: https://lore.kernel.org/linux-cxl/163553711933.2509508.2203471175679990.stgit@dwillia2-desk3.amr.corp.intel.com/T/#mf40d84e1ad4c01f69f794d591b07774255993185 It addresses Dan's comments below. >> snip > > +{ > > + struct acpi_cedt_cfmws *cfmws; > > + acpi_size len, cur = 0; > > + int i, node, pxm = 0; > > Shouldn't this be -1, on the idea that the first numa node to assign > if none are set is zero? > > I don't think the way you have it is a problem in practice because > SRAT should always be there in a NUMA system. However, the first CFMWS > pxm should start after the last SRAT entry, or 0 if no SRAT entries. > > > + void *cedt_subtable; > > + u64 start, end; > > + > > + /* Find the max PXM defined in the SRAT */ > > + for (i = 0; i < MAX_NUMNODES - 1; i++) { > > How about: > > for (i = 0, pxm = -1; i < MAX_NUMNODES -1; i++) > > ...just to keep the initialization close to the use, but that's just a > personal style preference. Done. > > > + if (node_to_pxm_map[i] > pxm) > > + pxm = node_to_pxm_map[i]; > > + } > > + /* Start assigning fake PXM values after the SRAT max PXM */ > > + pxm++; > > + > > + len = acpi_cedt->length - sizeof(*acpi_cedt); > > + cedt_subtable = acpi_cedt + 1; > > + > > + while (cur < len) { > > Similarly to above I wonder if this would be cleaner as a for loop > then you could use typical "continue" statements rather than goto. I > took a stab at creating a for_each_cedt() helper which ended up a > decent cleanup for drivers/cxl/ > > drivers/cxl/acpi.c | 48 +++++++++++++++--------------------------------- > 1 file changed, 15 insertions(+), 33 deletions(-) > > ...however, I just realized this NUMA code is running at init time, so > it can just use the acpi_table_parse_entries_array() helper to walk > the CFMWS like the othe subtable walkers in acpi_numa_init(). You > would need to update the subtable helpers (acpi_get_subtable_type() et > al) to recognize the CEDT case. > > [ Side note for the implications of acpi_table_parse_entries_array() > for drivers/cxl/acpi.c ] > > Rafael, both the NFIT driver and now the CXL ACPI driver have open > coded subtable parsing. Any philosophical reason to keep the subtable > parsing code as __init? It can still be __init and thrown away if > those drivers are not build-time enabled. > The updated patch (in the greater patchset) now uses the new helpers. > snip > > + node = acpi_map_pxm_to_node(pxm); > > + if (node == NUMA_NO_NODE) { > > + pr_err("ACPI NUMA: Too many proximity domains.\n"); > > I would add "while processing CFMWS" to make it clear that the BIOS > technically did not declare too many PXMs; it was the Linux heuristic > for opportunistically emulating more PXMs. > Done. > > snip > > } > > diff --git a/include/linux/acpi.h b/include/linux/acpi.h > > index 974d497a897d..f837fd715440 100644 > > --- a/include/linux/acpi.h > > +++ b/include/linux/acpi.h > > @@ -426,6 +426,7 @@ extern bool acpi_osi_is_win8(void); > > #ifdef CONFIG_ACPI_NUMA > > int acpi_map_pxm_to_node(int pxm); > > int acpi_get_node(acpi_handle handle); > > +int __init numa_add_memblk(int nodeid, u64 start, u64 end); > > This doesn't belong here. > > There is already a declaration for this in > arch/x86/include/asm/numa.h. I think what you are missing is that your > new code needs to be within the same ifdef guards as the other helpers > in srat.c that call numa_add_memblk(). See the line that has: > > #if defined(CONFIG_X86) || defined(CONFIG_ARM64) || defined(CONFIG_LOONGARCH) > > ...above acpi_numa_slit_init() Done.
diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index b8795fc49097..daaaef58f1e6 100644 --- a/drivers/acpi/numa/srat.c +++ b/drivers/acpi/numa/srat.c @@ -300,6 +300,67 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) } #endif /* defined(CONFIG_X86) || defined (CONFIG_ARM64) */ +static int __init acpi_cxl_cfmws_init(struct acpi_table_header *acpi_cedt) +{ + struct acpi_cedt_cfmws *cfmws; + acpi_size len, cur = 0; + int i, node, pxm = 0; + void *cedt_subtable; + u64 start, end; + + /* Find the max PXM defined in the SRAT */ + for (i = 0; i < MAX_NUMNODES - 1; i++) { + if (node_to_pxm_map[i] > pxm) + pxm = node_to_pxm_map[i]; + } + /* Start assigning fake PXM values after the SRAT max PXM */ + pxm++; + + len = acpi_cedt->length - sizeof(*acpi_cedt); + cedt_subtable = acpi_cedt + 1; + + while (cur < len) { + struct acpi_cedt_header *c = cedt_subtable + cur; + + if (c->type != ACPI_CEDT_TYPE_CFMWS) + goto next; + + cfmws = cedt_subtable + cur; + if (cfmws->header.length < sizeof(*cfmws)) { + pr_warn_once("CFMWS entry skipped:invalid length:%u\n", + cfmws->header.length); + goto next; + } + + start = cfmws->base_hpa; + end = cfmws->base_hpa + cfmws->window_size; + + /* + * Skip if the SRAT already described + * the NUMA details for this HPA. + */ + node = phys_to_target_node(start); + if (node != NUMA_NO_NODE) + goto next; + + node = acpi_map_pxm_to_node(pxm); + if (node == NUMA_NO_NODE) { + pr_err("ACPI NUMA: Too many proximity domains.\n"); + return -EINVAL; + } + + if (numa_add_memblk(node, start, end) < 0) { + /* CXL driver must handle the NUMA_NO_NODE case */ + pr_warn("ACPI NUMA: Failed to add memblk for CFMWS node %d [mem %#llx-%#llx]\n", + node, start, end); + } + pxm++; +next: + cur += c->length; + } + return 0; +} + static int __init acpi_parse_slit(struct acpi_table_header *table) { struct acpi_table_slit *slit = (struct acpi_table_slit *)table; @@ -478,6 +539,15 @@ int __init acpi_numa_init(void) /* SLIT: System Locality Information Table */ acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit); + /* + * CEDT: CXL Fixed Memory Window Structures (CFMWS) + * must be parsed after the SRAT. It creates NUMA + * Nodes for CXL memory ranges not already defined + * in the SRAT and it assigns PXMs after the max PXM + * defined in the SRAT. + */ + acpi_table_parse(ACPI_SIG_CEDT, acpi_cxl_cfmws_init); + if (cnt < 0) return cnt; else if (!parsed_numa_memblks) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 10120e4e0b9f..ccf73f0a59a7 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -122,9 +122,11 @@ static void cxl_add_cfmws_decoders(struct device *dev, cfmws->base_hpa, cfmws->base_hpa + cfmws->window_size - 1); } else { - dev_dbg(dev, "add: %s range %#llx-%#llx\n", - dev_name(&cxld->dev), cfmws->base_hpa, - cfmws->base_hpa + cfmws->window_size - 1); + dev_dbg(dev, "add: %s node: %d range %#llx-%#llx\n", + dev_name(&cxld->dev), + phys_to_target_node(cxld->range.start), + cfmws->base_hpa, + cfmws->base_hpa + cfmws->window_size - 1); } cur += c->length; } diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 974d497a897d..f837fd715440 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -426,6 +426,7 @@ extern bool acpi_osi_is_win8(void); #ifdef CONFIG_ACPI_NUMA int acpi_map_pxm_to_node(int pxm); int acpi_get_node(acpi_handle handle); +int __init numa_add_memblk(int nodeid, u64 start, u64 end); /** * pxm_to_online_node - Map proximity ID to online node