Message ID: 9fcc548a6b4727cb2538e5227d7bad2e94e6adaf.1686712819.git.alison.schofield@intel.com
State: Superseded
Series: CXL: Apply SRAT defined PXM to entire CFMWS window
On 6/14/2023 6:35 AM, alison.schofield@intel.com wrote:
> From: Alison Schofield <alison.schofield@intel.com>
>
> numa_fill_memblks() fills in the gaps in numa_meminfo memblks
> over an HPA address range.
>
> The ACPI driver will use numa_fill_memblks() to implement a new Linux
> policy that prescribes extending proximity domains in a portion of a
> CFMWS window to the entire window.
>
> Dan Williams offered this explanation of the policy:
>
> A CFMWS is an ACPI data structure that indicates *potential* locations
> where CXL memory can be placed. It is the playground where the CXL
> driver has free rein to establish regions. That space can be populated
> by BIOS created regions, or driver created regions, after hotplug or
> other reconfiguration.
>
> When BIOS creates a region in a CXL Window it additionally describes
> that subset of the Window range in the other typical ACPI tables SRAT,
> SLIT, and HMAT. The rationale for BIOS not pre-describing the entire
> CXL Window in SRAT, SLIT, and HMAT is that it cannot predict the
> future, i.e. there is nothing stopping higher or lower performance
> devices being placed in the same Window. Compare that to ACPI memory
> hotplug that just onlines additional capacity in the proximity domain
> with little freedom for dynamic performance differentiation.
>
> That leaves the OS with a choice: should unpopulated window capacity
> match the proximity domain of an existing region, or should it allocate
> a new one? This patch takes the simple position of minimizing proximity
> domain proliferation by reusing any proximity domain intersection for
> the entire Window. If the Window has no intersections then allocate a
> new proximity domain. Note that SRAT, SLIT and HMAT information can be
> enumerated dynamically in a standard way from device provided data.
> Think of CXL as the end of ACPI needing to describe memory attributes:
> CXL offers a standard discovery model for performance attributes, but
> Linux still needs to interoperate with the old regime.
>
> Reported-by: Derick Marks <derick.w.marks@intel.com>
> Suggested-by: Dan Williams <dan.j.williams@intel.com>
> Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> Tested-by: Derick Marks <derick.w.marks@intel.com>
> ---
>  arch/x86/include/asm/sparsemem.h |  2 +
>  arch/x86/mm/numa.c               | 87 ++++++++++++++++++++++++++++++++
>  include/linux/numa.h             |  7 +++
>  3 files changed, 96 insertions(+)
[..]
> +static int __init cmp_memblk(const void *a, const void *b)
> +{
> +	const struct numa_memblk *ma = *(const struct numa_memblk **)a;
> +	const struct numa_memblk *mb = *(const struct numa_memblk **)b;

Is this casting necessary ?

> +
> +	if (ma->start != mb->start)
> +		return (ma->start < mb->start) ? -1 : 1;
> +
> +	/* Caller handles duplicate start addresses */
> +	return 0;
> +}
[..]
>  #define NUMA_NO_NODE	(-1)
> +#define NUMA_NO_MEMBLK	(-1)

Same error code as NUMA_NO_NODE ?

[..]
> +#ifndef numa_fill_memblks

Why not just #ifndef CONFIG_NUMA_KEEP_MEMINFO ?

> +static inline int __init numa_fill_memblks(u64 start, u64 end)
> +{
> +	return NUMA_NO_MEMBLK;
> +}
> +#endif
Wilczynski, Michal wrote:
> On 6/14/2023 6:35 AM, alison.schofield@intel.com wrote:
> > From: Alison Schofield <alison.schofield@intel.com>
> >
> > numa_fill_memblks() fills in the gaps in numa_meminfo memblks
> > over an HPA address range.
[..]
> > +static int __init cmp_memblk(const void *a, const void *b)
> > +{
> > +	const struct numa_memblk *ma = *(const struct numa_memblk **)a;
> > +	const struct numa_memblk *mb = *(const struct numa_memblk **)b;
>
> Is this casting necessary ?

This is idiomatic for sort() comparison handlers.

[..]
> > +#ifndef numa_fill_memblks
>
> Why not just #ifndef CONFIG_NUMA_KEEP_MEMINFO ?

This is due to the fact that multiple archs use CONFIG_NUMA_KEEP_MEMINFO
(x86, ARM64, LOONGARCH), but only one supplies a numa_fill_memblks()
implementation (x86).
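For reference, the macro-override pattern Dan is describing, condensed
from the two headers touched by this patch (a sketch, not the full
files):

/*
 * arch/x86/include/asm/sparsemem.h -- an arch with a real
 * implementation declares it and defines a same-named macro:
 */
extern int numa_fill_memblks(u64 start, u64 end);
#define numa_fill_memblks numa_fill_memblks

/*
 * include/linux/numa.h -- the generic header supplies the stub only
 * when no arch header defined the macro. Keying off the macro rather
 * than CONFIG_NUMA_KEEP_MEMINFO lets ARM64 and LoongArch keep the
 * stub while x86 provides the real function.
 */
#ifndef numa_fill_memblks
static inline int __init numa_fill_memblks(u64 start, u64 end)
{
        return NUMA_NO_MEMBLK;
}
#endif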
alison.schofield@ wrote:
> From: Alison Schofield <alison.schofield@intel.com>
>
> numa_fill_memblks() fills in the gaps in numa_meminfo memblks
> over an HPA address range.
[..]
> Reported-by: Derick Marks <derick.w.marks@intel.com>
> Suggested-by: Dan Williams <dan.j.williams@intel.com>
> Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> Tested-by: Derick Marks <derick.w.marks@intel.com>
[..]
> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> index 2aadb2019b4f..fa82141d1a04 100644
> --- a/arch/x86/mm/numa.c
> +++ b/arch/x86/mm/numa.c
[..]
> +static int __init cmp_memblk(const void *a, const void *b)
> +{
> +	const struct numa_memblk *ma = *(const struct numa_memblk **)a;
> +	const struct numa_memblk *mb = *(const struct numa_memblk **)b;
> +
> +	if (ma->start != mb->start)
> +		return (ma->start < mb->start) ? -1 : 1;
> +
> +	/* Caller handles duplicate start addresses */
> +	return 0;

This can be simplified to:

static int __init cmp_memblk(const void *a, const void *b)
{
	const struct numa_memblk *ma = *(const struct numa_memblk **)a;
	const struct numa_memblk *mb = *(const struct numa_memblk **)b;

	return ma->start - mb->start;
}

[..]
> +	/*
> +	 * Create a list of pointers to numa_meminfo memblks that
> +	 * overlap start, end. Exclude (start == bi->end) since
> +	 * end addresses in both a CFMWS range and a memblk range
> +	 * are exclusive.
> +	 *
> +	 * This list of pointers is used to make in-place changes
> +	 * that fill out the numa_meminfo memblks.
> +	 */

Thanks for this comment, looks good.

[..]
> +	/*
> +	 * Fill any gaps by tracking the previous memblks end address,
> +	 * prev_end, and backfilling to it if needed. Avoid filling
> +	 * overlapping memblks by making prev_end monotonically non-
> +	 * decreasing.

I am not immediately understanding the use of the term monotonically
non-decreasing here? I think the first sentence of this comment is
enough, or am I missing a nuance?

[..]
> +EXPORT_SYMBOL_GPL(numa_fill_memblks);

This export is not needed. The only caller of this is
drivers/acpi/numa/srat.c which is only ever built-in, not a module.
On Wed, Jun 14, 2023 at 05:44:19AM -0700, Dan Williams wrote:
> alison.schofield@ wrote:
> > From: Alison Schofield <alison.schofield@intel.com>
> >
> > numa_fill_memblks() fills in the gaps in numa_meminfo memblks
> > over an HPA address range.
[..]
> > +static int __init cmp_memblk(const void *a, const void *b)
> > +{
> > +	const struct numa_memblk *ma = *(const struct numa_memblk **)a;
> > +	const struct numa_memblk *mb = *(const struct numa_memblk **)b;
> > +
> > +	if (ma->start != mb->start)
> > +		return (ma->start < mb->start) ? -1 : 1;
> > +
> > +	/* Caller handles duplicate start addresses */
> > +	return 0;
>
> This can be simplified to:
>
> static int __init cmp_memblk(const void *a, const void *b)
> {
> 	const struct numa_memblk *ma = *(const struct numa_memblk **)a;
> 	const struct numa_memblk *mb = *(const struct numa_memblk **)b;
>
> 	return ma->start - mb->start;
> }

Got it.

[..]
> > +	/*
> > +	 * Fill any gaps by tracking the previous memblks end address,
> > +	 * prev_end, and backfilling to it if needed. Avoid filling
> > +	 * overlapping memblks by making prev_end monotonically non-
> > +	 * decreasing.
>
> I am not immediately understanding the use of the term monotonically
> non-decreasing here? I think the first sentence of this comment is
> enough, or am I missing a nuance?

Not sure it's a nuance, but if we advanced prev_end to be curr->end at
each iteration, gaps would get needlessly filled when curr is wholly
contained in prev. So the 'monotonically non-decreasing' comment is
emphasizing that prev_end will either stay the same or increase at each
iteration.

[..]
> > +EXPORT_SYMBOL_GPL(numa_fill_memblks);
>
> This export is not needed. The only caller of this is
> drivers/acpi/numa/srat.c which is only ever built-in, not a module.

Got it. Thanks for the review Dan, and for helping address other
reviewer comments.

Alison
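To make the wholly-contained case concrete, here is a user-space
transliteration of the fill loop from the patch, run over made-up
ranges (the values are hypothetical, chosen so the middle memblk sits
inside the first one):

#include <stdio.h>
#include <stdint.h>

struct blk { uint64_t start, end; };

int main(void)
{
        /* Already sorted by start; blk[1] is wholly contained in
         * blk[0], and there is a gap before blk[2]. */
        struct blk blk[] = {
                { 0x000, 0x400 },
                { 0x100, 0x200 },
                { 0x600, 0x800 },
        };
        uint64_t prev_end = blk[0].end;

        for (int i = 1; i < 3; i++) {
                struct blk *curr = &blk[i];

                if (prev_end >= curr->start) {
                        /* Overlap/containment: only ever move prev_end
                         * forward. Naively setting prev_end = curr->end
                         * here would drop it to 0x200 for blk[1], and
                         * blk[2] would then be backfilled over
                         * [0x200-0x400), a range blk[0] already covers. */
                        if (prev_end < curr->end)
                                prev_end = curr->end;
                } else {
                        /* Real gap: backfill curr->start to prev_end. */
                        curr->start = prev_end;
                        prev_end = curr->end;
                }
        }

        /* Prints [0-0x400) [0x100-0x200) [0x400-0x800) */
        for (int i = 0; i < 3; i++)
                printf("[%#llx-%#llx) ", (unsigned long long)blk[i].start,
                       (unsigned long long)blk[i].end);
        printf("\n");
        return 0;
}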
On Wed, Jun 14, 2023 at 09:35:22AM +0200, Wilczynski, Michal wrote:
> On 6/14/2023 6:35 AM, alison.schofield@intel.com wrote:
> > From: Alison Schofield <alison.schofield@intel.com>
> >
> > numa_fill_memblks() fills in the gaps in numa_meminfo memblks
> > over an HPA address range.
[..]
> > +static int __init cmp_memblk(const void *a, const void *b)
> > +{
> > +	const struct numa_memblk *ma = *(const struct numa_memblk **)a;
> > +	const struct numa_memblk *mb = *(const struct numa_memblk **)b;
>
> Is this casting necessary ?

Thanks for the review Michal. I found the cast to be necessary. sort()
passes pointers to the array elements to compare, even if the elements
are already pointers, so cmp_memblk() gets a double pointer.

[..]
> >  #define NUMA_NO_NODE	(-1)
> > +#define NUMA_NO_MEMBLK	(-1)
>
> Same error code as NUMA_NO_NODE ?

Yes. It's a define for convenience/clarity, rather than just using
(-1). I could have just used NUMA_NO_NODE, since no memblk also means
no node, but in a function whose job is to fill memblks, that seemed
wrong.

[..]
> > +#ifndef numa_fill_memblks
>
> Why not just #ifndef CONFIG_NUMA_KEEP_MEMINFO ?

Dan responded to this, nothing to add to that:

    This is due to the fact that multiple archs use
    CONFIG_NUMA_KEEP_MEMINFO (x86, ARM64, LOONGARCH), but only one
    supplies a numa_fill_memblks() implementation (x86).

> > +static inline int __init numa_fill_memblks(u64 start, u64 end)
> > +{
> > +	return NUMA_NO_MEMBLK;
> > +}
> > +#endif
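Alison's double-pointer point can be reproduced in user space with
qsort(), which hands its comparator pointers to the array elements in
the same way (standalone sketch; the names mirror the patch but the
harness is invented):

#include <stdio.h>
#include <stdlib.h>

struct numa_memblk { unsigned long long start, end; };

/* The comparator receives pointers to the array *elements*. The
 * elements here are themselves pointers, so each argument is really a
 * struct numa_memblk **, and the cast strips that extra level. */
static int cmp_memblk(const void *a, const void *b)
{
        const struct numa_memblk *ma = *(const struct numa_memblk **)a;
        const struct numa_memblk *mb = *(const struct numa_memblk **)b;

        if (ma->start != mb->start)
                return (ma->start < mb->start) ? -1 : 1;
        return 0;
}

int main(void)
{
        struct numa_memblk blks[] = { { 0x200, 0x300 }, { 0x000, 0x100 } };
        struct numa_memblk *list[] = { &blks[0], &blks[1] };

        qsort(list, 2, sizeof(list[0]), cmp_memblk);
        printf("%#llx\n", list[0]->start);      /* prints 0: sorted by start */
        return 0;
}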
On Wed, Jun 14, 2023 at 05:27:39AM -0700, Dan Williams wrote:
> Wilczynski, Michal wrote:
> > On 6/14/2023 6:35 AM, alison.schofield@intel.com wrote:
> > > +static int __init cmp_memblk(const void *a, const void *b)
> > > +{
> > > +	const struct numa_memblk *ma = *(const struct numa_memblk **)a;
> > > +	const struct numa_memblk *mb = *(const struct numa_memblk **)b;
> >
> > Is this casting necessary ?
>
> This is idiomatic for sort() comparison handlers.

Aside from that, it *is* actually required: since sort() does indirect
calls to its cmp_func_t argument, the Control Flow Integrity (CFI, not
to be confused with Call-Frame-Information) stuff has a hard
requirement that function signatures match.

At the very least clang builds should warn if you do indirect calls
with non-matching signatures these days. And kCFI enabled builds will
get you a runtime error if you manage to ignore that warning.
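A sketch of the signature constraint Peter is describing, using the
kernel's cmp_func_t typedef; the two comparators below are illustrative
rather than taken from the patch:

/* From include/linux/types.h: the type sort() calls through. */
typedef int (*cmp_func_t)(const void *a, const void *b);

struct numa_memblk { unsigned long long start, end; };

/* Matches cmp_func_t exactly -- the cast happens inside the body, so
 * kCFI sees a conforming signature at the indirect call site. */
static int cmp_ok(const void *a, const void *b)
{
        const struct numa_memblk *ma = *(const struct numa_memblk **)a;
        const struct numa_memblk *mb = *(const struct numa_memblk **)b;

        return (ma->start > mb->start) - (ma->start < mb->start);
}

/* Same ABI on most targets, but a different signature. Shoehorning
 * this into a cmp_func_t with a function-pointer cast is what draws a
 * clang warning at build time and a kCFI failure at runtime. */
static int cmp_mismatched(const struct numa_memblk **a,
                          const struct numa_memblk **b)
{
        return ((*a)->start > (*b)->start) - ((*a)->start < (*b)->start);
}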
On Tue, Jun 13, 2023 at 09:35:24PM -0700, alison.schofield@intel.com wrote:
> From: Alison Schofield <alison.schofield@intel.com>
>
> numa_fill_memblks() fills in the gaps in numa_meminfo memblks
> over an HPA address range.

What's with the Host part, should that not simply be PA ?
On Thu, Jun 15, 2023 at 10:34:07AM +0200, Peter Zijlstra wrote:
> On Tue, Jun 13, 2023 at 09:35:24PM -0700, alison.schofield@intel.com wrote:
> > From: Alison Schofield <alison.schofield@intel.com>
> >
> > numa_fill_memblks() fills in the gaps in numa_meminfo memblks
> > over an HPA address range.
>
> What's with the Host part, should that not simply be PA ?

Yes, it should be PA. The HPA acronym is CXL-land language, where we
qualify physical addresses as either HPAs (Host) or DPAs (Device),
seeping in needlessly.
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index 64df897c0ee3..1be13b2dfe8b 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -37,6 +37,8 @@ extern int phys_to_target_node(phys_addr_t start);
 #define phys_to_target_node phys_to_target_node
 extern int memory_add_physaddr_to_nid(u64 start);
 #define memory_add_physaddr_to_nid memory_add_physaddr_to_nid
+extern int numa_fill_memblks(u64 start, u64 end);
+#define numa_fill_memblks numa_fill_memblks
 #endif
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 2aadb2019b4f..fa82141d1a04 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -11,6 +11,7 @@
 #include <linux/nodemask.h>
 #include <linux/sched.h>
 #include <linux/topology.h>
+#include <linux/sort.h>
 
 #include <asm/e820/api.h>
 #include <asm/proto.h>
@@ -961,4 +962,90 @@ int memory_add_physaddr_to_nid(u64 start)
 	return nid;
 }
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+
+static int __init cmp_memblk(const void *a, const void *b)
+{
+	const struct numa_memblk *ma = *(const struct numa_memblk **)a;
+	const struct numa_memblk *mb = *(const struct numa_memblk **)b;
+
+	if (ma->start != mb->start)
+		return (ma->start < mb->start) ? -1 : 1;
+
+	/* Caller handles duplicate start addresses */
+	return 0;
+}
+
+static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;
+
+/**
+ * numa_fill_memblks - Fill gaps in numa_meminfo memblks
+ * @start: address to begin fill
+ * @end: address to end fill
+ *
+ * Find and extend numa_meminfo memblks to cover the @start-@end
+ * HPA address range, such that the first memblk includes @start,
+ * the last memblk includes @end, and any gaps in between are
+ * filled.
+ *
+ * RETURNS:
+ * 0              : Success
+ * NUMA_NO_MEMBLK : No memblk exists in @start-@end range
+ */
+
+int __init numa_fill_memblks(u64 start, u64 end)
+{
+	struct numa_memblk **blk = &numa_memblk_list[0];
+	struct numa_meminfo *mi = &numa_meminfo;
+	int count = 0;
+	u64 prev_end;
+
+	/*
+	 * Create a list of pointers to numa_meminfo memblks that
+	 * overlap start, end. Exclude (start == bi->end) since
+	 * end addresses in both a CFMWS range and a memblk range
+	 * are exclusive.
+	 *
+	 * This list of pointers is used to make in-place changes
+	 * that fill out the numa_meminfo memblks.
+	 */
+	for (int i = 0; i < mi->nr_blks; i++) {
+		struct numa_memblk *bi = &mi->blk[i];
+
+		if (start < bi->end && end >= bi->start) {
+			blk[count] = &mi->blk[i];
+			count++;
+		}
+	}
+	if (!count)
+		return NUMA_NO_MEMBLK;
+
+	/* Sort the list of pointers in memblk->start order */
+	sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL);
+
+	/* Make sure the first/last memblks include start/end */
+	blk[0]->start = min(blk[0]->start, start);
+	blk[count - 1]->end = max(blk[count - 1]->end, end);
+
+	/*
+	 * Fill any gaps by tracking the previous memblks end address,
+	 * prev_end, and backfilling to it if needed. Avoid filling
+	 * overlapping memblks by making prev_end monotonically non-
+	 * decreasing.
+	 */
+	prev_end = blk[0]->end;
+	for (int i = 1; i < count; i++) {
+		struct numa_memblk *curr = blk[i];
+
+		if (prev_end >= curr->start) {
+			if (prev_end < curr->end)
+				prev_end = curr->end;
+		} else {
+			curr->start = prev_end;
+			prev_end = curr->end;
+		}
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(numa_fill_memblks);
+
 #endif
diff --git a/include/linux/numa.h b/include/linux/numa.h
index 59df211d051f..0f512c0aba54 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -12,6 +12,7 @@
 #define MAX_NUMNODES	(1 << NODES_SHIFT)
 
 #define NUMA_NO_NODE	(-1)
+#define NUMA_NO_MEMBLK	(-1)
 
 /* optionally keep NUMA memory info available post init */
 #ifdef CONFIG_NUMA_KEEP_MEMINFO
@@ -43,6 +44,12 @@ static inline int phys_to_target_node(u64 start)
 	return 0;
 }
 #endif
+#ifndef numa_fill_memblks
+static inline int __init numa_fill_memblks(u64 start, u64 end)
+{
+	return NUMA_NO_MEMBLK;
+}
+#endif
 #else /* !CONFIG_NUMA */
 static inline int numa_map_to_online_node(int node)
 {
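To show how the helper is meant to be consumed, here is a hypothetical
caller sketch for the ACPI CFMWS parsing path. handle_cfmws_range() and
alloc_new_node_for_window() are invented for illustration; this is not
the actual srat.c change from this series:

/* Hypothetical caller sketch -- not the actual srat.c patch. */
static int __init handle_cfmws_range(u64 start, u64 end)
{
        /*
         * If any SRAT-described memblk already intersects this CFMWS
         * window, extend the intersecting proximity domain(s) to cover
         * the whole window and stop.
         */
        if (numa_fill_memblks(start, end) != NUMA_NO_MEMBLK)
                return 0;

        /*
         * No intersection: set up a fresh proximity domain for the
         * window instead (in the real driver, along the lines of
         * acpi_map_pxm_to_node() + numa_add_memblk()).
         */
        return alloc_new_node_for_window(start, end);   /* hypothetical */
}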