@@ -81,6 +81,92 @@ int acpi_map_pxm_to_node(int pxm)
}
EXPORT_SYMBOL(acpi_map_pxm_to_node);
+#ifdef CONFIG_NUMA_EMU
+/*
+ * Rebuild the pxm_to_node_map[] and node_to_pxm_map[] tables so that they
+ * account for the fake-numa nodes 0..max_nid (i.e. max_nid + 1 nodes).
+ *
+ * Each fake node inherits the PXM of the physical node it emulates, as
+ * recorded in emu_nid_to_phys[]. Physical nodes that back no fake node are
+ * moved to the first free node ids and are added to numa_nodes_parsed.
+ *
+ * Returns 0 on success, or -1 if NUMA/SRAT is disabled or the mapping is
+ * inconsistent. The live tables and numa_nodes_parsed are only written
+ * once every check has passed, so they are left untouched on failure.
+ */
+int __init fix_pxm_node_maps(int max_nid)
+{
+	static int pxm_to_node_map_copy[MAX_PXM_DOMAINS] __initdata
+			= { [0 ... MAX_PXM_DOMAINS - 1] = NUMA_NO_NODE };
+	static int node_to_pxm_map_copy[MAX_NUMNODES] __initdata
+			= { [0 ... MAX_NUMNODES - 1] = PXM_INVAL };
+	int i, j, index = -1, count = 0;
+	nodemask_t nodes_to_enable;
+
+	if (numa_off || srat_disabled())
+		return -1;
+
+	/* emu_nid_to_phys[] only holds MAX_NUMNODES entries */
+	if (WARN(max_nid >= MAX_NUMNODES,
+		 "max nid %d exceeds MAX_NUMNODES\n", max_nid))
+		return -1;
+
+	/* find fake nodes PXM mapping */
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		if (node_to_pxm_map[i] == PXM_INVAL)
+			continue;
+		for (j = 0; j <= max_nid; j++) {
+			if (emu_nid_to_phys[j] != i)
+				continue;
+			/* a fake node must map to exactly one PXM */
+			if (WARN(node_to_pxm_map_copy[j] != PXM_INVAL,
+				 "Node %d is already binded to PXM %d\n",
+				 j, node_to_pxm_map_copy[j]))
+				return -1;
+			node_to_pxm_map_copy[j] = node_to_pxm_map[i];
+			if (j > index)
+				index = j;
+			count++;
+		}
+	}
+	/* the highest fake node that got a PXM must be max_nid itself */
+	if (WARN(index != max_nid, "%d max nid when expected %d\n",
+		 index, max_nid))
+		return -1;
+
+	nodes_clear(nodes_to_enable);
+
+	/* map phys nodes not used for fake nodes */
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		if (node_to_pxm_map[i] == PXM_INVAL)
+			continue;
+		for (j = 0; j <= max_nid; j++)
+			if (emu_nid_to_phys[j] == i)
+				break;
+		/* fake nodes PXM mapping has been done */
+		if (j <= max_nid)
+			continue;
+		/* find first hole */
+		for (j = 0;
+		     j < MAX_NUMNODES &&
+		     node_to_pxm_map_copy[j] != PXM_INVAL;
+		     j++)
+			;
+		if (WARN(j == MAX_NUMNODES,
+			 "Number of nodes exceeds MAX_NUMNODES\n"))
+			return -1;
+		node_to_pxm_map_copy[j] = node_to_pxm_map[i];
+		node_set(j, nodes_to_enable);
+		count++;
+	}
+
+	/*
+	 * Create the reverse mapping. When several nodes share a PXM, the
+	 * lowest node id wins because later nodes find the slot already set.
+	 */
+	for (i = 0; i < MAX_NUMNODES; i++)
+		if (node_to_pxm_map_copy[i] != PXM_INVAL &&
+		    pxm_to_node_map_copy[node_to_pxm_map_copy[i]] == NUMA_NO_NODE)
+			pxm_to_node_map_copy[node_to_pxm_map_copy[i]] = i;
+
+	/*
+	 * Overwrite with the new mapping. The two tables are walked with
+	 * their own bounds: the PXM-indexed copy has MAX_PXM_DOMAINS entries,
+	 * so bounding it by MAX_NUMNODES would either leave stale entries
+	 * behind or run past the end of the copy when the sizes differ.
+	 */
+	for (i = 0; i < MAX_NUMNODES; i++)
+		node_to_pxm_map[i] = node_to_pxm_map_copy[i];
+	for (i = 0; i < MAX_PXM_DOMAINS; i++)
+		pxm_to_node_map[i] = pxm_to_node_map_copy[i];
+
+	/* enable other nodes found in PXM for hotplug */
+	nodes_or(numa_nodes_parsed, nodes_to_enable, numa_nodes_parsed);
+
+	pr_debug("found %d total number of nodes\n", count);
+	return 0;
+}
+#endif
+
static void __init
acpi_table_print_srat_entry(struct acpi_subtable_header *header)
{
@@ -17,11 +17,16 @@ extern int node_to_pxm(int);
extern int acpi_map_pxm_to_node(int);
extern unsigned char acpi_srat_revision;
extern void disable_srat(void);
+extern int fix_pxm_node_maps(int max_nid);
extern void bad_srat(void);
extern int srat_disabled(void);
#else /* CONFIG_ACPI_NUMA */
+static inline int fix_pxm_node_maps(int max_nid)
+{
+ return 0; /* !CONFIG_ACPI_NUMA stub: does nothing and reports success (callers check for < 0) */
+}
static inline void disable_srat(void)
{
}
@@ -29,7 +29,10 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
int __init numa_memblks_init(int (*init_func)(void),
bool memblock_force_top_down);
+extern int numa_distance_cnt;
+
#ifdef CONFIG_NUMA_EMU
+extern int emu_nid_to_phys[MAX_NUMNODES];
int numa_emu_cmdline(char *str);
void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys,
unsigned int nr_emu_nids);
@@ -8,11 +8,12 @@
#include <linux/memblock.h>
#include <linux/numa_memblks.h>
#include <asm/numa.h>
+#include <acpi/acpi_numa.h>
#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
-static int emu_nid_to_phys[MAX_NUMNODES];
+int emu_nid_to_phys[MAX_NUMNODES];
static char *emu_cmdline __initdata;
int __init numa_emu_cmdline(char *str)
@@ -379,6 +380,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
int max_emu_nid, dfl_phys_nid;
int i, j, ret;
+ nodemask_t physnode_mask = numa_nodes_parsed;
if (!emu_cmdline)
goto no_emu;
@@ -395,7 +397,6 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
* split the system RAM into N fake nodes.
*/
if (strchr(emu_cmdline, 'U')) {
- nodemask_t physnode_mask = numa_nodes_parsed;
unsigned long n;
int nid = 0;
@@ -465,9 +466,6 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
*/
max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid);
- /* commit */
- *numa_meminfo = ei;
-
/* Make sure numa_nodes_parsed only contains emulated nodes */
nodes_clear(numa_nodes_parsed);
for (i = 0; i < ARRAY_SIZE(ei.blk); i++)
@@ -475,10 +473,21 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
ei.blk[i].nid != NUMA_NO_NODE)
node_set(ei.blk[i].nid, numa_nodes_parsed);
- numa_emu_update_cpu_to_node(emu_nid_to_phys, ARRAY_SIZE(emu_nid_to_phys));
+ /* Fix pxm_to_node_map[] and node_to_pxm_map[] to avoid collisions
+ * with fake NUMA nodes, particularly during later memory hotplug
+ * handling, and update numa_nodes_parsed accordingly.
+ */
+ ret = fix_pxm_node_maps(max_emu_nid);
+ if (ret < 0)
+ goto no_emu;
+
+ /* commit */
+ *numa_meminfo = ei;
+
+ numa_emu_update_cpu_to_node(emu_nid_to_phys, max_emu_nid + 1);
/* make sure all emulated nodes are mapped to a physical node */
- for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
+ for (i = 0; i < max_emu_nid + 1; i++)
if (emu_nid_to_phys[i] == NUMA_NO_NODE)
emu_nid_to_phys[i] = dfl_phys_nid;
@@ -501,12 +510,34 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
numa_set_distance(i, j, dist);
}
}
+ for (i = 0; i < numa_distance_cnt; i++) {
+ for (j = 0; j < numa_distance_cnt; j++) {
+ int physi, physj;
+ u8 dist;
+
+ /* distance between fake nodes is already ok */
+ if (emu_nid_to_phys[i] != NUMA_NO_NODE &&
+ emu_nid_to_phys[j] != NUMA_NO_NODE)
+ continue;
+ if (emu_nid_to_phys[i] != NUMA_NO_NODE)
+ physi = emu_nid_to_phys[i];
+ else
+ physi = i - max_emu_nid;
+ if (emu_nid_to_phys[j] != NUMA_NO_NODE)
+ physj = emu_nid_to_phys[j];
+ else
+ physj = j - max_emu_nid;
+ dist = phys_dist[physi * numa_dist_cnt + physj];
+ numa_set_distance(i, j, dist);
+ }
+ }
/* free the copied physical distance table */
memblock_free(phys_dist, phys_size);
return;
no_emu:
+ numa_nodes_parsed = physnode_mask;
/* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */
for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
emu_nid_to_phys[i] = i;
@@ -7,7 +7,7 @@
#include <linux/numa.h>
#include <linux/numa_memblks.h>
-static int numa_distance_cnt;
+int numa_distance_cnt;
static u8 *numa_distance;
nodemask_t numa_nodes_parsed __initdata;
The current fake-numa implementation prevents new NUMA nodes from being hot-plugged later by drivers. A common symptom of this limitation is the "node <X> was absent from the node_possible_map" message and its associated warning in mm/memory_hotplug.c: add_memory_resource(). This comes from the lack of remapping in both the pxm_to_node_map[] and node_to_pxm_map[] tables to take fake-numa nodes into account, which causes collisions with the original mapping, determined from the BIOS tables, that only covers physical nodes. This patch fixes this by doing the necessary node-id translation in both the pxm_to_node_map[] and node_to_pxm_map[] tables. The node_distance[] table has also been fixed accordingly. Signed-off-by: Bruno Faccini <bfaccini@nvidia.com> --- drivers/acpi/numa/srat.c | 86 ++++++++++++++++++++++++++++++++++++ include/acpi/acpi_numa.h | 5 +++ include/linux/numa_memblks.h | 3 ++ mm/numa_emulation.c | 45 ++++++++++++++++--- mm/numa_memblks.c | 2 +- 5 files changed, 133 insertions(+), 8 deletions(-)