@@ -939,6 +939,14 @@ static inline int is_node_dram(int nid)
return test_bit(PGDAT_DRAM, &pgdat->flags);
}
+static inline int is_node_same_type(int nida, int nidb)
+{
+ if (node_isset(nida, numa_nodes_pmem))
+ return node_isset(nidb, numa_nodes_pmem);
+ else
+ return node_isset(nidb, numa_nodes_dram);
+}
+
static inline void set_node_type(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
@@ -5372,7 +5372,7 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
*
* Return: node id of the found node or %NUMA_NO_NODE if no node is found.
*/
-static int find_next_best_node(int node, nodemask_t *used_node_mask)
+static int find_next_best_node(int node, nodemask_t *used_node_mask, int need_same_type)
{
int n, val;
int min_val = INT_MAX;
@@ -5380,7 +5380,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
const struct cpumask *tmp = cpumask_of_node(0);
/* Use the local node if we haven't already */
- if (!node_isset(node, *used_node_mask)) {
+ if (need_same_type && !node_isset(node, *used_node_mask)) {
node_set(node, *used_node_mask);
return node;
}
@@ -5391,6 +5391,12 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
if (node_isset(n, *used_node_mask))
continue;
+ if (need_same_type && !is_node_same_type(node, n))
+ continue;
+
+ if (!need_same_type && is_node_same_type(node, n))
+ continue;
+
/* Use the distance array to find the distance */
val = node_distance(node, n);
@@ -5472,31 +5478,35 @@ static void build_zonelists(pg_data_t *pgdat)
int node, load, nr_nodes = 0;
nodemask_t used_mask;
int local_node, prev_node;
+ int need_same_type;
/* NUMA-aware ordering of nodes */
local_node = pgdat->node_id;
load = nr_online_nodes;
prev_node = local_node;
- nodes_clear(used_mask);
memset(node_order, 0, sizeof(node_order));
- while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
- /*
- * We don't want to pressure a particular node.
- * So adding penalty to the first node in same
- * distance group to make it round-robin.
- */
- if (node_distance(local_node, node) !=
- node_distance(local_node, prev_node))
- node_load[node] = load;
+ for (need_same_type = 1; need_same_type >= 0; need_same_type--) {
+ nodes_clear(used_mask);
+ while ((node = find_next_best_node(local_node, &used_mask,
+ need_same_type)) >= 0) {
+ /*
+ * We don't want to pressure a particular node.
+ * So adding penalty to the first node in same
+ * distance group to make it round-robin.
+ */
+ if (node_distance(local_node, node) !=
+ node_distance(local_node, prev_node))
+ node_load[node] = load;
- node_order[nr_nodes++] = node;
- prev_node = node;
- load--;
+ node_order[nr_nodes++] = node;
+ prev_node = node;
+ load--;
+ }
}
-
build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
build_thisnode_zonelists(pgdat);
+
}
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
On a box with both DRAM and PMEM managed by the mm system, usually
nodes 0 and 1 are DRAM nodes and nodes 2 and 3 are PMEM nodes. The
nofallback list is the same as before; the fallback list is now
redesigned to be arranged on a node type basis, i.e. an allocation
request for a DRAM page starting from node 0 will walk the
node0->node1->node2->node3 zonelists.

Signed-off-by: Fan Du <fan.du@intel.com>
---
 include/linux/mmzone.h |  8 ++++++++
 mm/page_alloc.c        | 42 ++++++++++++++++++++++++++----------------
 2 files changed, 34 insertions(+), 16 deletions(-)
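For illustration only, not part of the patch: a minimal user-space sketch
of the two-pass fallback ordering, assuming a 4-node box where nodes 0-1
are DRAM and nodes 2-3 are PMEM. The node_is_pmem array, the toy distance
table and the simplified find_next_best() helper are invented for this
example; the kernel versions additionally apply the node_load round-robin
penalty and CPU-locality checks, which are omitted here.

#include <stdio.h>

#define NR_NODES 4

/* Assumed topology: nodes 0-1 are DRAM, nodes 2-3 are PMEM. */
static const int node_is_pmem[NR_NODES] = { 0, 0, 1, 1 };

/* Toy symmetric distance table: local 10, same type 21, cross type 40. */
static const int distance[NR_NODES][NR_NODES] = {
	{ 10, 21, 40, 40 },
	{ 21, 10, 40, 40 },
	{ 40, 40, 10, 21 },
	{ 40, 40, 21, 10 },
};

static int same_type(int a, int b)
{
	return node_is_pmem[a] == node_is_pmem[b];
}

/* Pick the nearest not-yet-used node that matches the type constraint. */
static int find_next_best(int node, int *used, int need_same_type)
{
	int n, best = -1, best_val = 1 << 30;

	for (n = 0; n < NR_NODES; n++) {
		if (used[n])
			continue;
		/* Mirrors the two type checks added to find_next_best_node(). */
		if (need_same_type != same_type(node, n))
			continue;
		if (distance[node][n] < best_val) {
			best_val = distance[node][n];
			best = n;
		}
	}
	if (best >= 0)
		used[best] = 1;
	return best;
}

int main(void)
{
	int local_node = 0;
	int order[NR_NODES], nr_nodes = 0;
	int need_same_type, node;

	/* First pass collects same-type nodes, second pass the other type. */
	for (need_same_type = 1; need_same_type >= 0; need_same_type--) {
		int used[NR_NODES] = { 0 };

		while ((node = find_next_best(local_node, used,
					      need_same_type)) >= 0)
			order[nr_nodes++] = node;
	}

	printf("fallback order for node %d:", local_node);
	for (node = 0; node < nr_nodes; node++)
		printf(" %d", order[node]);
	printf("\n");
	return 0;
}

With the toy distance table above this prints
"fallback order for node 0: 0 1 2 3", matching the
node0->node1->node2->node3 ordering described in the changelog: all
DRAM nodes are tried before any PMEM node, nearest first within each
type.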