diff mbox series

[RFC,v4,3/7] mm/demotion: Build demotion targets based on explicit memory tiers

Message ID 20220527122528.129445-4-aneesh.kumar@linux.ibm.com (mailing list archive)
State New
Headers show
Series mm/demotion: Memory tiers and demotion | expand

Commit Message

Aneesh Kumar K.V May 27, 2022, 12:25 p.m. UTC
From: Jagdish Gediya <jvgediya@linux.ibm.com>

This patch switches the demotion target building logic to use memory tiers
instead of NUMA distance. All N_MEMORY NUMA nodes will be placed in the
default tier 1 and additional memory tiers will be added by drivers like
dax kmem.

This patch builds the demotion target for a NUMA node by looking at all
memory tiers below the tier to which the NUMA node belongs. The closest node
in the immediately following memory tier is used as a demotion target.

Since we are now only building demotion targets for N_MEMORY NUMA nodes,
the CPU hotplug calls are removed in this patch.

Signed-off-by: Jagdish Gediya <jvgediya@linux.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 include/linux/migrate.h |   8 -
 mm/migrate.c            | 460 +++++++++++++++-------------------------
 mm/vmstat.c             |   5 -
 3 files changed, 172 insertions(+), 301 deletions(-)

Comments

Jonathan Cameron May 27, 2022, 2:31 p.m. UTC | #1
On Fri, 27 May 2022 17:55:24 +0530
"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> wrote:

> From: Jagdish Gediya <jvgediya@linux.ibm.com>
> 
> This patch switch the demotion target building logic to use memory tiers
> instead of NUMA distance. All N_MEMORY NUMA nodes will be placed in the
> default tier 1 and additional memory tiers will be added by drivers like
> dax kmem.
> 
> This patch builds the demotion target for a NUMA node by looking at all
> memory tiers below the tier to which the NUMA node belongs. The closest node
> in the immediately following memory tier is used as a demotion target.
> 
> Since we are now only building demotion target for N_MEMORY NUMA nodes
> the CPU hotplug calls are removed in this patch.
> 
> Signed-off-by: Jagdish Gediya <jvgediya@linux.ibm.com>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>

Hi

Diff made a mess of this one!

Anyhow, a few comments inline.

Thanks,

Jonathan


> --- a/mm/migrate.c
> +++ b/mm/migrate.c

> +/*
> + * node_demotion[] examples:

Perhaps call out these are examples of possible default situations.
None are enforced by this code.

> + *
> + * Example 1:
> + *
> + * Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes.
> + *
> + * node distances:
> + * node   0    1    2    3
> + *    0  10   20   30   40
> + *    1  20   10   40   30
> + *    2  30   40   10   40
> + *    3  40   30   40   10
> + *
> + * memory_tiers[0] = <empty>
> + * memory_tiers[1] = 0-1
> + * memory_tiers[2] = 2-3
> + *
> + * node_demotion[0].preferred = 2
> + * node_demotion[1].preferred = 3
> + * node_demotion[2].preferred = <empty>
> + * node_demotion[3].preferred = <empty>
> + *
> + * Example 2:
> + *
> + * Node 0 & 1 are CPU + DRAM nodes, node 2 is memory-only DRAM node.
> + *
> + * node distances:
> + * node   0    1    2
> + *    0  10   20   30
> + *    1  20   10   30
> + *    2  30   30   10
> + *
> + * memory_tiers[0] = <empty>
> + * memory_tiers[1] = 0-2
> + * memory_tiers[2] = <empty>
> + *
> + * node_demotion[0].preferred = <empty>
> + * node_demotion[1].preferred = <empty>
> + * node_demotion[2].preferred = <empty>
> + *
> + * Example 3:
> + *
> + * Node 0 is CPU + DRAM nodes, Node 1 is HBM node, node 2 is PMEM node.
> + *
> + * node distances:
> + * node   0    1    2
> + *    0  10   20   30
> + *    1  20   10   40
> + *    2  30   40   10
> + *
> + * memory_tiers[0] = 1
> + * memory_tiers[1] = 0
> + * memory_tiers[2] = 2
> + *
> + * node_demotion[0].preferred = 2
> + * node_demotion[1].preferred = 0
> + * node_demotion[2].preferred = <empty>
> + *
> + */



>  /* Disable reclaim-based migration. */
>  static void __disable_all_migrate_targets(void)
>  {

> +	int node;
>  
> +	for_each_node_mask(node, node_states[N_MEMORY])
> +		node_demotion[node].preferred = NODE_MASK_NONE;
>  }

>  /*
			    int best_distance)
> +* Find an automatic demotion target for all memory
> +* nodes. Failing here is OK.  It might just indicate
> +* being at the end of a chain.
> +*/
> +static void establish_migration_targets(void)
>  {
Diff did a horrible job on this, so I've reformatted heavily
so I could see what was happening!

>  	struct demotion_nodes *nd;
> +	int tier, target = NUMA_NO_NODE, node;
> +	int distance, best_distance;
> +	nodemask_t used;
>  
>  	if (!node_demotion)
> +		return;
>  
> +	disable_all_migrate_targets();
> +	for_each_node_mask(node, node_states[N_MEMORY]) {
> +		best_distance = -1;
> +		nd = &node_demotion[node];
>  
> +		tier = __node_get_memory_tier(node);
> +		/*
> +		 * Find next tier to demote.

In the discussion of Wei Xu's RFC we concluded that we need
to allow demotion to the nearest node in 'any' higher tier
(now bigger rank).  That functionality matters for even
moderately complex systems.

> +		 */
> +		while (++tier < MAX_MEMORY_TIERS) {
> +			if (memory_tiers[tier])
> +				break;
> +		}

> +		if (tier >= MAX_MEMORY_TIERS)
> +			continue;
>  
> +		nodes_andnot(used, node_states[N_MEMORY], memory_tiers[tier]->nodelist); 

I'm a bit lost on this one.  Perhaps a comment to say what 'used' represents?
I was expecting all memory nodes in tiers with rank > current tier. I'm not sure that's what
we have here.

>  
>  		/*
> +		 * Find all the nodes in the memory tier node list of same best distance.
> +		 * add add them to the preferred mask. We randomly select between nodes

repeated add.

> +		 * in the preferred mask when allocating pages during demotion.
>  		 */
>  		do {
> +			target = find_next_best_node(node, &used);
> +			if (target == NUMA_NO_NODE)
>  				break;
>  
> +			distance = node_distance(node, target);
> +			if (distance == best_distance || best_distance == -1) {
> +				best_distance = distance;
> +				node_set(target, nd->preferred);
> +			} else {
> +				break;
> +			}
>  		} while (1);
>  	}

>  }
>
kernel test robot May 30, 2022, 3:35 a.m. UTC | #2
Greeting,

FYI, we noticed the following commit (built with gcc-11):

commit: 8ebccd60c2db6beefef2f39b05a95024be0c39eb ("[RFC PATCH v4 3/7] mm/demotion: Build demotion targets based on explicit memory tiers")
url: https://github.com/intel-lab-lkp/linux/commits/Aneesh-Kumar-K-V/mm-demotion-Add-support-for-explicit-memory-tiers/20220527-212536
base: https://git.kernel.org/cgit/linux/kernel/git/gregkh/driver-core.git b232b02bf3c205b13a26dcec08e53baddd8e59ed
patch link: https://lore.kernel.org/linux-mm/20220527122528.129445-4-aneesh.kumar@linux.ibm.com

in testcase: boot

on test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G

caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):



If you fix the issue, kindly add the following tag
Reported-by: kernel test robot <oliver.sang@intel.com>


[    2.576581][    T1] debug_vm_pgtable: [debug_vm_pgtable         ]: Validating architecture page table helpers
[    2.584367][    T1] BUG: sleeping function called from invalid context at mm/compaction.c:540
[    2.585275][    T1] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 1, name: swapper/0
[    2.586166][    T1] preempt_count: 1, expected: 0
[    2.586668][    T1] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.18.0-rc5-00059-g8ebccd60c2db #1
[    2.587562][    T1] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-debian-1.16.0-4 04/01/2014
[    2.588577][    T1] Call Trace:
[    2.588948][    T1]  <TASK>
[    2.589284][    T1]  dump_stack_lvl+0x34/0x44
[    2.589765][    T1]  __might_resched+0x134/0x149
[    2.590253][    T1]  isolate_freepages_block+0xe6/0x2d3
[    2.590794][    T1]  isolate_freepages_range+0xc5/0x118
[    2.591342][    T1]  alloc_contig_range+0x2dd/0x350
[    2.591858][    T1]  ? alloc_contig_pages+0x170/0x194
[    2.592384][    T1]  alloc_contig_pages+0x170/0x194
[    2.592896][    T1]  init_args+0x3d0/0x44e
[    2.593345][    T1]  ? init_args+0x44e/0x44e
[    2.593816][    T1]  debug_vm_pgtable+0x46/0x809
[    2.594312][    T1]  ? alloc_inode+0x37/0x8e
[    2.594774][    T1]  ? init_args+0x44e/0x44e
[    2.595235][    T1]  do_one_initcall+0x83/0x187
[    2.595729][    T1]  do_initcalls+0xc6/0xdf
[    2.596190][    T1]  kernel_init_freeable+0x10d/0x13c
[    2.596721][    T1]  ? rest_init+0xcd/0xcd
[    2.597170][    T1]  kernel_init+0x16/0x11a
[    2.597636][    T1]  ret_from_fork+0x22/0x30
[    2.598097][    T1]  </TASK>
[    2.626547][    T1] ------------[ cut here ]------------
[    2.627157][    T1] initcall debug_vm_pgtable+0x0/0x809 returned with preemption imbalance
[    2.628019][    T1] WARNING: CPU: 0 PID: 1 at init/main.c:1311 do_one_initcall+0x140/0x187
[    2.628863][    T1] Modules linked in:
[    2.629280][    T1] CPU: 0 PID: 1 Comm: swapper/0 Tainted: G        W         5.18.0-rc5-00059-g8ebccd60c2db #1
[    2.630295][    T1] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-debian-1.16.0-4 04/01/2014
[    2.631306][    T1] RIP: 0010:do_one_initcall+0x140/0x187
[    2.631867][    T1] Code: 00 00 48 c7 c6 ca b6 2c 82 48 89 e7 e8 80 ca 44 00 fb 80 3c 24 00 74 14 48 89 e2 48 89 ee 48 c7 c7 df b6 2c 82 e8 b3 d6 a2 00 <0f> 0b 48 8b 44 24 40 65 48 2b 04 25 28 00 00 00 74 05 e8 d8 cd a4
[    2.633713][    T1] RSP: 0000:ffffc90000013ea8 EFLAGS: 00010286
[    2.634312][    T1] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000003
[    2.635123][    T1] RDX: 0000000000000216 RSI: 0000000000000001 RDI: 0000000000000001
[    2.635932][    T1] RBP: ffffffff82f3b694 R08: 0000000000000000 R09: 0000000000000019
[    2.636735][    T1] R10: 0000000000000000 R11: 0000000074696e69 R12: 0000000000000000
[    2.637538][    T1] R13: ffff88810cba0000 R14: 0000000000000000 R15: 0000000000000000
[    2.638353][    T1] FS:  0000000000000000(0000) GS:ffff88842fc00000(0000) knlGS:0000000000000000
[    2.639253][    T1] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    2.639901][    T1] CR2: ffff88843ffff000 CR3: 0000000002612000 CR4: 00000000000406f0
[    2.640711][    T1] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[    2.641526][    T1] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[    2.642341][    T1] Call Trace:
[    2.642707][    T1]  <TASK>
[    2.643051][    T1]  do_initcalls+0xc6/0xdf
[    2.643512][    T1]  kernel_init_freeable+0x10d/0x13c
[    2.644045][    T1]  ? rest_init+0xcd/0xcd
[    2.644498][    T1]  kernel_init+0x16/0x11a
[    2.644956][    T1]  ret_from_fork+0x22/0x30
[    2.645417][    T1]  </TASK>
[    2.645764][    T1] ---[ end trace 0000000000000000 ]---



To reproduce:

        # build kernel
	cd linux
	cp config-5.18.0-rc5-00059-g8ebccd60c2db .config
	make HOSTCC=gcc-11 CC=gcc-11 ARCH=x86_64 olddefconfig prepare modules_prepare bzImage modules
	make HOSTCC=gcc-11 CC=gcc-11 ARCH=x86_64 INSTALL_MOD_PATH=<mod-install-dir> modules_install
	cd <mod-install-dir>
	find lib/ | cpio -o -H newc --quiet | gzip > modules.cgz


        git clone https://github.com/intel/lkp-tests.git
        cd lkp-tests
        bin/lkp qemu -k <bzImage> -m modules.cgz job-script # job-script is attached in this email

        # if you come across any failure that blocks the test,
        # please remove ~/.lkp and /lkp dir to run from a clean state.
diff mbox series

Patch

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index d37d1d5dee82..cbef71a499c1 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -177,12 +177,6 @@  enum memory_tier_type {
 };
 
 int next_demotion_node(int node);
-extern void migrate_on_reclaim_init(void);
-#ifdef CONFIG_HOTPLUG_CPU
-extern void set_migration_target_nodes(void);
-#else
-static inline void set_migration_target_nodes(void) {}
-#endif
 int node_get_memory_tier(int node);
 int node_set_memory_tier(int node, int tier);
 int node_reset_memory_tier(int node, int tier);
@@ -193,8 +187,6 @@  static inline int next_demotion_node(int node)
 	return NUMA_NO_NODE;
 }
 
-static inline void set_migration_target_nodes(void) {}
-static inline void migrate_on_reclaim_init(void) {}
 #endif	/* CONFIG_TIERED_MEMORY */
 
 #endif /* _LINUX_MIGRATE_H */
diff --git a/mm/migrate.c b/mm/migrate.c
index 304559ba3372..d819a64db5b1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2125,6 +2125,10 @@  struct memory_tier {
 	nodemask_t nodelist;
 };
 
+struct demotion_nodes {
+	nodemask_t preferred;
+};
+
 #define to_memory_tier(device) container_of(device, struct memory_tier, dev)
 
 static struct bus_type memory_tier_subsys = {
@@ -2132,9 +2136,73 @@  static struct bus_type memory_tier_subsys = {
 	.dev_name = "memtier",
 };
 
+static void establish_migration_targets(void);
+
 DEFINE_MUTEX(memory_tier_lock);
 static struct memory_tier *memory_tiers[MAX_MEMORY_TIERS];
 
+/*
+ * node_demotion[] examples:
+ *
+ * Example 1:
+ *
+ * Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes.
+ *
+ * node distances:
+ * node   0    1    2    3
+ *    0  10   20   30   40
+ *    1  20   10   40   30
+ *    2  30   40   10   40
+ *    3  40   30   40   10
+ *
+ * memory_tiers[0] = <empty>
+ * memory_tiers[1] = 0-1
+ * memory_tiers[2] = 2-3
+ *
+ * node_demotion[0].preferred = 2
+ * node_demotion[1].preferred = 3
+ * node_demotion[2].preferred = <empty>
+ * node_demotion[3].preferred = <empty>
+ *
+ * Example 2:
+ *
+ * Node 0 & 1 are CPU + DRAM nodes, node 2 is memory-only DRAM node.
+ *
+ * node distances:
+ * node   0    1    2
+ *    0  10   20   30
+ *    1  20   10   30
+ *    2  30   30   10
+ *
+ * memory_tiers[0] = <empty>
+ * memory_tiers[1] = 0-2
+ * memory_tiers[2] = <empty>
+ *
+ * node_demotion[0].preferred = <empty>
+ * node_demotion[1].preferred = <empty>
+ * node_demotion[2].preferred = <empty>
+ *
+ * Example 3:
+ *
+ * Node 0 is CPU + DRAM nodes, Node 1 is HBM node, node 2 is PMEM node.
+ *
+ * node distances:
+ * node   0    1    2
+ *    0  10   20   30
+ *    1  20   10   40
+ *    2  30   40   10
+ *
+ * memory_tiers[0] = 1
+ * memory_tiers[1] = 0
+ * memory_tiers[2] = 2
+ *
+ * node_demotion[0].preferred = 2
+ * node_demotion[1].preferred = 0
+ * node_demotion[2].preferred = <empty>
+ *
+ */
+static struct demotion_nodes *node_demotion __read_mostly;
+
 static ssize_t nodelist_show(struct device *dev,
 			     struct device_attribute *attr, char *buf)
 {
@@ -2238,6 +2306,28 @@  static int __node_get_memory_tier(int node)
 	return -1;
 }
 
+static void node_remove_from_memory_tier(int node)
+{
+	int tier;
+
+	mutex_lock(&memory_tier_lock);
+
+	tier = __node_get_memory_tier(node);
+
+	/*
+	 * Remove node from tier, if tier becomes
+	 * empty then unregister it to make it invisible
+	 * in sysfs.
+	 */
+	node_clear(node, memory_tiers[tier]->nodelist);
+	if (nodes_empty(memory_tiers[tier]->nodelist))
+		unregister_memory_tier(tier);
+
+	establish_migration_targets();
+
+	mutex_unlock(&memory_tier_lock);
+}
+
 int node_get_memory_tier(int node)
 {
 	int tier;
@@ -2271,6 +2361,7 @@  int __node_set_memory_tier(int node, int tier)
 	}
 
 	node_set(node, memory_tiers[tier]->nodelist);
+	establish_migration_targets();
 
 out:
 	return ret;
@@ -2328,75 +2419,6 @@  int node_set_memory_tier(int node, int tier)
 	return ret;
 }
 
-/*
- * node_demotion[] example:
- *
- * Consider a system with two sockets.  Each socket has
- * three classes of memory attached: fast, medium and slow.
- * Each memory class is placed in its own NUMA node.  The
- * CPUs are placed in the node with the "fast" memory.  The
- * 6 NUMA nodes (0-5) might be split among the sockets like
- * this:
- *
- *	Socket A: 0, 1, 2
- *	Socket B: 3, 4, 5
- *
- * When Node 0 fills up, its memory should be migrated to
- * Node 1.  When Node 1 fills up, it should be migrated to
- * Node 2.  The migration path start on the nodes with the
- * processors (since allocations default to this node) and
- * fast memory, progress through medium and end with the
- * slow memory:
- *
- *	0 -> 1 -> 2 -> stop
- *	3 -> 4 -> 5 -> stop
- *
- * This is represented in the node_demotion[] like this:
- *
- *	{  nr=1, nodes[0]=1 }, // Node 0 migrates to 1
- *	{  nr=1, nodes[0]=2 }, // Node 1 migrates to 2
- *	{  nr=0, nodes[0]=-1 }, // Node 2 does not migrate
- *	{  nr=1, nodes[0]=4 }, // Node 3 migrates to 4
- *	{  nr=1, nodes[0]=5 }, // Node 4 migrates to 5
- *	{  nr=0, nodes[0]=-1 }, // Node 5 does not migrate
- *
- * Moreover some systems may have multiple slow memory nodes.
- * Suppose a system has one socket with 3 memory nodes, node 0
- * is fast memory type, and node 1/2 both are slow memory
- * type, and the distance between fast memory node and slow
- * memory node is same. So the migration path should be:
- *
- *	0 -> 1/2 -> stop
- *
- * This is represented in the node_demotion[] like this:
- *	{ nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2
- *	{ nr=0, nodes[0]=-1, }, // Node 1 dose not migrate
- *	{ nr=0, nodes[0]=-1, }, // Node 2 does not migrate
- */
-
-/*
- * Writes to this array occur without locking.  Cycles are
- * not allowed: Node X demotes to Y which demotes to X...
- *
- * If multiple reads are performed, a single rcu_read_lock()
- * must be held over all reads to ensure that no cycles are
- * observed.
- */
-#define DEFAULT_DEMOTION_TARGET_NODES 15
-
-#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES
-#define DEMOTION_TARGET_NODES	(MAX_NUMNODES - 1)
-#else
-#define DEMOTION_TARGET_NODES	DEFAULT_DEMOTION_TARGET_NODES
-#endif
-
-struct demotion_nodes {
-	unsigned short nr;
-	short nodes[DEMOTION_TARGET_NODES];
-};
-
-static struct demotion_nodes *node_demotion __read_mostly;
-
 /**
  * next_demotion_node() - Get the next node in the demotion path
  * @node: The starting node to lookup the next node
@@ -2409,8 +2431,7 @@  static struct demotion_nodes *node_demotion __read_mostly;
 int next_demotion_node(int node)
 {
 	struct demotion_nodes *nd;
-	unsigned short target_nr, index;
-	int target;
+	int target, nnodes, i;
 
 	if (!node_demotion)
 		return NUMA_NO_NODE;
@@ -2419,61 +2440,46 @@  int next_demotion_node(int node)
 
 	/*
 	 * node_demotion[] is updated without excluding this
-	 * function from running.  RCU doesn't provide any
-	 * compiler barriers, so the READ_ONCE() is required
-	 * to avoid compiler reordering or read merging.
+	 * function from running.
 	 *
 	 * Make sure to use RCU over entire code blocks if
 	 * node_demotion[] reads need to be consistent.
 	 */
 	rcu_read_lock();
-	target_nr = READ_ONCE(nd->nr);
 
-	switch (target_nr) {
-	case 0:
-		target = NUMA_NO_NODE;
-		goto out;
-	case 1:
-		index = 0;
-		break;
-	default:
-		/*
-		 * If there are multiple target nodes, just select one
-		 * target node randomly.
-		 *
-		 * In addition, we can also use round-robin to select
-		 * target node, but we should introduce another variable
-		 * for node_demotion[] to record last selected target node,
-		 * that may cause cache ping-pong due to the changing of
-		 * last target node. Or introducing per-cpu data to avoid
-		 * caching issue, which seems more complicated. So selecting
-		 * target node randomly seems better until now.
-		 */
-		index = get_random_int() % target_nr;
-		break;
-	}
+	nnodes = nodes_weight(nd->preferred);
+	if (!nnodes)
+		return NUMA_NO_NODE;
 
-	target = READ_ONCE(nd->nodes[index]);
+	/*
+	 * If there are multiple target nodes, just select one
+	 * target node randomly.
+	 *
+	 * In addition, we can also use round-robin to select
+	 * target node, but we should introduce another variable
+	 * for node_demotion[] to record last selected target node,
+	 * that may cause cache ping-pong due to the changing of
+	 * last target node. Or introducing per-cpu data to avoid
+	 * caching issue, which seems more complicated. So selecting
+	 * target node randomly seems better until now.
+	 */
+	nnodes = get_random_int() % nnodes;
+	target = first_node(nd->preferred);
+	for (i = 0; i < nnodes; i++)
+		target = next_node(target, nd->preferred);
 
-out:
 	rcu_read_unlock();
+
 	return target;
 }
 
-#if defined(CONFIG_HOTPLUG_CPU)
 /* Disable reclaim-based migration. */
 static void __disable_all_migrate_targets(void)
 {
-	int node, i;
+	int node;
 
-	if (!node_demotion)
-		return;
-
-	for_each_online_node(node) {
-		node_demotion[node].nr = 0;
-		for (i = 0; i < DEMOTION_TARGET_NODES; i++)
-			node_demotion[node].nodes[i] = NUMA_NO_NODE;
-	}
+	for_each_node_mask(node, node_states[N_MEMORY])
+		node_demotion[node].preferred = NODE_MASK_NONE;
 }
 
 static void disable_all_migrate_targets(void)
@@ -2485,173 +2491,70 @@  static void disable_all_migrate_targets(void)
 	 * Readers will see either a combination of before+disable
 	 * state or disable+after.  They will never see before and
 	 * after state together.
-	 *
-	 * The before+after state together might have cycles and
-	 * could cause readers to do things like loop until this
-	 * function finishes.  This ensures they can only see a
-	 * single "bad" read and would, for instance, only loop
-	 * once.
 	 */
 	synchronize_rcu();
 }
 
 /*
- * Find an automatic demotion target for 'node'.
- * Failing here is OK.  It might just indicate
- * being at the end of a chain.
- */
-static int establish_migrate_target(int node, nodemask_t *used,
-				    int best_distance)
+* Find an automatic demotion target for all memory
+* nodes. Failing here is OK.  It might just indicate
+* being at the end of a chain.
+*/
+static void establish_migration_targets(void)
 {
-	int migration_target, index, val;
 	struct demotion_nodes *nd;
+	int tier, target = NUMA_NO_NODE, node;
+	int distance, best_distance;
+	nodemask_t used;
 
 	if (!node_demotion)
-		return NUMA_NO_NODE;
-
-	nd = &node_demotion[node];
-
-	migration_target = find_next_best_node(node, used);
-	if (migration_target == NUMA_NO_NODE)
-		return NUMA_NO_NODE;
-
-	/*
-	 * If the node has been set a migration target node before,
-	 * which means it's the best distance between them. Still
-	 * check if this node can be demoted to other target nodes
-	 * if they have a same best distance.
-	 */
-	if (best_distance != -1) {
-		val = node_distance(node, migration_target);
-		if (val > best_distance)
-			goto out_clear;
-	}
-
-	index = nd->nr;
-	if (WARN_ONCE(index >= DEMOTION_TARGET_NODES,
-		      "Exceeds maximum demotion target nodes\n"))
-		goto out_clear;
-
-	nd->nodes[index] = migration_target;
-	nd->nr++;
+		return;
 
-	return migration_target;
-out_clear:
-	node_clear(migration_target, *used);
-	return NUMA_NO_NODE;
-}
+	disable_all_migrate_targets();
 
-/*
- * When memory fills up on a node, memory contents can be
- * automatically migrated to another node instead of
- * discarded at reclaim.
- *
- * Establish a "migration path" which will start at nodes
- * with CPUs and will follow the priorities used to build the
- * page allocator zonelists.
- *
- * The difference here is that cycles must be avoided.  If
- * node0 migrates to node1, then neither node1, nor anything
- * node1 migrates to can migrate to node0. Also one node can
- * be migrated to multiple nodes if the target nodes all have
- * a same best-distance against the source node.
- *
- * This function can run simultaneously with readers of
- * node_demotion[].  However, it can not run simultaneously
- * with itself.  Exclusion is provided by memory hotplug events
- * being single-threaded.
- */
-static void __set_migration_target_nodes(void)
-{
-	nodemask_t next_pass	= NODE_MASK_NONE;
-	nodemask_t this_pass	= NODE_MASK_NONE;
-	nodemask_t used_targets = NODE_MASK_NONE;
-	int node, best_distance;
+	for_each_node_mask(node, node_states[N_MEMORY]) {
+		best_distance = -1;
+		nd = &node_demotion[node];
 
-	/*
-	 * Avoid any oddities like cycles that could occur
-	 * from changes in the topology.  This will leave
-	 * a momentary gap when migration is disabled.
-	 */
-	disable_all_migrate_targets();
+		tier = __node_get_memory_tier(node);
+		/*
+		 * Find next tier to demote.
+		 */
+		while (++tier < MAX_MEMORY_TIERS) {
+			if (memory_tiers[tier])
+				break;
+		}
 
-	/*
-	 * Allocations go close to CPUs, first.  Assume that
-	 * the migration path starts at the nodes with CPUs.
-	 */
-	next_pass = node_states[N_CPU];
-again:
-	this_pass = next_pass;
-	next_pass = NODE_MASK_NONE;
-	/*
-	 * To avoid cycles in the migration "graph", ensure
-	 * that migration sources are not future targets by
-	 * setting them in 'used_targets'.  Do this only
-	 * once per pass so that multiple source nodes can
-	 * share a target node.
-	 *
-	 * 'used_targets' will become unavailable in future
-	 * passes.  This limits some opportunities for
-	 * multiple source nodes to share a destination.
-	 */
-	nodes_or(used_targets, used_targets, this_pass);
+		if (tier >= MAX_MEMORY_TIERS)
+			continue;
 
-	for_each_node_mask(node, this_pass) {
-		best_distance = -1;
+		nodes_andnot(used, node_states[N_MEMORY], memory_tiers[tier]->nodelist);
 
 		/*
-		 * Try to set up the migration path for the node, and the target
-		 * migration nodes can be multiple, so doing a loop to find all
-		 * the target nodes if they all have a best node distance.
+		 * Find all the nodes in the memory tier node list of same best distance.
+		 * add add them to the preferred mask. We randomly select between nodes
+		 * in the preferred mask when allocating pages during demotion.
 		 */
 		do {
-			int target_node =
-				establish_migrate_target(node, &used_targets,
-							 best_distance);
-
-			if (target_node == NUMA_NO_NODE)
+			target = find_next_best_node(node, &used);
+			if (target == NUMA_NO_NODE)
 				break;
 
-			if (best_distance == -1)
-				best_distance = node_distance(node, target_node);
-
-			/*
-			 * Visit targets from this pass in the next pass.
-			 * Eventually, every node will have been part of
-			 * a pass, and will become set in 'used_targets'.
-			 */
-			node_set(target_node, next_pass);
+			distance = node_distance(node, target);
+			if (distance == best_distance || best_distance == -1) {
+				best_distance = distance;
+				node_set(target, nd->preferred);
+			} else {
+				break;
+			}
 		} while (1);
 	}
-	/*
-	 * 'next_pass' contains nodes which became migration
-	 * targets in this pass.  Make additional passes until
-	 * no more migrations targets are available.
-	 */
-	if (!nodes_empty(next_pass))
-		goto again;
 }
 
 /*
- * For callers that do not hold get_online_mems() already.
- */
-void set_migration_target_nodes(void)
-{
-	get_online_mems();
-	__set_migration_target_nodes();
-	put_online_mems();
-}
-
-/*
- * This leaves migrate-on-reclaim transiently disabled between
- * the MEM_GOING_OFFLINE and MEM_OFFLINE events.  This runs
- * whether reclaim-based migration is enabled or not, which
- * ensures that the user can turn reclaim-based migration at
- * any time without needing to recalculate migration targets.
- *
- * These callbacks already hold get_online_mems().  That is why
- * __set_migration_target_nodes() can be used as opposed to
- * set_migration_target_nodes().
+ * This runs whether reclaim-based migration is enabled or not,
+ * which ensures that the user can turn reclaim-based migration
+ * at any time without needing to recalculate migration targets.
  */
 static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
 						 unsigned long action, void *_arg)
@@ -2660,64 +2563,44 @@  static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
 
 	/*
 	 * Only update the node migration order when a node is
-	 * changing status, like online->offline.  This avoids
-	 * the overhead of synchronize_rcu() in most cases.
+	 * changing status, like online->offline.
 	 */
 	if (arg->status_change_nid < 0)
 		return notifier_from_errno(0);
 
 	switch (action) {
-	case MEM_GOING_OFFLINE:
-		/*
-		 * Make sure there are not transient states where
-		 * an offline node is a migration target.  This
-		 * will leave migration disabled until the offline
-		 * completes and the MEM_OFFLINE case below runs.
-		 */
-		disable_all_migrate_targets();
-		break;
 	case MEM_OFFLINE:
-	case MEM_ONLINE:
 		/*
-		 * Recalculate the target nodes once the node
-		 * reaches its final state (online or offline).
+		 * In case we are moving out of N_MEMORY. Keep the node
+		 * in the memory tier so that when we bring memory online,
+		 * they appear in the right memory tier. We still need
+		 * to rebuild the demotion order.
 		 */
-		__set_migration_target_nodes();
+		mutex_lock(&memory_tier_lock);
+		establish_migration_targets();
+		mutex_unlock(&memory_tier_lock);
 		break;
-	case MEM_CANCEL_OFFLINE:
+	case MEM_ONLINE:
 		/*
-		 * MEM_GOING_OFFLINE disabled all the migration
-		 * targets.  Reenable them.
+		 * We ignore the error here, if the node already have the tier
+		 * registered, we will continue to use that for the new memory
+		 * we are adding here.
 		 */
-		__set_migration_target_nodes();
-		break;
-	case MEM_GOING_ONLINE:
-	case MEM_CANCEL_ONLINE:
+		node_set_memory_tier(arg->status_change_nid, DEFAULT_MEMORY_TIER);
 		break;
 	}
 
 	return notifier_from_errno(0);
 }
 
-void __init migrate_on_reclaim_init(void)
+static void __init migrate_on_reclaim_init(void)
 {
-	node_demotion = kmalloc_array(nr_node_ids,
-				      sizeof(struct demotion_nodes),
-				      GFP_KERNEL);
+	node_demotion = kcalloc(MAX_NUMNODES, sizeof(struct demotion_nodes),
+				GFP_KERNEL);
 	WARN_ON(!node_demotion);
 
 	hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
-	/*
-	 * At this point, all numa nodes with memory/CPus have their state
-	 * properly set, so we can build the demotion order now.
-	 * Let us hold the cpu_hotplug lock just, as we could possibily have
-	 * CPU hotplug events during boot.
-	 */
-	cpus_read_lock();
-	set_migration_target_nodes();
-	cpus_read_unlock();
 }
-#endif /* CONFIG_HOTPLUG_CPU */
 
 bool numa_demotion_enabled = false;
 
@@ -2800,6 +2683,7 @@  static int __init memory_tier_init(void)
 	 * CPU only nodes are not part of memoty tiers.
 	 */
 	memory_tiers[DEFAULT_MEMORY_TIER]->nodelist = node_states[N_MEMORY];
+	migrate_on_reclaim_init();
 
 	return 0;
 }
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b75b1a64b54c..7815d21345a4 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -2053,7 +2053,6 @@  static int vmstat_cpu_online(unsigned int cpu)
 
 	if (!node_state(cpu_to_node(cpu), N_CPU)) {
 		node_set_state(cpu_to_node(cpu), N_CPU);
-		set_migration_target_nodes();
 	}
 
 	return 0;
@@ -2078,7 +2077,6 @@  static int vmstat_cpu_dead(unsigned int cpu)
 		return 0;
 
 	node_clear_state(node, N_CPU);
-	set_migration_target_nodes();
 
 	return 0;
 }
@@ -2111,9 +2109,6 @@  void __init init_mm_internals(void)
 
 	start_shepherd_timer();
 #endif
-#if defined(CONFIG_MIGRATION) && defined(CONFIG_HOTPLUG_CPU)
-	migrate_on_reclaim_init();
-#endif
 #ifdef CONFIG_PROC_FS
 	proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
 	proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);