@@ -49,6 +49,7 @@
#include <linux/sched/mm.h>
#include <linux/ptrace.h>
#include <linux/oom.h>
+#include <linux/memory.h>
#include <asm/tlbflush.h>
@@ -1171,8 +1172,12 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
*/
/*
- * Writes to this array occur without locking. READ_ONCE()
- * is recommended for readers to ensure consistent reads.
+ * Writes to this array occur without locking. Cycles are
+ * not allowed: Node X demotes to Y which demotes to X...
+ *
+ * If multiple reads are performed, a single rcu_read_lock()
+ * must be held over all reads to ensure that no cycles are
+ * observed.
*/
static int node_demotion[MAX_NUMNODES] __read_mostly =
{[0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE};
@@ -1188,13 +1193,22 @@ static int node_demotion[MAX_NUMNODES] __read_mostly =
*/
int next_demotion_node(int node)
{
+ int target;
+
/*
- * node_demotion[] is updated without excluding
- * this function from running. READ_ONCE() avoids
- * reading multiple, inconsistent 'node' values
- * during an update.
+ * node_demotion[] is updated without excluding this
+ * function from running. RCU doesn't provide any
+ * compiler barriers, so the READ_ONCE() is required
+ * to avoid compiler reordering or read merging.
+ *
+ * This RCU section covers only the single read below;
+ * multi-read callers must hold rcu_read_lock() themselves.
*/
- return READ_ONCE(node_demotion[node]);
+ rcu_read_lock();
+ target = READ_ONCE(node_demotion[node]);
+ rcu_read_unlock();
+
+ return target;
}
/*
@@ -3189,8 +3203,9 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
EXPORT_SYMBOL(migrate_vma_finalize);
#endif /* CONFIG_DEVICE_PRIVATE */
+#if defined(CONFIG_MEMORY_HOTPLUG)
/* Disable reclaim-based migration. */
-static void disable_all_migrate_targets(void)
+static void __disable_all_migrate_targets(void)
{
int node;
@@ -3198,6 +3213,25 @@ static void disable_all_migrate_targets(void)
node_demotion[node] = NUMA_NO_NODE;
}
+static void disable_all_migrate_targets(void)
+{
+ __disable_all_migrate_targets();
+
+ /*
+ * Wait until readers that started before the "disable"
+ * above have left their rcu_read_lock() sections.
+ * Readers will see either a combination of before+disable
+ * state or disable+after. They will never see before and
+ * after state together.
+ *
+ * The before+after state together might have cycles and
+ * could cause readers to loop until the update finishes.
+ * This ensures they can only see a single "bad" read and
+ * would, for instance, only loop once.
+ */
+ synchronize_rcu();
+}
+
/*
* Find an automatic demotion target for 'node'.
* Failing here is OK. It might just indicate
@@ -3259,20 +3293,6 @@ static void __set_migration_target_nodes(void)
*/
disable_all_migrate_targets();
- /*
- * Ensure that the "disable" is visible across the system.
- * Readers will see either a combination of before+disable
- * state or disable+after. They will never see before and
- * after state together.
- *
- * The before+after state together might have cycles and
- * could cause readers to do things like loop until this
- * function finishes. This ensures they can only see a
- * single "bad" read and would, for instance, only loop
- * once.
- */
- smp_wmb();
-
/*
* Allocations go close to CPUs, first. Assume that
* the migration path starts at the nodes with CPUs.
@@ -3310,10 +3330,96 @@ static void __set_migration_target_nodes(void)
/*
* For callers that do not hold get_online_mems() already.
*/
-__maybe_unused // <- temporay to prevent warnings during bisects
static void set_migration_target_nodes(void)
{
get_online_mems();
__set_migration_target_nodes();
put_online_mems();
}
+
+/*
+ * React to hotplug events that might affect the migration targets,
+ * like events that online or offline NUMA nodes.
+ *
+ * The ordering also depends on which nodes have CPUs, so both CPU
+ * callbacks ignore 'cpu', recompute the targets and return 0.
+ */
+static int migration_online_cpu(unsigned int cpu)
+{
+ set_migration_target_nodes();
+ return 0;
+}
+
+static int migration_offline_cpu(unsigned int cpu)
+{
+ set_migration_target_nodes();
+ return 0;
+}
+
+/*
+ * This leaves migrate-on-reclaim transiently disabled between
+ * the MEM_GOING_OFFLINE and MEM_OFFLINE events. This runs
+ * whether reclaim-based migration is enabled or not, which
+ * ensures that the user can turn on reclaim-based migration at
+ * any time without needing to recalculate migration targets.
+ *
+ * This callback already holds get_online_mems(). That is why
+ * __set_migration_target_nodes() can be used as opposed to
+ * set_migration_target_nodes(). Returns notifier_from_errno(0).
+ */
+static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ switch (action) {
+ case MEM_GOING_OFFLINE:
+ /*
+ * Make sure there are no transient states where
+ * an offline node is a migration target. This
+ * will leave migration disabled until the offline
+ * completes and the MEM_OFFLINE case below runs.
+ */
+ disable_all_migrate_targets();
+ break;
+ case MEM_OFFLINE:
+ case MEM_ONLINE:
+ /*
+ * Recalculate the target nodes once the node
+ * reaches its final state (online or offline).
+ */
+ __set_migration_target_nodes();
+ break;
+ case MEM_CANCEL_OFFLINE:
+ /*
+ * MEM_GOING_OFFLINE disabled all the migration
+ * targets. Re-enable them by recalculating.
+ */
+ __set_migration_target_nodes();
+ break;
+ case MEM_GOING_ONLINE:
+ case MEM_CANCEL_ONLINE:
+ break; /* nothing to do for these events */
+ }
+
+ return notifier_from_errno(0);
+}
+
+static int __init migrate_on_reclaim_init(void)
+{
+ int ret;
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "migrate on reclaim",
+ migration_online_cpu,
+ migration_offline_cpu);
+ /*
+ * In the unlikely case that registering the CPU callbacks
+ * fails, the automatic migration targets may become suboptimal
+ * for nodes where N_CPU changes. With such a small impact in
+ * a rare case, only warn and still register for memory events.
+ */
+ WARN_ON(ret < 0);
+
+ hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
+ return 0;
+}
+late_initcall(migrate_on_reclaim_init);
+#endif /* CONFIG_MEMORY_HOTPLUG */