@@ -52,6 +52,73 @@ module_param(memmap_on_memory, bool, 044
MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
#endif
+enum {
+ ONLINE_POLICY_CONTIG_ZONES = 0,
+ ONLINE_POLICY_AUTO_MOVABLE,
+};
+
+const char *online_policy_to_str[] = {
+ [ONLINE_POLICY_CONTIG_ZONES] = "contig-zones",
+ [ONLINE_POLICY_AUTO_MOVABLE] = "auto-movable",
+};
+
+static int set_online_policy(const char *val, const struct kernel_param *kp)
+{
+ int ret = sysfs_match_string(online_policy_to_str, val);
+
+ if (ret < 0)
+ return ret;
+ *((int *)kp->arg) = ret;
+ return 0;
+}
+
+static int get_online_policy(char *buffer, const struct kernel_param *kp)
+{
+ return sprintf(buffer, "%s\n", online_policy_to_str[*((int *)kp->arg)]);
+}
+
+/*
+ * memory_hotplug.online_policy: configure online behavior when onlining without
+ * specifying a zone (MMOP_ONLINE)
+ *
+ * "contig-zones": keep zone contiguous
+ * "auto-movable": online memory to ZONE_MOVABLE if the configuration
+ * (auto_movable_ratio, auto_movable_numa_aware) allows for it
+ */
+static int online_policy __read_mostly = ONLINE_POLICY_CONTIG_ZONES;
+static const struct kernel_param_ops online_policy_ops = {
+ .set = set_online_policy,
+ .get = get_online_policy,
+};
+module_param_cb(online_policy, &online_policy_ops, &online_policy, 0644);
+MODULE_PARM_DESC(online_policy,
+ "Set the online policy (\"contig-zones\", \"auto-movable\") "
+ "Default: \"contig-zones\"");
+
+/*
+ * memory_hotplug.auto_movable_ratio: specify maximum MOVABLE:KERNEL ratio
+ *
+ * The ratio represent an upper limit and the kernel might decide to not
+ * online some memory to ZONE_MOVABLE -- e.g., because hotplugged KERNEL memory
+ * doesn't allow for more MOVABLE memory.
+ */
+static unsigned int auto_movable_ratio __read_mostly = 301;
+module_param(auto_movable_ratio, uint, 0644);
+MODULE_PARM_DESC(auto_movable_ratio,
+ "Set the maximum ratio of MOVABLE:KERNEL memory in the system "
+ "in percent for \"auto-movable\" online policy. Default: 301");
+
+/*
+ * memory_hotplug.auto_movable_numa_aware: consider numa node stats
+ */
+#ifdef CONFIG_NUMA
+static bool auto_movable_numa_aware __read_mostly = true;
+module_param(auto_movable_numa_aware, bool, 0644);
+MODULE_PARM_DESC(auto_movable_numa_aware,
+ "Consider numa node stats in addition to global stats in "
+ "\"auto-movable\" online policy. Default: true");
+#endif /* CONFIG_NUMA */
+
/*
* online_page_callback contains pointer to current page onlining function.
* Initially it is generic_online_page(). If it is required it could be
@@ -663,6 +730,61 @@ void __ref move_pfn_range_to_zone(struct
set_zone_contiguous(zone);
}
+struct auto_movable_stats {
+ unsigned long kernel_early_pages;
+ unsigned long movable_pages;
+};
+
+static void auto_movable_stats_account_zone(struct auto_movable_stats *stats,
+ struct zone *zone)
+{
+ if (zone_idx(zone) == ZONE_MOVABLE) {
+ stats->movable_pages += zone->present_pages;
+ } else {
+ stats->kernel_early_pages += zone->present_early_pages;
+#ifdef CONFIG_CMA
+ /*
+ * CMA pages (never on hotplugged memory) behave like
+ * ZONE_MOVABLE.
+ */
+ stats->movable_pages += zone->cma_pages;
+ stats->kernel_early_pages -= zone->cma_pages;
+#endif /* CONFIG_CMA */
+ }
+}
+
+static bool auto_movable_can_online_movable(int nid, unsigned long nr_pages)
+{
+ struct auto_movable_stats stats = {};
+ unsigned long kernel_early_pages, movable_pages;
+ pg_data_t *pgdat = NODE_DATA(nid);
+ struct zone *zone;
+ int i;
+
+ /* Walk all relevant zones and collect MOVABLE vs. KERNEL stats. */
+ if (nid == NUMA_NO_NODE) {
+ /* TODO: cache values */
+ for_each_populated_zone(zone)
+ auto_movable_stats_account_zone(&stats, zone);
+ } else {
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ zone = pgdat->node_zones + i;
+ if (populated_zone(zone))
+ auto_movable_stats_account_zone(&stats, zone);
+ }
+ }
+
+ kernel_early_pages = stats.kernel_early_pages;
+ movable_pages = stats.movable_pages;
+
+ /*
+ * Test if we could online the given number of pages to ZONE_MOVABLE
+ * and still stay in the configured ratio.
+ */
+ movable_pages += nr_pages;
+ return movable_pages <= (auto_movable_ratio * kernel_early_pages) / 100;
+}
+
/*
* Returns a default kernel memory zone for the given pfn range.
* If no kernel zone covers this pfn range it will automatically go
@@ -684,6 +806,72 @@ static struct zone *default_kernel_zone_
return &pgdat->node_zones[ZONE_NORMAL];
}
+/*
+ * Determine to which zone to online memory dynamically based on user
+ * configuration and system stats. We care about the following ratio:
+ *
+ * MOVABLE : KERNEL
+ *
+ * Whereby MOVABLE is memory in ZONE_MOVABLE and KERNEL is memory in
+ * one of the kernel zones. CMA pages inside one of the kernel zones really
+ * behaves like ZONE_MOVABLE, so we treat them accordingly.
+ *
+ * We don't allow for hotplugged memory in a KERNEL zone to increase the
+ * amount of MOVABLE memory we can have, so we end up with:
+ *
+ * MOVABLE : KERNEL_EARLY
+ *
+ * Whereby KERNEL_EARLY is memory in one of the kernel zones, available sinze
+ * boot. We base our calculation on KERNEL_EARLY internally, because:
+ *
+ * a) Hotplugged memory in one of the kernel zones can sometimes still get
+ * hotunplugged, especially when hot(un)plugging individual memory blocks.
+ * There is no coordination across memory devices, therefore "automatic"
+ * hotunplugging, as implemented in hypervisors, could result in zone
+ * imbalances.
+ * b) Early/boot memory in one of the kernel zones can usually not get
+ * hotunplugged again (e.g., no firmware interface to unplug, fragmented
+ * with unmovable allocations). While there are corner cases where it might
+ * still work, it is barely relevant in practice.
+ *
+ * We rely on "present pages" instead of "managed pages", as the latter is
+ * highly unreliable and dynamic in virtualized environments, and does not
+ * consider boot time allocations. For example, memory ballooning adjusts the
+ * managed pages when inflating/deflating the balloon, and balloon compaction
+ * can even migrate inflated pages between zones.
+ *
+ * Using "present pages" is better but some things to keep in mind are:
+ *
+ * a) Some memblock allocations, such as for the crashkernel area, are
+ * effectively unused by the kernel, yet they account to "present pages".
+ * Fortunately, these allocations are comparatively small in relevant setups
+ * (e.g., fraction of system memory).
+ * b) Some hotplugged memory blocks in virtualized environments, esecially
+ * hotplugged by virtio-mem, look like they are completely present, however,
+ * only parts of the memory block are actually currently usable.
+ * "present pages" is an upper limit that can get reached at runtime. As
+ * we base our calculations on KERNEL_EARLY, this is not an issue.
+ */
+static struct zone *auto_movable_zone_for_pfn(int nid, unsigned long pfn,
+ unsigned long nr_pages)
+{
+ if (!auto_movable_ratio)
+ goto kernel_zone;
+
+ if (!auto_movable_can_online_movable(NUMA_NO_NODE, nr_pages))
+ goto kernel_zone;
+
+#ifdef CONFIG_NUMA
+ if (auto_movable_numa_aware &&
+ !auto_movable_can_online_movable(nid, nr_pages))
+ goto kernel_zone;
+#endif /* CONFIG_NUMA */
+
+ return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
+kernel_zone:
+ return default_kernel_zone_for_pfn(nid, pfn, nr_pages);
+}
+
static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
unsigned long nr_pages)
{
@@ -717,6 +905,9 @@ struct zone *zone_for_pfn_range(int onli
if (online_type == MMOP_ONLINE_MOVABLE)
return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
+ if (online_policy == ONLINE_POLICY_AUTO_MOVABLE)
+ return auto_movable_zone_for_pfn(nid, start_pfn, nr_pages);
+
return default_zone_for_pfn(nid, start_pfn, nr_pages);
}