--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -86,6 +86,7 @@ static DEFINE_XARRAY(memory_blocks);
* Memory groups, indexed by memory group id (mgid).
*/
static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC);
+#define MEMORY_GROUP_MARK_DYNAMIC XA_MARK_1
static BLOCKING_NOTIFIER_HEAD(memory_chain);
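The mark used above is generic XArray functionality: each entry in an XArray
can carry up to three per-entry mark bits (XA_MARK_0..XA_MARK_2), and
iteration can be restricted to marked entries. A minimal sketch of the
pattern, assuming hypothetical names (struct my_item, my_items and
do_something are placeholders, not kernel APIs):

	struct my_item;					/* placeholder type */
	extern void do_something(struct my_item *item);	/* placeholder */

	/* Sketch only: tag selected entries and walk just those. */
	static DEFINE_XARRAY_FLAGS(my_items, XA_FLAGS_ALLOC);

	static int my_item_add(struct my_item *item, bool special)
	{
		u32 id;
		int ret;

		/* Store the entry at a newly allocated index. */
		ret = xa_alloc(&my_items, &id, item, xa_limit_31b, GFP_KERNEL);
		if (ret)
			return ret;
		/* Tag it so walkers can skip all untagged entries. */
		if (special)
			xa_set_mark(&my_items, id, XA_MARK_1);
		return id;
	}

	static void my_items_walk_special(void)
	{
		struct my_item *item;
		unsigned long index;

		/* Visits only entries tagged with XA_MARK_1. */
		xa_for_each_marked(&my_items, index, item, XA_MARK_1)
			do_something(item);
	}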
@@ -939,6 +940,8 @@ static int memory_group_register(struct
if (ret) {
kfree(new_group);
return ret;
+ } else if (group.is_dynamic) {
+ xa_set_mark(&memory_groups, mgid, MEMORY_GROUP_MARK_DYNAMIC);
}
return mgid;
}
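For context on who sets is_dynamic: a driver that manages a single memory
device in unit_pages granularity (virtio-mem is the in-tree user) registers
one dynamic group per device and then adds its blocks against the returned
mgid. A hedged sketch of that caller side, assuming the group registration
and MHP_NID_IS_MGID plumbing from the earlier patches in this series
(my_dev_add_memory is a made-up name; error handling abbreviated):

	static int my_dev_add_memory(int nid, unsigned long unit_pages,
				     u64 addr, u64 size, const char *res_name)
	{
		int mgid, rc;

		/* One dynamic group per memory device. */
		mgid = memory_group_register_dynamic(nid, unit_pages);
		if (mgid < 0)
			return mgid;

		/* Blocks are added against the group id instead of a nid. */
		rc = add_memory_driver_managed(mgid, addr, size, res_name,
					       MHP_NID_IS_MGID);
		if (rc)
			memory_group_unregister(mgid);
		return rc;
	}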
@@ -1044,3 +1047,30 @@ struct memory_group *memory_group_find_b
{
return xa_load(&memory_groups, mgid);
}
+
+/*
+ * This is an internal helper only to be used in core memory hotplug code to
+ * walk all dynamic memory groups, excluding a given memory group: with a
+ * proper nid, only groups belonging to that node are walked; with
+ * NUMA_NO_NODE, groups belonging to any node are walked.
+ */
+int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
+ struct memory_group *excluded, void *arg)
+{
+ struct memory_group *group;
+ unsigned long index;
+ int ret = 0;
+
+ xa_for_each_marked(&memory_groups, index, group,
+ MEMORY_GROUP_MARK_DYNAMIC) {
+ if (group == excluded)
+ continue;
+#ifdef CONFIG_NUMA
+ if (nid != NUMA_NO_NODE && group->nid != nid)
+ continue;
+#endif /* CONFIG_NUMA */
+ ret = func(group, arg);
+ if (ret)
+ break;
+ }
+ return ret;
+}
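walk_dynamic_memory_groups() follows the usual callback-plus-cookie shape:
func may stop the walk early by returning nonzero, and that value is
propagated to the caller. A hypothetical callback (not part of this patch,
and assuming the present_movable_pages tracking added earlier in the series)
that totals the movable pages of all other dynamic groups on a node:

	/* Hypothetical: sum present movable pages of the walked groups. */
	static int count_movable_pages(struct memory_group *group, void *arg)
	{
		unsigned long *total = arg;

		*total += group->present_movable_pages;
		return 0;	/* nonzero would abort the walk */
	}

	static unsigned long other_groups_movable_pages(int nid,
							struct memory_group *self)
	{
		unsigned long total = 0;

		walk_dynamic_memory_groups(nid, count_movable_pages, self, &total);
		return total;
	}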
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -146,6 +146,9 @@ extern int memory_group_register_static(
extern int memory_group_register_dynamic(int nid, unsigned long unit_pages);
extern int memory_group_unregister(int mgid);
struct memory_group *memory_group_find_by_id(int mgid);
+typedef int (*walk_memory_groups_func_t)(struct memory_group *, void *);
+int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
+ struct memory_group *excluded, void *arg);
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
#ifdef CONFIG_MEMORY_HOTPLUG
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -752,11 +752,44 @@ static void auto_movable_stats_account_z
#endif /* CONFIG_CMA */
}
}
+struct auto_movable_group_stats {
+ unsigned long movable_pages;
+ unsigned long req_kernel_early_pages;
+};
 
-static bool auto_movable_can_online_movable(int nid, unsigned long nr_pages)
+static int auto_movable_stats_account_group(struct memory_group *group,
+ void *arg)
+{
+ const int ratio = READ_ONCE(auto_movable_ratio);
+ struct auto_movable_group_stats *stats = arg;
+ long pages;
+
+ /*
+ * We don't support modifying the config while the auto-movable online
+ * policy is already enabled. Just avoid the division by zero below.
+ */
+ if (!ratio)
+ return 0;
+
+ /*
+ * Calculate how many early kernel pages this group requires to
+ * satisfy the configured zone ratio.
+ */
+ pages = group->present_movable_pages * 100 / ratio;
+ pages -= group->present_kernel_pages;
+
+ if (pages > 0)
+ stats->req_kernel_early_pages += pages;
+ stats->movable_pages += group->present_movable_pages;
+ return 0;
+}
+
+static bool auto_movable_can_online_movable(int nid, struct memory_group *group,
+ unsigned long nr_pages)
{
- struct auto_movable_stats stats = {};
unsigned long kernel_early_pages, movable_pages;
+ struct auto_movable_group_stats group_stats = {};
+ struct auto_movable_stats stats = {};
pg_data_t *pgdat = NODE_DATA(nid);
struct zone *zone;
int i;
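To make the arithmetic in auto_movable_stats_account_group() concrete
(illustrative numbers; the default auto_movable_ratio of 301 permits up to
301 movable pages per 100 kernel pages): a dynamic group with 30100 present
movable pages requires 30100 * 100 / 301 = 10000 kernel pages. If the group
itself already spans 4000 kernel pages, only the remaining 6000 are charged
to req_kernel_early_pages; if it spans 10000 or more, nothing is charged.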
@@ -778,6 +811,21 @@ static bool auto_movable_can_online_mova
movable_pages = stats.movable_pages;
/*
+ * Kernel memory inside dynamic memory groups allows for more MOVABLE
+ * memory within the same group. Remove the effect of all but the
+ * current group from the stats.
+ */
+ walk_dynamic_memory_groups(nid, auto_movable_stats_account_group,
+ group, &group_stats);
+ if (kernel_early_pages <= group_stats.req_kernel_early_pages)
+ return false;
+ kernel_early_pages -= group_stats.req_kernel_early_pages;
+ movable_pages -= group_stats.movable_pages;
+
+ if (group && group->is_dynamic)
+ kernel_early_pages += group->present_kernel_pages;
+
+ /*
* Test if we could online the given number of pages to ZONE_MOVABLE
* and still stay in the configured ratio.
*/
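Putting the hunk above together, the decision reduces to integer arithmetic
that can be modeled outside the kernel. The following self-contained program
is a simplified sketch under stated assumptions (CMA is ignored, the per-node
stats are plain inputs rather than zone walks, and the 301 default ratio is
hard-coded in the demo); it mirrors, but is not, the kernel routine:

	#include <stdbool.h>
	#include <stdio.h>

	/* Simplified model of a dynamic memory group (illustrative only). */
	struct grp {
		unsigned long movable;	/* present_movable_pages */
		unsigned long kernel;	/* present_kernel_pages  */
	};

	/*
	 * Mirrors the arithmetic of auto_movable_can_online_movable():
	 * kernel_early/movable are node-wide stats, others[] are all dynamic
	 * groups except the one currently being onlined (self).
	 */
	static bool can_online_movable(unsigned long kernel_early,
				       unsigned long movable,
				       const struct grp *others, int nr_others,
				       const struct grp *self,
				       unsigned long nr_pages,
				       unsigned int ratio)
	{
		unsigned long req = 0;
		int i;

		for (i = 0; i < nr_others; i++) {
			long pages = others[i].movable * 100 / ratio;

			pages -= others[i].kernel;
			if (pages > 0)
				req += pages;
			movable -= others[i].movable;
		}
		if (kernel_early <= req)
			return false;
		kernel_early -= req;

		/* Kernel memory in the current dynamic group counts for it. */
		if (self)
			kernel_early += self->kernel;

		return nr_pages + movable <= kernel_early * ratio / 100;
	}

	int main(void)
	{
		/* Another device that already satisfies its own ratio. */
		struct grp other = { .movable = 77056, .kernel = 25600 };
		struct grp self = { .movable = 0, .kernel = 8192 };

		/* Node: 1 GiB early kernel memory (4 KiB pages), onlining 128 MiB. */
		printf("%s\n", can_online_movable(262144, 77056, &other, 1,
						  &self, 32768, 301) ?
		       "ZONE_MOVABLE" : "kernel zone");
		return 0;
	}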
@@ -834,6 +882,10 @@ static struct zone *default_kernel_zone_
* with unmovable allocations). While there are corner cases where it might
* still work, it is barely relevant in practice.
*
+ * Exceptions are dynamic memory groups, which allow for more MOVABLE
+ * memory within the same memory group -- because in that case, there is
+ * coordination within the single memory device managed by a single driver.
+ *
* We rely on "present pages" instead of "managed pages", as the latter is
* highly unreliable and dynamic in virtualized environments, and does not
* consider boot time allocations. For example, memory ballooning adjusts the
@@ -899,12 +951,12 @@ static struct zone *auto_movable_zone_fo
* nobody interferes, all will be MOVABLE if possible.
*/
nr_pages = max_pages - online_pages;
- if (!auto_movable_can_online_movable(NUMA_NO_NODE, nr_pages))
+ if (!auto_movable_can_online_movable(NUMA_NO_NODE, group, nr_pages))
goto kernel_zone;
#ifdef CONFIG_NUMA
if (auto_movable_numa_aware &&
- !auto_movable_can_online_movable(nid, nr_pages))
+ !auto_movable_can_online_movable(nid, group, nr_pages))
goto kernel_zone;
#endif /* CONFIG_NUMA */
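End note: with the whole series applied, the policy remains opt-in at
runtime. Assuming the knobs from the earlier patch that introduced the
policy, they live under /sys/module/memory_hotplug/parameters/:
online_policy ("contig-zones" vs. "auto-movable"), auto_movable_ratio
(default 301) and auto_movable_numa_aware. This patch only changes how
"auto-movable" accounts dynamic memory groups when choosing between a
kernel zone and ZONE_MOVABLE.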