diff mbox series

[RFC,06/13] mm/migrate: update migration order during on hotplug events

Message ID 20210126003423.8D2B5637@viggo.jf.intel.com (mailing list archive)
State New
Headers show
Series Migrate Pages in lieu of discard | expand

Commit Message

Dave Hansen Jan. 26, 2021, 12:34 a.m. UTC
From: Dave Hansen <dave.hansen@linux.intel.com>

Reclaim-based migration is attempting to optimize data placement in
memory based on the system topology.  If the system changes, so must
the migration ordering.

The implementation here is pretty simple and entirely unoptimized.  On
any memory or CPU hotplug events, assume that a node was added or
removed and recalculate all migration targets.  This ensures that the
node_demotion[] array is always ready to be used in case the new
reclaim mode is enabled.

This recalculation is far from optimal, most glaringly that it does
not even attempt to figure out if nodes are actually coming or going.
But, given the expected paucity of hotplug events, this should be
fine.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Yang Shi <yang.shi@linux.alibaba.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: osalvador <osalvador@suse.de>
---

 b/mm/migrate.c |   97 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 95 insertions(+), 2 deletions(-)

Comments

Yang Shi Jan. 29, 2021, 8:59 p.m. UTC | #1
On Mon, Jan 25, 2021 at 4:41 PM Dave Hansen <dave.hansen@linux.intel.com> wrote:
>
>
> From: Dave Hansen <dave.hansen@linux.intel.com>
>
> Reclaim-based migration is attempting to optimize data placement in
> memory based on the system topology.  If the system changes, so must
> the migration ordering.
>
> The implementation here is pretty simple and entirely unoptimized.  On
> any memory or CPU hotplug events, assume that a node was added or
> removed and recalculate all migration targets.  This ensures that the
> node_demotion[] array is always ready to be used in case the new
> reclaim mode is enabled.
>
> This recalculation is far from optimal, most glaringly that it does
> not even attempt to figure out if nodes are actually coming or going.
> But, given the expected paucity of hotplug events, this should be
> fine.
>
> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: Yang Shi <yang.shi@linux.alibaba.com>
> Cc: David Rientjes <rientjes@google.com>
> Cc: Huang Ying <ying.huang@intel.com>
> Cc: Dan Williams <dan.j.williams@intel.com>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: osalvador <osalvador@suse.de>
> ---
>
>  b/mm/migrate.c |   97 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 95 insertions(+), 2 deletions(-)
>
> diff -puN mm/migrate.c~enable-numa-demotion mm/migrate.c
> --- a/mm/migrate.c~enable-numa-demotion 2021-01-25 16:23:11.850866703 -0800
> +++ b/mm/migrate.c      2021-01-25 16:23:11.855866703 -0800
> @@ -49,6 +49,7 @@
>  #include <linux/sched/mm.h>
>  #include <linux/ptrace.h>
>  #include <linux/oom.h>
> +#include <linux/memory.h>
>
>  #include <asm/tlbflush.h>
>
> @@ -3135,6 +3136,7 @@ void migrate_vma_finalize(struct migrate
>  EXPORT_SYMBOL(migrate_vma_finalize);
>  #endif /* CONFIG_DEVICE_PRIVATE */
>
> +#if defined(CONFIG_MEMORY_HOTPLUG)
>  /* Disable reclaim-based migration. */
>  static void disable_all_migrate_targets(void)
>  {
> @@ -3191,7 +3193,7 @@ static int establish_migrate_target(int
>   * with itself.  Exclusion is provided by memory hotplug events
>   * being single-threaded.
>   */
> -void __set_migration_target_nodes(void)
> +static void __set_migration_target_nodes(void)
>  {
>         nodemask_t next_pass    = NODE_MASK_NONE;
>         nodemask_t this_pass    = NODE_MASK_NONE;
> @@ -3253,9 +3255,100 @@ again:
>                 goto again;
>  }
>
> -void set_migration_target_nodes(void)
> +/*
> + * For callers that do not hold get_online_mems() already.
> + */
> +static void set_migration_target_nodes(void)

Aha, it is changed to static here. I think this hunk should be folded
into the previous patch.

>  {
>         get_online_mems();
>         __set_migration_target_nodes();
>         put_online_mems();
>  }
> +
> +/*
> + * React to hotplug events that might affect the migration targes

s/targes/targets

> + * like events that online or offline NUMA nodes.
> + *
> + * The ordering is also currently dependent on which nodes have
> + * CPUs.  That means we need CPU on/offline notification too.
> + */
> +static int migration_online_cpu(unsigned int cpu)
> +{
> +       set_migration_target_nodes();
> +       return 0;
> +}
> +
> +static int migration_offline_cpu(unsigned int cpu)
> +{
> +       set_migration_target_nodes();
> +       return 0;
> +}
> +
> +/*
> + * This leaves migrate-on-reclaim transiently disabled
> + * between the MEM_GOING_OFFLINE and MEM_OFFLINE events.
> + * This runs reclaim-based micgration is enabled or not.

s/micgration/migration

> + * This ensures that the user can turn reclaim-based
> + * migration at any time without needing to recalcuate

s/reclcuate/recalculate

> + * migration targets.
> + *
> + * These callbacks already hold get_online_mems().  That
> + * is why __set_migration_target_nodes() can be used as
> + * opposed to set_migration_target_nodes().
> + */
> +static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
> +                                                unsigned long action, void *arg)
> +{
> +       switch (action) {
> +       case MEM_GOING_OFFLINE:
> +               /*
> +                * Make sure there are not transient states where
> +                * an offline node is a migration target.  This
> +                * will leave migration disabled until the offline
> +                * completes and the MEM_OFFLINE case below runs.
> +                */
> +               disable_all_migrate_targets();

Don't we need smp_wmb() here? In the previous patch the comment says
write memory barrier is needed to guarantee readers see the consistent
values. The the smp_wmb() is called by __set_migration_target_nodes().
So, it seems it'd better to move smp_wmb() into
disable_all_migrate_targets().

> +               break;
> +       case MEM_OFFLINE:
> +       case MEM_ONLINE:
> +               /*
> +                * Recalculate the target nodes once the node
> +                * reaches its final state (online or offline).
> +                */
> +               __set_migration_target_nodes();
> +               break;
> +       case MEM_CANCEL_OFFLINE:
> +               /*
> +                * MEM_GOING_OFFLINE disabled all the migration
> +                * targets.  Reenable them.
> +                */
> +               __set_migration_target_nodes();
> +               break;
> +       case MEM_GOING_ONLINE:
> +       case MEM_CANCEL_ONLINE:
> +               break;
> +       }
> +
> +       return notifier_from_errno(0);
> +}
> +
> +static int __init migrate_on_reclaim_init(void)
> +{
> +       int ret;
> +
> +       ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "migrate on reclaim",
> +                               migration_online_cpu,
> +                               migration_offline_cpu);
> +       /*
> +        * In the unlikely case that this fails, the automatic
> +        * migration targets may become suboptimal for nodes
> +        * where N_CPU changes.  With such a small impact in a
> +        * rare case, do not bother trying to do anything special.
> +        */
> +       WARN_ON(ret < 0);
> +
> +       hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
> +       return 0;
> +}
> +late_initcall(migrate_on_reclaim_init);
> +#endif /* CONFIG_MEMORY_HOTPLUG */
> _
>
Oscar Salvador Feb. 2, 2021, 11:42 a.m. UTC | #2
On Mon, Jan 25, 2021 at 04:34:23PM -0800, Dave Hansen wrote:
> 
> From: Dave Hansen <dave.hansen@linux.intel.com>
> 
> Reclaim-based migration is attempting to optimize data placement in
> memory based on the system topology.  If the system changes, so must
> the migration ordering.
> 
> The implementation here is pretty simple and entirely unoptimized.  On
> any memory or CPU hotplug events, assume that a node was added or
> removed and recalculate all migration targets.  This ensures that the
> node_demotion[] array is always ready to be used in case the new
> reclaim mode is enabled.
> 
> This recalculation is far from optimal, most glaringly that it does
> not even attempt to figure out if nodes are actually coming or going.
> But, given the expected paucity of hotplug events, this should be
> fine.
> 
> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: Yang Shi <yang.shi@linux.alibaba.com>
> Cc: David Rientjes <rientjes@google.com>
> Cc: Huang Ying <ying.huang@intel.com>
> Cc: Dan Williams <dan.j.williams@intel.com>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: osalvador <osalvador@suse.de>
> ---

[...]

> +
> +/*
> + * React to hotplug events that might affect the migration targes
> + * like events that online or offline NUMA nodes.
> + *
> + * The ordering is also currently dependent on which nodes have
> + * CPUs.  That means we need CPU on/offline notification too.
> + */
> +static int migration_online_cpu(unsigned int cpu)
> +{
> +	set_migration_target_nodes();
> +	return 0;
> +}
> +
> +static int migration_offline_cpu(unsigned int cpu)
> +{
> +	set_migration_target_nodes();
> +	return 0;
> +}
> +
> +/*
> + * This leaves migrate-on-reclaim transiently disabled
> + * between the MEM_GOING_OFFLINE and MEM_OFFLINE events.
> + * This runs reclaim-based micgration is enabled or not.
> + * This ensures that the user can turn reclaim-based
> + * migration at any time without needing to recalcuate
> + * migration targets.
> + *
> + * These callbacks already hold get_online_mems().  That
> + * is why __set_migration_target_nodes() can be used as
> + * opposed to set_migration_target_nodes().
> + */
> +static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
> +						 unsigned long action, void *arg)
> +{
> +	switch (action) {
> +	case MEM_GOING_OFFLINE:
> +		/*
> +		 * Make sure there are not transient states where
> +		 * an offline node is a migration target.  This
> +		 * will leave migration disabled until the offline
> +		 * completes and the MEM_OFFLINE case below runs.
> +		 */
> +		disable_all_migrate_targets();
> +		break;
> +	case MEM_OFFLINE:
> +	case MEM_ONLINE:
> +		/*
> +		 * Recalculate the target nodes once the node
> +		 * reaches its final state (online or offline).
> +		 */
> +		__set_migration_target_nodes();
> +		break;
> +	case MEM_CANCEL_OFFLINE:
> +		/*
> +		 * MEM_GOING_OFFLINE disabled all the migration
> +		 * targets.  Reenable them.
> +		 */
> +		__set_migration_target_nodes();
> +		break;
> +	case MEM_GOING_ONLINE:
> +	case MEM_CANCEL_ONLINE:
> +		break;
> +	}
> +
> +	return notifier_from_errno(0);
> +}

This looks good, and I kinda like it.
But in this case, all we care about is whether NUMA node does or does
not have memory, so we have to remove/added into the demotion list.
So, would make more sense to have a kinda helper in
node_states_{set,clear}_node that calls the respective functions
(disable_all_migrate_targets and __set_migration_target_nodes)?
Dave Hansen Feb. 9, 2021, 11:45 p.m. UTC | #3
On 2/2/21 3:42 AM, Oscar Salvador wrote:
>> +static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
>> +						 unsigned long action, void *arg)
>> +{
>> +	switch (action) {
>> +	case MEM_GOING_OFFLINE:
>> +		/*
>> +		 * Make sure there are not transient states where
>> +		 * an offline node is a migration target.  This
>> +		 * will leave migration disabled until the offline
>> +		 * completes and the MEM_OFFLINE case below runs.
>> +		 */
>> +		disable_all_migrate_targets();
>> +		break;
>> +	case MEM_OFFLINE:
>> +	case MEM_ONLINE:
>> +		/*
>> +		 * Recalculate the target nodes once the node
>> +		 * reaches its final state (online or offline).
>> +		 */
>> +		__set_migration_target_nodes();
>> +		break;
>> +	case MEM_CANCEL_OFFLINE:
>> +		/*
>> +		 * MEM_GOING_OFFLINE disabled all the migration
>> +		 * targets.  Reenable them.
>> +		 */
>> +		__set_migration_target_nodes();
>> +		break;
>> +	case MEM_GOING_ONLINE:
>> +	case MEM_CANCEL_ONLINE:
>> +		break;
>> +	}
>> +
>> +	return notifier_from_errno(0);
>> +}
> This looks good, and I kinda like it.
> But in this case, all we care about is whether NUMA node does or does
> not have memory, so we have to remove/added into the demotion list.
> So, would make more sense to have a kinda helper in
> node_states_{set,clear}_node that calls the respective functions
> (disable_all_migrate_targets and __set_migration_target_nodes)?

Of, you're saying that we could do this in the hotplug code itself
instead of from a notifier?  I agree, we *could*.  That would be more
efficient.  But, I do like the idea of doing this from a notifier
because it's a bit less brittle.

Do you feel strongly about this one?
Oscar Salvador Feb. 10, 2021, 8:55 a.m. UTC | #4
On Tue, Feb 09, 2021 at 03:45:55PM -0800, Dave Hansen wrote:
> On 2/2/21 3:42 AM, Oscar Salvador wrote:
> >> +static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
> >> +						 unsigned long action, void *arg)
> >> +{
> >> +	switch (action) {
> >> +	case MEM_GOING_OFFLINE:
> >> +		/*
> >> +		 * Make sure there are not transient states where
> >> +		 * an offline node is a migration target.  This
> >> +		 * will leave migration disabled until the offline
> >> +		 * completes and the MEM_OFFLINE case below runs.
> >> +		 */
> >> +		disable_all_migrate_targets();
> >> +		break;
> >> +	case MEM_OFFLINE:
> >> +	case MEM_ONLINE:
> >> +		/*
> >> +		 * Recalculate the target nodes once the node
> >> +		 * reaches its final state (online or offline).
> >> +		 */
> >> +		__set_migration_target_nodes();
> >> +		break;
> >> +	case MEM_CANCEL_OFFLINE:
> >> +		/*
> >> +		 * MEM_GOING_OFFLINE disabled all the migration
> >> +		 * targets.  Reenable them.
> >> +		 */
> >> +		__set_migration_target_nodes();
> >> +		break;
> >> +	case MEM_GOING_ONLINE:
> >> +	case MEM_CANCEL_ONLINE:
> >> +		break;
> >> +	}
> >> +
> >> +	return notifier_from_errno(0);
> >> +}
> > This looks good, and I kinda like it.
> > But in this case, all we care about is whether NUMA node does or does
> > not have memory, so we have to remove/added into the demotion list.
> > So, would make more sense to have a kinda helper in
> > node_states_{set,clear}_node that calls the respective functions
> > (disable_all_migrate_targets and __set_migration_target_nodes)?
> 
> Of, you're saying that we could do this in the hotplug code itself
> instead of from a notifier?  I agree, we *could*.  That would be more
> efficient.  But, I do like the idea of doing this from a notifier
> because it's a bit less brittle.
> 
> Do you feel strongly about this one?

Hi Dave,

No, I do not. I even had mixed feelings myself when suggesting this as well.
As you said, it would be more optimum, but it feels kinda wrong placing the
call directly in hotplug code.

So all in all, I think your approach is more neat and clean, and more than
enough for now.

I yet have to dive in the details, but I got one more question.
Can we have CONFIG_MEMORY_HOTPLUG && !CONFIG_HOTPLUG_CPU scenarios?
I wonder because I do not see a stub function in case CONFIG_HOTPLUG_CPU
is not enabled, so I guess we cannot.

I am asking this because migrate_on_reclaim_callback() is envolved
with CONFIG_MEMORY_HOTPLUG, but calls cpuhp_setup_state, and I was not
sure whether we would have some dependency here?

Thanks
diff mbox series

Patch

diff -puN mm/migrate.c~enable-numa-demotion mm/migrate.c
--- a/mm/migrate.c~enable-numa-demotion	2021-01-25 16:23:11.850866703 -0800
+++ b/mm/migrate.c	2021-01-25 16:23:11.855866703 -0800
@@ -49,6 +49,7 @@ 
 #include <linux/sched/mm.h>
 #include <linux/ptrace.h>
 #include <linux/oom.h>
+#include <linux/memory.h>
 
 #include <asm/tlbflush.h>
 
@@ -3135,6 +3136,7 @@  void migrate_vma_finalize(struct migrate
 EXPORT_SYMBOL(migrate_vma_finalize);
 #endif /* CONFIG_DEVICE_PRIVATE */
 
+#if defined(CONFIG_MEMORY_HOTPLUG)
 /* Disable reclaim-based migration. */
 static void disable_all_migrate_targets(void)
 {
@@ -3191,7 +3193,7 @@  static int establish_migrate_target(int
  * with itself.  Exclusion is provided by memory hotplug events
  * being single-threaded.
  */
-void __set_migration_target_nodes(void)
+static void __set_migration_target_nodes(void)
 {
 	nodemask_t next_pass	= NODE_MASK_NONE;
 	nodemask_t this_pass	= NODE_MASK_NONE;
@@ -3253,9 +3255,100 @@  again:
 		goto again;
 }
 
-void set_migration_target_nodes(void)
+/*
+ * For callers that do not hold get_online_mems() already.
+ */
+static void set_migration_target_nodes(void)
 {
 	get_online_mems();
 	__set_migration_target_nodes();
 	put_online_mems();
 }
+
+/*
+ * React to hotplug events that might affect the migration targes
+ * like events that online or offline NUMA nodes.
+ *
+ * The ordering is also currently dependent on which nodes have
+ * CPUs.  That means we need CPU on/offline notification too.
+ */
+static int migration_online_cpu(unsigned int cpu)
+{
+	set_migration_target_nodes();
+	return 0;
+}
+
+static int migration_offline_cpu(unsigned int cpu)
+{
+	set_migration_target_nodes();
+	return 0;
+}
+
+/*
+ * This leaves migrate-on-reclaim transiently disabled
+ * between the MEM_GOING_OFFLINE and MEM_OFFLINE events.
+ * This runs reclaim-based micgration is enabled or not.
+ * This ensures that the user can turn reclaim-based
+ * migration at any time without needing to recalcuate
+ * migration targets.
+ *
+ * These callbacks already hold get_online_mems().  That
+ * is why __set_migration_target_nodes() can be used as
+ * opposed to set_migration_target_nodes().
+ */
+static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
+						 unsigned long action, void *arg)
+{
+	switch (action) {
+	case MEM_GOING_OFFLINE:
+		/*
+		 * Make sure there are not transient states where
+		 * an offline node is a migration target.  This
+		 * will leave migration disabled until the offline
+		 * completes and the MEM_OFFLINE case below runs.
+		 */
+		disable_all_migrate_targets();
+		break;
+	case MEM_OFFLINE:
+	case MEM_ONLINE:
+		/*
+		 * Recalculate the target nodes once the node
+		 * reaches its final state (online or offline).
+		 */
+		__set_migration_target_nodes();
+		break;
+	case MEM_CANCEL_OFFLINE:
+		/*
+		 * MEM_GOING_OFFLINE disabled all the migration
+		 * targets.  Reenable them.
+		 */
+		__set_migration_target_nodes();
+		break;
+	case MEM_GOING_ONLINE:
+	case MEM_CANCEL_ONLINE:
+		break;
+	}
+
+	return notifier_from_errno(0);
+}
+
+static int __init migrate_on_reclaim_init(void)
+{
+	int ret;
+
+	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "migrate on reclaim",
+				migration_online_cpu,
+				migration_offline_cpu);
+	/*
+	 * In the unlikely case that this fails, the automatic
+	 * migration targets may become suboptimal for nodes
+	 * where N_CPU changes.  With such a small impact in a
+	 * rare case, do not bother trying to do anything special.
+	 */
+	WARN_ON(ret < 0);
+
+	hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
+	return 0;
+}
+late_initcall(migrate_on_reclaim_init);
+#endif /* CONFIG_MEMORY_HOTPLUG */