diff mbox series

[-V11,2/9] mm/migrate: update node demotion order on hotplug events

Message ID 20210721063926.3024591-2-ying.huang@intel.com (mailing list archive)
State New
Headers show
Series [-V11,1/9] mm/numa: automatically generate node migration order | expand

Commit Message

Huang, Ying July 21, 2021, 6:39 a.m. UTC
From: Dave Hansen <dave.hansen@linux.intel.com>

Reclaim-based migration is attempting to optimize data placement in memory
based on the system topology.  If the system changes, so must the
migration ordering.

The implementation is conceptually simple and entirely unoptimized.  On
any memory or CPU hotplug events, assume that a node was added or removed
and recalculate all migration targets.  This ensures that the
node_demotion[] array is always ready to be used in case the new reclaim
mode is enabled.

This recalculation is far from optimal, most glaringly that it does not
even attempt to figure out the hotplug event would have some *actual*
effect on the demotion order.  But, given the expected paucity of hotplug
events, this should be fine.

Link: https://lkml.kernel.org/r/20210715055145.195411-3-ying.huang@intel.com
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Wei Xu <weixugc@google.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: David Rientjes <rientjes@google.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Keith Busch <kbusch@kernel.org>
Cc: Yang Shi <yang.shi@linux.alibaba.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/migrate.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 89 insertions(+), 1 deletion(-)

Comments

Abhishek Goel Feb. 23, 2022, 11:02 p.m. UTC | #1
Hi Dave,


> From: Dave Hansen <dave.hansen@linux.intel.com>
>
> Reclaim-based migration is attempting to optimize data placement in memory
> based on the system topology.  If the system changes, so must the
> migration ordering.
>
> The implementation is conceptually simple and entirely unoptimized.  On
> any memory or CPU hotplug events, assume that a node was added or removed
> and recalculate all migration targets.  This ensures that the
> node_demotion[] array is always ready to be used in case the new reclaim
> mode is enabled.
>
> This recalculation is far from optimal, most glaringly that it does not
> even attempt to figure out the hotplug event would have some *actual*
> effect on the demotion order.  But, given the expected paucity of hotplug
> events, this should be fine.
>
> Link: https://lkml.kernel.org/r/20210715055145.195411-3-ying.huang@intel.com
> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
> Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
> Reviewed-by: Yang Shi <shy828301@gmail.com>
> Reviewed-by: Zi Yan <ziy@nvidia.com>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: Wei Xu <weixugc@google.com>
> Cc: Oscar Salvador <osalvador@suse.de>
> Cc: David Rientjes <rientjes@google.com>
> Cc: Dan Williams <dan.j.williams@intel.com>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: Greg Thelen <gthelen@google.com>
> Cc: Keith Busch <kbusch@kernel.org>
> Cc: Yang Shi <yang.shi@linux.alibaba.com>
> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
> ---
>   mm/migrate.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>   1 file changed, 89 insertions(+), 1 deletion(-)
>
> diff --git a/mm/migrate.c b/mm/migrate.c
> index b7a40ab47648..a40c391f9ca7 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -49,6 +49,7 @@
>   #include <linux/sched/mm.h>
>   #include <linux/ptrace.h>
>   #include <linux/oom.h>
> +#include <linux/memory.h>
>   
>   #include <asm/tlbflush.h>
>   
> @@ -3057,6 +3058,7 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
>   EXPORT_SYMBOL(migrate_vma_finalize);
>   #endif /* CONFIG_DEVICE_PRIVATE */
>   
> +#if defined(CONFIG_MEMORY_HOTPLUG)
>   /* Disable reclaim-based migration. */
>   static void __disable_all_migrate_targets(void)
>   {
> @@ -3191,10 +3193,96 @@ static void __set_migration_target_nodes(void)
>   /*
>    * For callers that do not hold get_online_mems() already.
>    */
> -__maybe_unused // <- temporay to prevent warnings during bisects
>   static void set_migration_target_nodes(void)
>   {
>   	get_online_mems();
>   	__set_migration_target_nodes();
>   	put_online_mems();
>   }
> +
> +/*
> + * React to hotplug events that might affect the migration targets
> + * like events that online or offline NUMA nodes.
> + *
> + * The ordering is also currently dependent on which nodes have
> + * CPUs.  That means we need CPU on/offline notification too.
> + */
> +static int migration_online_cpu(unsigned int cpu)
> +{
> +	set_migration_target_nodes();
> +	return 0;
> +}
> +
> +static int migration_offline_cpu(unsigned int cpu)
> +{
> +	set_migration_target_nodes();
> +	return 0;
> +}
> +
> +/*
> + * This leaves migrate-on-reclaim transiently disabled between
> + * the MEM_GOING_OFFLINE and MEM_OFFLINE events.  This runs
> + * whether reclaim-based migration is enabled or not, which
> + * ensures that the user can turn reclaim-based migration at
> + * any time without needing to recalculate migration targets.
> + *
> + * These callbacks already hold get_online_mems().  That is why
> + * __set_migration_target_nodes() can be used as opposed to
> + * set_migration_target_nodes().
> + */
> +static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
> +						 unsigned long action, void *arg)
> +{
> +	switch (action) {
> +	case MEM_GOING_OFFLINE:
> +		/*
> +		 * Make sure there are not transient states where
> +		 * an offline node is a migration target.  This
> +		 * will leave migration disabled until the offline
> +		 * completes and the MEM_OFFLINE case below runs.
> +		 */
> +		disable_all_migrate_targets();
> +		break;
> +	case MEM_OFFLINE:
> +	case MEM_ONLINE:
> +		/*
> +		 * Recalculate the target nodes once the node
> +		 * reaches its final state (online or offline).
> +		 */
> +		__set_migration_target_nodes();
> +		break;
> +	case MEM_CANCEL_OFFLINE:
> +		/*
> +		 * MEM_GOING_OFFLINE disabled all the migration
> +		 * targets.  Reenable them.
> +		 */
> +		__set_migration_target_nodes();
> +		break;
> +	case MEM_GOING_ONLINE:
> +	case MEM_CANCEL_ONLINE:
> +		break;
> +	}
> +
> +	return notifier_from_errno(0);
> +}
> +
> +static int __init migrate_on_reclaim_init(void)
> +{
> +	int ret;
> +
> +	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "migrate on reclaim",
> +				migration_online_cpu,
> +				migration_offline_cpu);
> +	/*
> +	 * In the unlikely case that this fails, the automatic
> +	 * migration targets may become suboptimal for nodes
> +	 * where N_CPU changes.  With such a small impact in a
> +	 * rare case, do not bother trying to do anything special.
> +	 */
> +	WARN_ON(ret < 0);
> +
> +	hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
> +	return 0;
> +}
> +late_initcall(migrate_on_reclaim_init);
> +#endif /* CONFIG_MEMORY_HOTPLUG */
I intend to report a issue that is being caused by this patch.
 From 5.14 to 5.15 kernel, hotplug on power systems was observed to be
taking double the expected time. Git bisect between these two kernels
points to this patch as the one causing the issue.
I have verified from cpu-hotplug callback trace that we are infact
spending a lot of time in migration_offline_cpu code path.
I see that there have been subsequent patches to optimize this update
node demotion order code, but those patches are already in 5.15 kernel
and regressions are still observed even after those optimizations.
I have also recreated and observed the issue across systems with
different configs, and across different kernels containing this patch.
If needed, I will provide experiment results and traces that were used
to conclude this.

Regards,

- Abhishek
Dave Hansen Feb. 24, 2022, 12:05 a.m. UTC | #2
On 2/23/22 15:02, Abhishek Goel wrote:
> If needed, I will provide experiment results and traces that were used
> to conclude this.

It would be great if you can provide some more info.  Even just a CPU
time profile would be helpful.

It would also be great to understand more about what "hotplug on power
systems" actually means.  Is this a synthetic benchmark, or are actual
end-users running into this issue?  Are entire nodes of CPUs going
offline?  Or is this just doing an offline/online of CPU 22 in a 100-CPU
NUMA node?
Abhishek Goel Feb. 24, 2022, 11:37 p.m. UTC | #3
On 24/02/22 05:35, Dave Hansen wrote:
> On 2/23/22 15:02, Abhishek Goel wrote:
>> If needed, I will provide experiment results and traces that were used
>> to conclude this.
> It would be great if you can provide some more info.  Even just a CPU
> time profile would be helpful.

Average total time taken for SMT=8 to SMT=1 in v5.14 : 20s

Average total time taken for SMT=8 to SMT=1 in v5.15 : 36s

(Observed in system with 150+ CPUs )

>
> It would also be great to understand more about what "hotplug on power
> systems" actually means.  Is this a synthetic benchmark, or are actual
> end-users running into this issue?  Are entire nodes of CPUs going
> offline?  Or is this just doing an offline/online of CPU 22 in a 100-CPU
> NUMA node?
No, this is not a synthetic benchmark. This can be recreated with
entire nodes of CPUs going offline. And the online/offline operations
have been performed by simple scripts. The time observed can also be
verified (for individual CPU or the entire system) by observing CPU-
Hotplug trace which provide consistent result as observed by using
the scripts.
Dave Hansen Feb. 25, 2022, 12:49 a.m. UTC | #4
On 2/24/22 15:37, Abhishek Goel wrote:
> 
> On 24/02/22 05:35, Dave Hansen wrote:
>> On 2/23/22 15:02, Abhishek Goel wrote:
>>> If needed, I will provide experiment results and traces that were used
>>> to conclude this.
>> It would be great if you can provide some more info.  Even just a CPU
>> time profile would be helpful.
> 
> Average total time taken for SMT=8 to SMT=1 in v5.14 : 20s
> 
> Average total time taken for SMT=8 to SMT=1 in v5.15 : 36s
> 
> (Observed in system with 150+ CPUs )

I was kinda thinking of:

	perf record / perf report

output.  Not wall clock time.  We need to know what the kernel is doing
during those extra 16 seconds.
Huang, Ying Feb. 25, 2022, 2:32 a.m. UTC | #5
Hi, Abhishek,

Abhishek Goel <huntbag@linux.vnet.ibm.com> writes:

> On 24/02/22 05:35, Dave Hansen wrote:
>> On 2/23/22 15:02, Abhishek Goel wrote:
>>> If needed, I will provide experiment results and traces that were used
>>> to conclude this.
>> It would be great if you can provide some more info.  Even just a CPU
>> time profile would be helpful.
>
> Average total time taken for SMT=8 to SMT=1 in v5.14 : 20s
>
> Average total time taken for SMT=8 to SMT=1 in v5.15 : 36s
>
> (Observed in system with 150+ CPUs )

We have run into a memory hotplug regression before.  Let's check
whether the problem is similar.  Can you try the below debug patch?

Best Regards,
Huang, Ying

----------------------------8<------------------------------------------
From 500c0b53436b7a697ed5d77241abbc0d5d3cfc07 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 29 Sep 2021 10:57:19 +0800
Subject: [PATCH] mm/migrate: Debug CPU hotplug regression

Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
---
 mm/migrate.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/mm/migrate.c b/mm/migrate.c
index c7da064b4781..c4805f15e616 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -3261,15 +3261,17 @@ static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
  * The ordering is also currently dependent on which nodes have
  * CPUs.  That means we need CPU on/offline notification too.
  */
-static int migration_online_cpu(unsigned int cpu)
+static int migration_cpu_hotplug(unsigned int cpu)
 {
-	set_migration_target_nodes();
-	return 0;
-}
+	static int nr_cpu_node_saved;
+	int nr_cpu_node;
+
+	nr_cpu_node = num_node_state(N_CPU);
+	if (nr_cpu_node != nr_cpu_node_saved) {
+		set_migration_target_nodes();
+		nr_cpu_node_saved = nr_cpu_node;
+	}
 
-static int migration_offline_cpu(unsigned int cpu)
-{
-	set_migration_target_nodes();
 	return 0;
 }
 
@@ -3283,7 +3285,7 @@ static int __init migrate_on_reclaim_init(void)
 	WARN_ON(!node_demotion);
 
 	ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",
-					NULL, migration_offline_cpu);
+					NULL, migration_cpu_hotplug);
 	/*
 	 * In the unlikely case that this fails, the automatic
 	 * migration targets may become suboptimal for nodes
@@ -3292,7 +3294,7 @@ static int __init migrate_on_reclaim_init(void)
 	 */
 	WARN_ON(ret < 0);
 	ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE, "mm/demotion:online",
-				migration_online_cpu, NULL);
+				migration_cpu_hotplug, NULL);
 	WARN_ON(ret < 0);
 
 	hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
Abhishek Goel Feb. 25, 2022, 8:35 p.m. UTC | #6
Hi Huang,

On 25/02/22 08:02, Huang, Ying wrote:
>
> We have run into a memory hotplug regression before.  Let's check
> whether the problem is similar.  Can you try the below debug patch?
>
> Best Regards,
> Huang, Ying
>
> ----------------------------8<------------------------------------------
>  From 500c0b53436b7a697ed5d77241abbc0d5d3cfc07 Mon Sep 17 00:00:00 2001
> From: Huang Ying <ying.huang@intel.com>
> Date: Wed, 29 Sep 2021 10:57:19 +0800
> Subject: [PATCH] mm/migrate: Debug CPU hotplug regression
>
> Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
> ---
>   mm/migrate.c | 20 +++++++++++---------
>   1 file changed, 11 insertions(+), 9 deletions(-)
>
> diff --git a/mm/migrate.c b/mm/migrate.c
> index c7da064b4781..c4805f15e616 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -3261,15 +3261,17 @@ static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
>    * The ordering is also currently dependent on which nodes have
>    * CPUs.  That means we need CPU on/offline notification too.
>    */
> -static int migration_online_cpu(unsigned int cpu)
> +static int migration_cpu_hotplug(unsigned int cpu)
>   {
> -	set_migration_target_nodes();
> -	return 0;
> -}
> +	static int nr_cpu_node_saved;
> +	int nr_cpu_node;
> +
> +	nr_cpu_node = num_node_state(N_CPU);
> +	if (nr_cpu_node != nr_cpu_node_saved) {
> +		set_migration_target_nodes();
> +		nr_cpu_node_saved = nr_cpu_node;
> +	}
>
> -static int migration_offline_cpu(unsigned int cpu)
> -{
> -	set_migration_target_nodes();
>   	return 0;
>   }
>
> @@ -3283,7 +3285,7 @@ static int __init migrate_on_reclaim_init(void)
>   	WARN_ON(!node_demotion);
>
>   	ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",
> -					NULL, migration_offline_cpu);
> +					NULL, migration_cpu_hotplug);
>   	/*
>   	 * In the unlikely case that this fails, the automatic
>   	 * migration targets may become suboptimal for nodes
> @@ -3292,7 +3294,7 @@ static int __init migrate_on_reclaim_init(void)
>   	 */
>   	WARN_ON(ret < 0);
>   	ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE, "mm/demotion:online",
> -				migration_online_cpu, NULL);
> +				migration_cpu_hotplug, NULL);
>   	WARN_ON(ret < 0);
>
>   	hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
This works. Applied this on 5.15 kernel and don't see any regression
compared to 5.14 kernel.
So, Have you posted this patch yet? Or any plans on inclusion of any
similar patch?
Oscar Salvador March 8, 2022, 10:27 a.m. UTC | #7
On Fri, Feb 25, 2022 at 10:32:20AM +0800, Huang, Ying wrote:
> -static int migration_online_cpu(unsigned int cpu)
> +static int migration_cpu_hotplug(unsigned int cpu)
>  {
> -	set_migration_target_nodes();
> -	return 0;
> -}
> +	static int nr_cpu_node_saved;
> +	int nr_cpu_node;
> +
> +	nr_cpu_node = num_node_state(N_CPU);
> +	if (nr_cpu_node != nr_cpu_node_saved) {
> +		set_migration_target_nodes();
> +		nr_cpu_node_saved = nr_cpu_node;
> +	}
>  
> -static int migration_offline_cpu(unsigned int cpu)
> -{
> -	set_migration_target_nodes();
>  	return 0;
>  }

These callbacks feel like re-inveting the wheel.
We do already have two functions that get called during cpu
online/offline, and that sets/clears N_CPU on the node properly.
And that is exactly what we want, so what about the following (only
compile-tested):

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index db96e10eb8da..031af2bb71dc 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -48,6 +48,7 @@ int folio_migrate_mapping(struct address_space *mapping,
 		struct folio *newfolio, struct folio *folio, int extra_count);

 extern bool numa_demotion_enabled;
+extern void set_migration_target_nodes(void);
 #else

 static inline void putback_movable_pages(struct list_head *l) {}
diff --git a/mm/migrate.c b/mm/migrate.c
index c7da064b4781..7847e4de01d7 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -3190,7 +3190,7 @@ static void __set_migration_target_nodes(void)
 /*
  * For callers that do not hold get_online_mems() already.
  */
-static void set_migration_target_nodes(void)
+void set_migration_target_nodes(void)
 {
 	get_online_mems();
 	__set_migration_target_nodes();
@@ -3254,47 +3254,13 @@ static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
 	return notifier_from_errno(0);
 }

-/*
- * React to hotplug events that might affect the migration targets
- * like events that online or offline NUMA nodes.
- *
- * The ordering is also currently dependent on which nodes have
- * CPUs.  That means we need CPU on/offline notification too.
- */
-static int migration_online_cpu(unsigned int cpu)
-{
-	set_migration_target_nodes();
-	return 0;
-}
-
-static int migration_offline_cpu(unsigned int cpu)
-{
-	set_migration_target_nodes();
-	return 0;
-}
-
 static int __init migrate_on_reclaim_init(void)
 {
-	int ret;
-
 	node_demotion = kmalloc_array(nr_node_ids,
 				      sizeof(struct demotion_nodes),
 				      GFP_KERNEL);
 	WARN_ON(!node_demotion);

-	ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",
-					NULL, migration_offline_cpu);
-	/*
-	 * In the unlikely case that this fails, the automatic
-	 * migration targets may become suboptimal for nodes
-	 * where N_CPU changes.  With such a small impact in a
-	 * rare case, do not bother trying to do anything special.
-	 */
-	WARN_ON(ret < 0);
-	ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE, "mm/demotion:online",
-				migration_online_cpu, NULL);
-	WARN_ON(ret < 0);
-
 	hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
 	return 0;
 }
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4057372745d0..0529a83c8f89 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -28,6 +28,7 @@
 #include <linux/mm_inline.h>
 #include <linux/page_ext.h>
 #include <linux/page_owner.h>
+#include <linux/migrate.h>

 #include "internal.h"

@@ -2043,7 +2044,12 @@ static void __init init_cpu_node_state(void)
 static int vmstat_cpu_online(unsigned int cpu)
 {
 	refresh_zone_stat_thresholds();
-	node_set_state(cpu_to_node(cpu), N_CPU);
+
+	if (!node_state(cpu_to_node(cpu), N_CPU)) {
+		node_set_state(cpu_to_node(cpu), N_CPU);
+		set_migration_target_nodes();
+	}
+
 	return 0;
 }

@@ -2066,6 +2072,8 @@ static int vmstat_cpu_dead(unsigned int cpu)
 		return 0;

 	node_clear_state(node, N_CPU);
+	set_migration_target_nodes();
+
 	return 0;
 }

I think this is just easier and meets exactly the goal.

We could go even further and move the work left in
migrate_on_reclaim_init() to init_mm_internals().
(I __think__ we should be fine because there is no dependency
there, e.g: notifier being set up somewhere later after
init_mm_internals() has been called).
Dave Hansen March 8, 2022, 5:07 p.m. UTC | #8
On 3/8/22 02:27, Oscar Salvador wrote:
> @@ -2043,7 +2044,12 @@ static void __init init_cpu_node_state(void)
>  static int vmstat_cpu_online(unsigned int cpu)
>  {
>  	refresh_zone_stat_thresholds();
> -	node_set_state(cpu_to_node(cpu), N_CPU);
> +
> +	if (!node_state(cpu_to_node(cpu), N_CPU)) {
> +		node_set_state(cpu_to_node(cpu), N_CPU);
> +		set_migration_target_nodes();
> +	}
> +
>  	return 0;
>  }
> 
> @@ -2066,6 +2072,8 @@ static int vmstat_cpu_dead(unsigned int cpu)
>  		return 0;
> 
>  	node_clear_state(node, N_CPU);
> +	set_migration_target_nodes();
> +
>  	return 0;
>  }

Yeah, those callbacks do look like they're reinventing the wheel.  This
is a much more direct way of doing it.
Oscar Salvador March 8, 2022, 6:56 p.m. UTC | #9
On Tue, Mar 08, 2022 at 09:07:20AM -0800, Dave Hansen wrote:
> On 3/8/22 02:27, Oscar Salvador wrote:
> > @@ -2043,7 +2044,12 @@ static void __init init_cpu_node_state(void)
> >  static int vmstat_cpu_online(unsigned int cpu)
> >  {
> >  	refresh_zone_stat_thresholds();
> > -	node_set_state(cpu_to_node(cpu), N_CPU);
> > +
> > +	if (!node_state(cpu_to_node(cpu), N_CPU)) {
> > +		node_set_state(cpu_to_node(cpu), N_CPU);
> > +		set_migration_target_nodes();
> > +	}
> > +
> >  	return 0;
> >  }
> > 
> > @@ -2066,6 +2072,8 @@ static int vmstat_cpu_dead(unsigned int cpu)
> >  		return 0;
> > 
> >  	node_clear_state(node, N_CPU);
> > +	set_migration_target_nodes();
> > +
> >  	return 0;
> >  }
> 
> Yeah, those callbacks do look like they're reinventing the wheel.  This
> is a much more direct way of doing it.

Then let me play a bit more with it and I can cook a patch unless
someone feels strong against it.

Thanks
Huang, Ying March 9, 2022, 12:29 a.m. UTC | #10
Oscar Salvador <osalvador@suse.de> writes:

> On Tue, Mar 08, 2022 at 09:07:20AM -0800, Dave Hansen wrote:
>> On 3/8/22 02:27, Oscar Salvador wrote:
>> > @@ -2043,7 +2044,12 @@ static void __init init_cpu_node_state(void)
>> >  static int vmstat_cpu_online(unsigned int cpu)
>> >  {
>> >  	refresh_zone_stat_thresholds();
>> > -	node_set_state(cpu_to_node(cpu), N_CPU);
>> > +
>> > +	if (!node_state(cpu_to_node(cpu), N_CPU)) {
>> > +		node_set_state(cpu_to_node(cpu), N_CPU);
>> > +		set_migration_target_nodes();
>> > +	}
>> > +
>> >  	return 0;
>> >  }
>> > 
>> > @@ -2066,6 +2072,8 @@ static int vmstat_cpu_dead(unsigned int cpu)
>> >  		return 0;
>> > 
>> >  	node_clear_state(node, N_CPU);
>> > +	set_migration_target_nodes();
>> > +
>> >  	return 0;
>> >  }
>> 
>> Yeah, those callbacks do look like they're reinventing the wheel.  This
>> is a much more direct way of doing it.
>
> Then let me play a bit more with it and I can cook a patch unless
> someone feels strong against it.

This looks good to me, Thanks!

Best Regards,
Huang, Ying
diff mbox series

Patch

diff --git a/mm/migrate.c b/mm/migrate.c
index b7a40ab47648..a40c391f9ca7 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -49,6 +49,7 @@ 
 #include <linux/sched/mm.h>
 #include <linux/ptrace.h>
 #include <linux/oom.h>
+#include <linux/memory.h>
 
 #include <asm/tlbflush.h>
 
@@ -3057,6 +3058,7 @@  void migrate_vma_finalize(struct migrate_vma *migrate)
 EXPORT_SYMBOL(migrate_vma_finalize);
 #endif /* CONFIG_DEVICE_PRIVATE */
 
+#if defined(CONFIG_MEMORY_HOTPLUG)
 /* Disable reclaim-based migration. */
 static void __disable_all_migrate_targets(void)
 {
@@ -3191,10 +3193,96 @@  static void __set_migration_target_nodes(void)
 /*
  * For callers that do not hold get_online_mems() already.
  */
-__maybe_unused // <- temporay to prevent warnings during bisects
 static void set_migration_target_nodes(void)
 {
 	get_online_mems();
 	__set_migration_target_nodes();
 	put_online_mems();
 }
+
+/*
+ * React to hotplug events that might affect the migration targets
+ * like events that online or offline NUMA nodes.
+ *
+ * The ordering is also currently dependent on which nodes have
+ * CPUs.  That means we need CPU on/offline notification too.
+ */
+static int migration_online_cpu(unsigned int cpu)
+{
+	set_migration_target_nodes();
+	return 0;
+}
+
+static int migration_offline_cpu(unsigned int cpu)
+{
+	set_migration_target_nodes();
+	return 0;
+}
+
+/*
+ * This leaves migrate-on-reclaim transiently disabled between
+ * the MEM_GOING_OFFLINE and MEM_OFFLINE events.  This runs
+ * whether reclaim-based migration is enabled or not, which
+ * ensures that the user can turn reclaim-based migration at
+ * any time without needing to recalculate migration targets.
+ *
+ * These callbacks already hold get_online_mems().  That is why
+ * __set_migration_target_nodes() can be used as opposed to
+ * set_migration_target_nodes().
+ */
+static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
+						 unsigned long action, void *arg)
+{
+	switch (action) {
+	case MEM_GOING_OFFLINE:
+		/*
+		 * Make sure there are not transient states where
+		 * an offline node is a migration target.  This
+		 * will leave migration disabled until the offline
+		 * completes and the MEM_OFFLINE case below runs.
+		 */
+		disable_all_migrate_targets();
+		break;
+	case MEM_OFFLINE:
+	case MEM_ONLINE:
+		/*
+		 * Recalculate the target nodes once the node
+		 * reaches its final state (online or offline).
+		 */
+		__set_migration_target_nodes();
+		break;
+	case MEM_CANCEL_OFFLINE:
+		/*
+		 * MEM_GOING_OFFLINE disabled all the migration
+		 * targets.  Reenable them.
+		 */
+		__set_migration_target_nodes();
+		break;
+	case MEM_GOING_ONLINE:
+	case MEM_CANCEL_ONLINE:
+		break;
+	}
+
+	return notifier_from_errno(0);
+}
+
+static int __init migrate_on_reclaim_init(void)
+{
+	int ret;
+
+	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "migrate on reclaim",
+				migration_online_cpu,
+				migration_offline_cpu);
+	/*
+	 * In the unlikely case that this fails, the automatic
+	 * migration targets may become suboptimal for nodes
+	 * where N_CPU changes.  With such a small impact in a
+	 * rare case, do not bother trying to do anything special.
+	 */
+	WARN_ON(ret < 0);
+
+	hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
+	return 0;
+}
+late_initcall(migrate_on_reclaim_init);
+#endif /* CONFIG_MEMORY_HOTPLUG */