diff mbox series

[v2] mm: Only re-generate demotion targets when a numa node changes its N_CPU state

Message ID 20220310120749.23077-1-osalvador@suse.de (mailing list archive)
State New
Headers show
Series [v2] mm: Only re-generate demotion targets when a numa node changes its N_CPU state | expand

Commit Message

Oscar Salvador March 10, 2022, 12:07 p.m. UTC
Abhishek reported that after patch [1], hotplug operations are
taking ~double the expected time. [2]

The reason behind is that the CPU callbacks that migrate_on_reclaim_init()
sets always call set_migration_target_nodes() whenever a CPU is brought
up/down.
But we only care about numa nodes going from having cpus to become
cpuless, and vice versa, as that influences the demotion_target order.

We do already have two CPU callbacks (vmstat_cpu_online() and vmstat_cpu_dead())
that check exactly that, so get rid of the CPU callbacks in
migrate_on_reclaim_init() and only call set_migration_target_nodes() from
vmstat_cpu_{dead,online}() whenever a numa node change its N_CPU state.

[1] https://lore.kernel.org/linux-mm/20210721063926.3024591-2-ying.huang@intel.com/
[2] https://lore.kernel.org/linux-mm/eb438ddd-2919-73d4-bd9f-b7eecdd9577a@linux.vnet.ibm.com/

Reported-by: Abhishek Goel <huntbag@linux.vnet.ibm.com>
Signed-off-by: Oscar Salvador <osalvador@suse.de>
---
v1 -> v2: - Fix prototype function declaration
          - Fix build error on RISC
          - Adressed feedback from Baolin and Huang
---
 include/linux/migrate.h |  8 ++++++++
 mm/migrate.c            | 41 +++++------------------------------------
 mm/vmstat.c             | 13 ++++++++++++-
 3 files changed, 25 insertions(+), 37 deletions(-)

Comments

Baolin Wang March 11, 2022, 1:33 a.m. UTC | #1
On 3/10/2022 8:07 PM, Oscar Salvador wrote:
> Abhishek reported that after patch [1], hotplug operations are
> taking ~double the expected time. [2]
> 
> The reason behind is that the CPU callbacks that migrate_on_reclaim_init()
> sets always call set_migration_target_nodes() whenever a CPU is brought
> up/down.
> But we only care about numa nodes going from having cpus to become
> cpuless, and vice versa, as that influences the demotion_target order.
> 
> We do already have two CPU callbacks (vmstat_cpu_online() and vmstat_cpu_dead())
> that check exactly that, so get rid of the CPU callbacks in
> migrate_on_reclaim_init() and only call set_migration_target_nodes() from
> vmstat_cpu_{dead,online}() whenever a numa node change its N_CPU state.
> 
> [1] https://lore.kernel.org/linux-mm/20210721063926.3024591-2-ying.huang@intel.com/
> [2] https://lore.kernel.org/linux-mm/eb438ddd-2919-73d4-bd9f-b7eecdd9577a@linux.vnet.ibm.com/
> 
> Reported-by: Abhishek Goel <huntbag@linux.vnet.ibm.com>
> Signed-off-by: Oscar Salvador <osalvador@suse.de>
> ---
> v1 -> v2: - Fix prototype function declaration
>            - Fix build error on RISC
>            - Adressed feedback from Baolin and Huang
> ---
>   include/linux/migrate.h |  8 ++++++++
>   mm/migrate.c            | 41 +++++------------------------------------
>   mm/vmstat.c             | 13 ++++++++++++-
>   3 files changed, 25 insertions(+), 37 deletions(-)
> 
> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> index db96e10eb8da..90e75d5a54d6 100644
> --- a/include/linux/migrate.h
> +++ b/include/linux/migrate.h
> @@ -48,7 +48,15 @@ int folio_migrate_mapping(struct address_space *mapping,
>   		struct folio *newfolio, struct folio *folio, int extra_count);
>   
>   extern bool numa_demotion_enabled;
> +extern void migrate_on_reclaim_init(void);
> +#ifdef CONFIG_HOTPLUG_CPU
> +extern void set_migration_target_nodes(void);
>   #else
> +static inline void set_migration_target_nodes(void) {}
> +#endif
> +#else
> +
> +static inline void set_migration_target_nodes(void) {}
>   
>   static inline void putback_movable_pages(struct list_head *l) {}
>   static inline int migrate_pages(struct list_head *l, new_page_t new,
> diff --git a/mm/migrate.c b/mm/migrate.c
> index c7da064b4781..f9d5b6092a42 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -3190,7 +3190,7 @@ static void __set_migration_target_nodes(void)
>   /*
>    * For callers that do not hold get_online_mems() already.
>    */
> -static void set_migration_target_nodes(void)
> +void set_migration_target_nodes(void)
>   {
>   	get_online_mems();
>   	__set_migration_target_nodes();
> @@ -3254,51 +3254,20 @@ static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
>   	return notifier_from_errno(0);
>   }
>   
> -/*
> - * React to hotplug events that might affect the migration targets
> - * like events that online or offline NUMA nodes.
> - *
> - * The ordering is also currently dependent on which nodes have
> - * CPUs.  That means we need CPU on/offline notification too.
> - */
> -static int migration_online_cpu(unsigned int cpu)
> +void __init migrate_on_reclaim_init(void)
>   {
> -	set_migration_target_nodes();
> -	return 0;
> -}
> -
> -static int migration_offline_cpu(unsigned int cpu)
> -{
> -	set_migration_target_nodes();
> -	return 0;
> -}
> -
> -static int __init migrate_on_reclaim_init(void)
> -{
> -	int ret;
> -
>   	node_demotion = kmalloc_array(nr_node_ids,
>   				      sizeof(struct demotion_nodes),
>   				      GFP_KERNEL);
>   	WARN_ON(!node_demotion);
>   
> -	ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",
> -					NULL, migration_offline_cpu);
>   	/*
> -	 * In the unlikely case that this fails, the automatic
> -	 * migration targets may become suboptimal for nodes
> -	 * where N_CPU changes.  With such a small impact in a
> -	 * rare case, do not bother trying to do anything special.
> +	 * At this point, all numa nodes with memory/CPus have their state
> +	 * properly set, so we can build the demotion order now.
>   	 */
> -	WARN_ON(ret < 0);
> -	ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE, "mm/demotion:online",
> -				migration_online_cpu, NULL);
> -	WARN_ON(ret < 0);
> -
> +	set_migration_target_nodes();
>   	hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
> -	return 0;
>   }
> -late_initcall(migrate_on_reclaim_init);
>   #endif /* CONFIG_HOTPLUG_CPU */
>   
>   bool numa_demotion_enabled = false;
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index 4057372745d0..9e9536df51b5 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -28,6 +28,7 @@
>   #include <linux/mm_inline.h>
>   #include <linux/page_ext.h>
>   #include <linux/page_owner.h>
> +#include <linux/migrate.h>
>   
>   #include "internal.h"
>   
> @@ -2043,7 +2044,12 @@ static void __init init_cpu_node_state(void)
>   static int vmstat_cpu_online(unsigned int cpu)
>   {
>   	refresh_zone_stat_thresholds();
> -	node_set_state(cpu_to_node(cpu), N_CPU);
> +
> +	if (!node_state(cpu_to_node(cpu), N_CPU)) {
> +		node_set_state(cpu_to_node(cpu), N_CPU);
> +		set_migration_target_nodes();
> +	}
> +
>   	return 0;
>   }
>   
> @@ -2066,6 +2072,8 @@ static int vmstat_cpu_dead(unsigned int cpu)
>   		return 0;
>   
>   	node_clear_state(node, N_CPU);
> +	set_migration_target_nodes();
> +
>   	return 0;
>   }
>   
> @@ -2097,6 +2105,9 @@ void __init init_mm_internals(void)
>   
>   	start_shepherd_timer();
>   #endif
> +#if defined(CONFIG_MIGRATION) && defined(CONFIG_HOTPLUG_CPU)
 > +	migrate_on_reclaim_init();

I guess can we move these CONFIG_* macros into migrate.h like what 
set_migration_target_nodes() does? Otherwise LGTM. Please feel free to add:
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Huang, Ying March 11, 2022, 2:24 a.m. UTC | #2
Oscar Salvador <osalvador@suse.de> writes:

> Abhishek reported that after patch [1], hotplug operations are
> taking ~double the expected time. [2]
>
> The reason behind is that the CPU callbacks that migrate_on_reclaim_init()
> sets always call set_migration_target_nodes() whenever a CPU is brought
> up/down.
> But we only care about numa nodes going from having cpus to become
> cpuless, and vice versa, as that influences the demotion_target order.
>
> We do already have two CPU callbacks (vmstat_cpu_online() and vmstat_cpu_dead())
> that check exactly that, so get rid of the CPU callbacks in
> migrate_on_reclaim_init() and only call set_migration_target_nodes() from
> vmstat_cpu_{dead,online}() whenever a numa node change its N_CPU state.
>
> [1] https://lore.kernel.org/linux-mm/20210721063926.3024591-2-ying.huang@intel.com/
> [2] https://lore.kernel.org/linux-mm/eb438ddd-2919-73d4-bd9f-b7eecdd9577a@linux.vnet.ibm.com/
>
> Reported-by: Abhishek Goel <huntbag@linux.vnet.ibm.com>
> Signed-off-by: Oscar Salvador <osalvador@suse.de>
> ---
> v1 -> v2: - Fix prototype function declaration
>           - Fix build error on RISC
>           - Adressed feedback from Baolin and Huang
> ---
>  include/linux/migrate.h |  8 ++++++++
>  mm/migrate.c            | 41 +++++------------------------------------
>  mm/vmstat.c             | 13 ++++++++++++-
>  3 files changed, 25 insertions(+), 37 deletions(-)
>
> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> index db96e10eb8da..90e75d5a54d6 100644
> --- a/include/linux/migrate.h
> +++ b/include/linux/migrate.h
> @@ -48,7 +48,15 @@ int folio_migrate_mapping(struct address_space *mapping,
>  		struct folio *newfolio, struct folio *folio, int extra_count);
>  
>  extern bool numa_demotion_enabled;
> +extern void migrate_on_reclaim_init(void);
> +#ifdef CONFIG_HOTPLUG_CPU
> +extern void set_migration_target_nodes(void);
>  #else
> +static inline void set_migration_target_nodes(void) {}
> +#endif
> +#else
> +
> +static inline void set_migration_target_nodes(void) {}
>  
>  static inline void putback_movable_pages(struct list_head *l) {}
>  static inline int migrate_pages(struct list_head *l, new_page_t new,
> diff --git a/mm/migrate.c b/mm/migrate.c
> index c7da064b4781..f9d5b6092a42 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -3190,7 +3190,7 @@ static void __set_migration_target_nodes(void)
>  /*
>   * For callers that do not hold get_online_mems() already.
>   */
> -static void set_migration_target_nodes(void)
> +void set_migration_target_nodes(void)
>  {
>  	get_online_mems();
>  	__set_migration_target_nodes();
> @@ -3254,51 +3254,20 @@ static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
>  	return notifier_from_errno(0);
>  }
>  
> -/*
> - * React to hotplug events that might affect the migration targets
> - * like events that online or offline NUMA nodes.
> - *
> - * The ordering is also currently dependent on which nodes have
> - * CPUs.  That means we need CPU on/offline notification too.
> - */
> -static int migration_online_cpu(unsigned int cpu)
> +void __init migrate_on_reclaim_init(void)
>  {
> -	set_migration_target_nodes();
> -	return 0;
> -}
> -
> -static int migration_offline_cpu(unsigned int cpu)
> -{
> -	set_migration_target_nodes();
> -	return 0;
> -}
> -
> -static int __init migrate_on_reclaim_init(void)
> -{
> -	int ret;
> -
>  	node_demotion = kmalloc_array(nr_node_ids,
>  				      sizeof(struct demotion_nodes),
>  				      GFP_KERNEL);
>  	WARN_ON(!node_demotion);
>  
> -	ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",
> -					NULL, migration_offline_cpu);
>  	/*
> -	 * In the unlikely case that this fails, the automatic
> -	 * migration targets may become suboptimal for nodes
> -	 * where N_CPU changes.  With such a small impact in a
> -	 * rare case, do not bother trying to do anything special.
> +	 * At this point, all numa nodes with memory/CPus have their state
> +	 * properly set, so we can build the demotion order now.
>  	 */
> -	WARN_ON(ret < 0);
> -	ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE, "mm/demotion:online",
> -				migration_online_cpu, NULL);
> -	WARN_ON(ret < 0);
> -
> +	set_migration_target_nodes();
>  	hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
> -	return 0;
>  }
> -late_initcall(migrate_on_reclaim_init);
>  #endif /* CONFIG_HOTPLUG_CPU */
>  
>  bool numa_demotion_enabled = false;
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index 4057372745d0..9e9536df51b5 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -28,6 +28,7 @@
>  #include <linux/mm_inline.h>
>  #include <linux/page_ext.h>
>  #include <linux/page_owner.h>
> +#include <linux/migrate.h>
>  
>  #include "internal.h"
>  
> @@ -2043,7 +2044,12 @@ static void __init init_cpu_node_state(void)
>  static int vmstat_cpu_online(unsigned int cpu)
>  {
>  	refresh_zone_stat_thresholds();
> -	node_set_state(cpu_to_node(cpu), N_CPU);
> +
> +	if (!node_state(cpu_to_node(cpu), N_CPU)) {
> +		node_set_state(cpu_to_node(cpu), N_CPU);
> +		set_migration_target_nodes();
> +	}
> +
>  	return 0;
>  }
>  
> @@ -2066,6 +2072,8 @@ static int vmstat_cpu_dead(unsigned int cpu)
>  		return 0;
>  
>  	node_clear_state(node, N_CPU);
> +	set_migration_target_nodes();
> +
>  	return 0;
>  }
>  
> @@ -2097,6 +2105,9 @@ void __init init_mm_internals(void)
>  
>  	start_shepherd_timer();
>  #endif
> +#if defined(CONFIG_MIGRATION) && defined(CONFIG_HOTPLUG_CPU)
> +	migrate_on_reclaim_init();
> +#endif

It may be unnecessary to be fixed in this patch.  But I think we need to
cleanup the kernel config dependencies of the demotion code at some time.

1. Even if !defined(CONFIG_HOTPLUG_CPU) &&
   !defined(CONFIG_MEMORY_HOTPLUG), we still need to allocate
   "node_demotion" and call set_migration_target_nodes() during boot time.

2. If !defined(CONFIG_MEMORY_HOTPLUG), we don't need
   migrate_on_reclaim_callback().

3. We need defined(CONFIG_NUMA) && defined(CONFIG_MIGRATION) for all
   these code.

Best Regards,
Huang, Ying

>  #ifdef CONFIG_PROC_FS
>  	proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
>  	proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
Andrew Morton March 11, 2022, 2:39 a.m. UTC | #3
On Thu, 10 Mar 2022 13:07:49 +0100 Oscar Salvador <osalvador@suse.de> wrote:

> Abhishek reported that after patch [1], hotplug operations are
> taking ~double the expected time. [2]
> 
> The reason behind is that the CPU callbacks that migrate_on_reclaim_init()
> sets always call set_migration_target_nodes() whenever a CPU is brought
> up/down.
> But we only care about numa nodes going from having cpus to become
> cpuless, and vice versa, as that influences the demotion_target order.
> 
> We do already have two CPU callbacks (vmstat_cpu_online() and vmstat_cpu_dead())
> that check exactly that, so get rid of the CPU callbacks in
> migrate_on_reclaim_init() and only call set_migration_target_nodes() from
> vmstat_cpu_{dead,online}() whenever a numa node change its N_CPU state.

What I'm not getting here (as so often happens) is a sense of how badly
this affects our users.  Does anyone actually hotplug frequently enough
to care?

If "yes" then I'm inclined to merge this up for 5.18 with a cc:stable. 
Not for 5.17 because it's late and things are looking rather creaky
already.

And I'll add a

Fixes: 884a6e5d1f93b ("mm/migrate: update node demotion order on hotplug events")

which is that patch's fourth such bouquet.
Huang, Ying March 11, 2022, 5:06 a.m. UTC | #4
Oscar Salvador <osalvador@suse.de> writes:

> Abhishek reported that after patch [1], hotplug operations are
> taking ~double the expected time. [2]
>
> The reason behind is that the CPU callbacks that migrate_on_reclaim_init()
> sets always call set_migration_target_nodes() whenever a CPU is brought
> up/down.
> But we only care about numa nodes going from having cpus to become
> cpuless, and vice versa, as that influences the demotion_target order.
>
> We do already have two CPU callbacks (vmstat_cpu_online() and vmstat_cpu_dead())
> that check exactly that, so get rid of the CPU callbacks in
> migrate_on_reclaim_init() and only call set_migration_target_nodes() from
> vmstat_cpu_{dead,online}() whenever a numa node change its N_CPU state.
>
> [1] https://lore.kernel.org/linux-mm/20210721063926.3024591-2-ying.huang@intel.com/
> [2] https://lore.kernel.org/linux-mm/eb438ddd-2919-73d4-bd9f-b7eecdd9577a@linux.vnet.ibm.com/
>
> Reported-by: Abhishek Goel <huntbag@linux.vnet.ibm.com>
> Signed-off-by: Oscar Salvador <osalvador@suse.de>
> ---
> v1 -> v2: - Fix prototype function declaration
>           - Fix build error on RISC
>           - Adressed feedback from Baolin and Huang
> ---
>  include/linux/migrate.h |  8 ++++++++
>  mm/migrate.c            | 41 +++++------------------------------------
>  mm/vmstat.c             | 13 ++++++++++++-
>  3 files changed, 25 insertions(+), 37 deletions(-)
>
> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> index db96e10eb8da..90e75d5a54d6 100644
> --- a/include/linux/migrate.h
> +++ b/include/linux/migrate.h
> @@ -48,7 +48,15 @@ int folio_migrate_mapping(struct address_space *mapping,
>  		struct folio *newfolio, struct folio *folio, int extra_count);
>  
>  extern bool numa_demotion_enabled;
> +extern void migrate_on_reclaim_init(void);
> +#ifdef CONFIG_HOTPLUG_CPU
> +extern void set_migration_target_nodes(void);
>  #else
> +static inline void set_migration_target_nodes(void) {}
> +#endif
> +#else
> +
> +static inline void set_migration_target_nodes(void) {}
>  
>  static inline void putback_movable_pages(struct list_head *l) {}
>  static inline int migrate_pages(struct list_head *l, new_page_t new,
> diff --git a/mm/migrate.c b/mm/migrate.c
> index c7da064b4781..f9d5b6092a42 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -3190,7 +3190,7 @@ static void __set_migration_target_nodes(void)
>  /*
>   * For callers that do not hold get_online_mems() already.
>   */
> -static void set_migration_target_nodes(void)
> +void set_migration_target_nodes(void)
>  {
>  	get_online_mems();
>  	__set_migration_target_nodes();
> @@ -3254,51 +3254,20 @@ static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
>  	return notifier_from_errno(0);
>  }
>  
> -/*
> - * React to hotplug events that might affect the migration targets
> - * like events that online or offline NUMA nodes.
> - *
> - * The ordering is also currently dependent on which nodes have
> - * CPUs.  That means we need CPU on/offline notification too.
> - */
> -static int migration_online_cpu(unsigned int cpu)
> +void __init migrate_on_reclaim_init(void)
>  {
> -	set_migration_target_nodes();
> -	return 0;
> -}
> -
> -static int migration_offline_cpu(unsigned int cpu)
> -{
> -	set_migration_target_nodes();
> -	return 0;
> -}
> -
> -static int __init migrate_on_reclaim_init(void)
> -{
> -	int ret;
> -
>  	node_demotion = kmalloc_array(nr_node_ids,
>  				      sizeof(struct demotion_nodes),
>  				      GFP_KERNEL);
>  	WARN_ON(!node_demotion);
>  
> -	ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",
> -					NULL, migration_offline_cpu);
>  	/*
> -	 * In the unlikely case that this fails, the automatic
> -	 * migration targets may become suboptimal for nodes
> -	 * where N_CPU changes.  With such a small impact in a
> -	 * rare case, do not bother trying to do anything special.
> +	 * At this point, all numa nodes with memory/CPus have their state
> +	 * properly set, so we can build the demotion order now.
>  	 */
> -	WARN_ON(ret < 0);
> -	ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE, "mm/demotion:online",
> -				migration_online_cpu, NULL);
> -	WARN_ON(ret < 0);
> -
> +	set_migration_target_nodes();

If my understanding were correct, we should enclose
set_migration_target_nodes() here with cpus_read_lock().  And add some
comment before set_migration_target_nodes() for this.  I don't know
whether the locking order is right.

>  	hotplug_memory_notifier(migrate_on_reclaim_callback, 100);

And we should register the notifier before calling set_migration_target_nodes()?

Best Regards,
Huang, Ying

> -	return 0;
>  }
> -late_initcall(migrate_on_reclaim_init);
>  #endif /* CONFIG_HOTPLUG_CPU */
>  

[snip]
Oscar Salvador March 11, 2022, 8:39 a.m. UTC | #5
On Fri, Mar 11, 2022 at 10:24:07AM +0800, Huang, Ying wrote:
> It may be unnecessary to be fixed in this patch.  But I think we need to
> cleanup the kernel config dependencies of the demotion code at some time.

I am glad you brought this up because it is something I have been
thinking about.
I already added it in my to-do list, but I would do it in a separate
patch if you do not mind.

Now, let us try to untangle this mess:

> 1. Even if !defined(CONFIG_HOTPLUG_CPU) &&
>    !defined(CONFIG_MEMORY_HOTPLUG), we still need to allocate
>    "node_demotion" and call set_migration_target_nodes() during boot time.
> 
> 2. If !defined(CONFIG_MEMORY_HOTPLUG), we don't need
>    migrate_on_reclaim_callback().
> 
> 3. We need defined(CONFIG_NUMA) && defined(CONFIG_MIGRATION) for all
>    these code.

Back in the early versions [1] I asked whether we could have some
scenario where this feature could be used when !CONFIG_MEMORY_HOTPLUG
[2].
The reason I was given is that in order to bind the expose PMEM memory
as RAM (add_memory_driver_managed()), we need MEMORY_HOTPLUG.

Now, as I said back then, I am not sure whether PMEM memory can be
exposed as RAM by other means, but as it was pointed out back then,
it really looks like we, at least, need CONFIG_MEMORY_HOTPLUG.

Ok, so we have our first dependency: CONFIG_MEMORY_HOTPLUG.

Now, about CONFIG_HOTPLUG_CPU, it seems that that is not a strong dependency,
as we do not need cpu-hotplug in order to use the feature.

We definitely need CONFIG_MIGRATION and CONFIG_NUMA though.

So, we have something like:

- Depends:
  * CONFIG_NUMA           (obvius)
  * CONFIG_MIGRATION      (to migrate between nodes)
  * CONFIG_MEMORY_HOTPLUG (to expose PMEM as RAM)

Sounds about right?

[1] https://patchwork.kernel.org/project/linux-mm/patch/20210401183221.977831DE@viggo.jf.intel.com/#24099405
[2] https://patchwork.kernel.org/project/linux-mm/patch/20210401183221.977831DE@viggo.jf.intel.com/#24103467
Oscar Salvador March 11, 2022, 9:17 a.m. UTC | #6
On Fri, Mar 11, 2022 at 01:06:26PM +0800, Huang, Ying wrote:
> Oscar Salvador <osalvador@suse.de> writes:
> > -static int __init migrate_on_reclaim_init(void)
> > -{
> > -	int ret;
> > -
> >  	node_demotion = kmalloc_array(nr_node_ids,
> >  				      sizeof(struct demotion_nodes),
> >  				      GFP_KERNEL);
> >  	WARN_ON(!node_demotion);
> >  
> > -	ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",
> > -					NULL, migration_offline_cpu);
> >  	/*
> > -	 * In the unlikely case that this fails, the automatic
> > -	 * migration targets may become suboptimal for nodes
> > -	 * where N_CPU changes.  With such a small impact in a
> > -	 * rare case, do not bother trying to do anything special.
> > +	 * At this point, all numa nodes with memory/CPus have their state
> > +	 * properly set, so we can build the demotion order now.
> >  	 */
> > -	WARN_ON(ret < 0);
> > -	ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE, "mm/demotion:online",
> > -				migration_online_cpu, NULL);
> > -	WARN_ON(ret < 0);
> > -
> > +	set_migration_target_nodes();
> 
> If my understanding were correct, we should enclose
> set_migration_target_nodes() here with cpus_read_lock().  And add some
> comment before set_migration_target_nodes() for this.  I don't know
> whether the locking order is right.

Oh, I see that cpuhp_setup_state() holds the cpu-hotplug lock while
calling in, so yeah, we might want to hold in there.

The thing is, not long ago we found out that we could have ACPI events
like memory-hotplug operations at boot stage [1], so I guess it is
safe to assume we could also have cpu-hotplug operations at that stage
as well, and so we want to hold cpus_read_lock() just to be on the safe
side.

But, unless I am missing something, that does not apply to
set_migration_target_nodes() being called from a callback,
as the callback (somewhere up the chain) already holds that lock.
e.g: (_cpu_up takes cpus_write_lock()) and the same for the down
operation.

So, to sum it up, we only need the cpus_read_lock() in
migrate_on_reclaim_init().

> >  	hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
> 
> And we should register the notifier before calling set_migration_target_nodes()?

I cannot made my mind here.
The primary reason I placed the call before registering the notifier is
because the original code called set_migration_target_nodes() before
doing so:

<--
ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE, "mm/demotion:online",
			migration_online_cpu, NULL);
WARN_ON(ret < 0);

hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
-->

I thought about following the same line. Why do you think it should be
called afterwards?

I am not really sure whether it has a different impact depending on the
order.
Note that memory-hotplug acpi events can happen at boot time, so by the
time we register the memory_hotplug notifier, we can have some hotplug
memory coming in, and so we call set_migration_target_nodes().

But that is fine, and I cannot see a difference shufling the order
of them. 
Do you see a problem in there?

[1] https://patchwork.kernel.org/project/linux-mm/patch/20200915094143.79181-3-ldufour@linux.ibm.com/
Oscar Salvador March 11, 2022, 9:23 a.m. UTC | #7
On Thu, Mar 10, 2022 at 06:39:51PM -0800, Andrew Morton wrote:
> What I'm not getting here (as so often happens) is a sense of how badly
> this affects our users.  Does anyone actually hotplug frequently enough
> to care?

Well, I would not say it is critical, it just regresses the time
hotplug operations take. How bad is that? I guess it depends.
Memory hotplug operations are already slow per se, so I would say
expectations are not that high.

But it speeds up the process, that is for sure.

e.g: In a system with 144 CPUs and 2 Numa-nodes,
set_migration_target_nodes()
gets called exactly 144 times at boot time(one per every time a CPU is
brought up), where only 2 calls would suffice, so you can get an idea.

> If "yes" then I'm inclined to merge this up for 5.18 with a cc:stable. 
> Not for 5.17 because it's late and things are looking rather creaky
> already.
> 
> And I'll add a
> 
> Fixes: 884a6e5d1f93b ("mm/migrate: update node demotion order on hotplug events")
> 
> which is that patch's fourth such bouquet.

Thanks Andrew!
Dave Hansen March 11, 2022, 5:10 p.m. UTC | #8
On 3/10/22 18:39, Andrew Morton wrote:
> On Thu, 10 Mar 2022 13:07:49 +0100 Oscar Salvador <osalvador@suse.de> wrote:
>> We do already have two CPU callbacks (vmstat_cpu_online() and vmstat_cpu_dead())
>> that check exactly that, so get rid of the CPU callbacks in
>> migrate_on_reclaim_init() and only call set_migration_target_nodes() from
>> vmstat_cpu_{dead,online}() whenever a numa node change its N_CPU state.
> What I'm not getting here (as so often happens) is a sense of how badly
> this affects our users.  Does anyone actually hotplug frequently enough
> to care?

I asked Abhishek about this a bit here:

> https://lore.kernel.org/all/4e8067e1-0574-c9d2-9d6c-d676d32071bd@linux.vnet.ibm.com/

It sounded to me like there are ppc users who convert their systems from
SMT=1 to SMT=8.  I'd guess that they want to do this as a side-channel
mitigation because ppc has been dealing with the same basic issues as
those of us over in x86 land.  The increase in time (20s->36s) would be
noticeable and probably slightly annoying to a human waiting on it.

I'd love to hear more details on this from Abhishek, like whether end
users do this as opposed to IBM's kernel developers.  But, it does sound
deserving of a stable@ tag to me.
Huang, Ying March 14, 2022, 1:03 a.m. UTC | #9
Oscar Salvador <osalvador@suse.de> writes:

> On Fri, Mar 11, 2022 at 10:24:07AM +0800, Huang, Ying wrote:
>> It may be unnecessary to be fixed in this patch.  But I think we need to
>> cleanup the kernel config dependencies of the demotion code at some time.
>
> I am glad you brought this up because it is something I have been
> thinking about.
> I already added it in my to-do list, but I would do it in a separate
> patch if you do not mind.

Thanks!

> Now, let us try to untangle this mess:
>
>> 1. Even if !defined(CONFIG_HOTPLUG_CPU) &&
>>    !defined(CONFIG_MEMORY_HOTPLUG), we still need to allocate
>>    "node_demotion" and call set_migration_target_nodes() during boot time.
>> 
>> 2. If !defined(CONFIG_MEMORY_HOTPLUG), we don't need
>>    migrate_on_reclaim_callback().
>> 
>> 3. We need defined(CONFIG_NUMA) && defined(CONFIG_MIGRATION) for all
>>    these code.
>
> Back in the early versions [1] I asked whether we could have some
> scenario where this feature could be used when !CONFIG_MEMORY_HOTPLUG
> [2].
> The reason I was given is that in order to bind the expose PMEM memory
> as RAM (add_memory_driver_managed()), we need MEMORY_HOTPLUG.
>
> Now, as I said back then, I am not sure whether PMEM memory can be
> exposed as RAM by other means, but as it was pointed out back then,
> it really looks like we, at least, need CONFIG_MEMORY_HOTPLUG.
>
> Ok, so we have our first dependency: CONFIG_MEMORY_HOTPLUG.

On host machine, PMEM is always exposed via memory hotplug.  But later
on, we found that for guest system it's possible for PMEM to be exposed
as normal memory.

Best Regards,
Huang, Ying

> Now, about CONFIG_HOTPLUG_CPU, it seems that that is not a strong dependency,
> as we do not need cpu-hotplug in order to use the feature.
>
> We definitely need CONFIG_MIGRATION and CONFIG_NUMA though.
>
> So, we have something like:
>
> - Depends:
>   * CONFIG_NUMA           (obvius)
>   * CONFIG_MIGRATION      (to migrate between nodes)
>   * CONFIG_MEMORY_HOTPLUG (to expose PMEM as RAM)
>
> Sounds about right?
>
> [1] https://patchwork.kernel.org/project/linux-mm/patch/20210401183221.977831DE@viggo.jf.intel.com/#24099405
> [2] https://patchwork.kernel.org/project/linux-mm/patch/20210401183221.977831DE@viggo.jf.intel.com/#24103467
Huang, Ying March 14, 2022, 3:09 a.m. UTC | #10
Oscar Salvador <osalvador@suse.de> writes:

> On Fri, Mar 11, 2022 at 01:06:26PM +0800, Huang, Ying wrote:
>> Oscar Salvador <osalvador@suse.de> writes:
>> > -static int __init migrate_on_reclaim_init(void)
>> > -{
>> > -	int ret;
>> > -
>> >  	node_demotion = kmalloc_array(nr_node_ids,
>> >  				      sizeof(struct demotion_nodes),
>> >  				      GFP_KERNEL);
>> >  	WARN_ON(!node_demotion);
>> >  
>> > -	ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",
>> > -					NULL, migration_offline_cpu);
>> >  	/*
>> > -	 * In the unlikely case that this fails, the automatic
>> > -	 * migration targets may become suboptimal for nodes
>> > -	 * where N_CPU changes.  With such a small impact in a
>> > -	 * rare case, do not bother trying to do anything special.
>> > +	 * At this point, all numa nodes with memory/CPus have their state
>> > +	 * properly set, so we can build the demotion order now.
>> >  	 */
>> > -	WARN_ON(ret < 0);
>> > -	ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE, "mm/demotion:online",
>> > -				migration_online_cpu, NULL);
>> > -	WARN_ON(ret < 0);
>> > -
>> > +	set_migration_target_nodes();
>> 
>> If my understanding were correct, we should enclose
>> set_migration_target_nodes() here with cpus_read_lock().  And add some
>> comment before set_migration_target_nodes() for this.  I don't know
>> whether the locking order is right.
>
> Oh, I see that cpuhp_setup_state() holds the cpu-hotplug lock while
> calling in, so yeah, we might want to hold in there.
>
> The thing is, not long ago we found out that we could have ACPI events
> like memory-hotplug operations at boot stage [1], so I guess it is
> safe to assume we could also have cpu-hotplug operations at that stage
> as well, and so we want to hold cpus_read_lock() just to be on the safe
> side.
>
> But, unless I am missing something, that does not apply to
> set_migration_target_nodes() being called from a callback,
> as the callback (somewhere up the chain) already holds that lock.
> e.g: (_cpu_up takes cpus_write_lock()) and the same for the down
> operation.
>
> So, to sum it up, we only need the cpus_read_lock() in
> migrate_on_reclaim_init().

Yes.  That is what I want to say.  Sorry for confusing.

>> >  	hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
>> 
>> And we should register the notifier before calling set_migration_target_nodes()?
>
> I cannot made my mind here.
> The primary reason I placed the call before registering the notifier is
> because the original code called set_migration_target_nodes() before
> doing so:
>
> <--
> ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE, "mm/demotion:online",
> 			migration_online_cpu, NULL);
> WARN_ON(ret < 0);
>
> hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
> -->
>
> I thought about following the same line. Why do you think it should be
> called afterwards?
>
> I am not really sure whether it has a different impact depending on the
> order.
> Note that memory-hotplug acpi events can happen at boot time, so by the
> time we register the memory_hotplug notifier, we can have some hotplug
> memory coming in, and so we call set_migration_target_nodes().
>
> But that is fine, and I cannot see a difference shufling the order
> of them. 
> Do you see a problem in there?

Per my understanding, the race condition as follows may be possible in
theory,

CPU1                                CPU2
----                                ----
set_migration_target_nodes()
                                <-- // a new node is hotplugged, and missed
hotplug_memory_notifier()

During boot, this may be impossible in practice.  But I still think it's
good to make the order correct in general.  And it's not hard to do that.

Best Regards,
Huang, Ying

> [1] https://patchwork.kernel.org/project/linux-mm/patch/20200915094143.79181-3-ldufour@linux.ibm.com/
Abhishek Goel March 14, 2022, 9:09 a.m. UTC | #11
On 11/03/22 22:40, Dave Hansen wrote:
> On 3/10/22 18:39, Andrew Morton wrote:
>> On Thu, 10 Mar 2022 13:07:49 +0100 Oscar Salvador <osalvador@suse.de> wrote:
>>> We do already have two CPU callbacks (vmstat_cpu_online() and vmstat_cpu_dead())
>>> that check exactly that, so get rid of the CPU callbacks in
>>> migrate_on_reclaim_init() and only call set_migration_target_nodes() from
>>> vmstat_cpu_{dead,online}() whenever a numa node change its N_CPU state.
>> What I'm not getting here (as so often happens) is a sense of how badly
>> this affects our users.  Does anyone actually hotplug frequently enough
>> to care?
> I asked Abhishek about this a bit here:
>
>> https://lore.kernel.org/all/4e8067e1-0574-c9d2-9d6c-d676d32071bd@linux.vnet.ibm.com/
> It sounded to me like there are ppc users who convert their systems from
> SMT=1 to SMT=8.  I'd guess that they want to do this as a side-channel
> mitigation because ppc has been dealing with the same basic issues as
> those of us over in x86 land.  The increase in time (20s->36s) would be
> noticeable and probably slightly annoying to a human waiting on it.
>
> I'd love to hear more details on this from Abhishek, like whether end
> users do this as opposed to IBM's kernel developers.  But, it does sound
> deserving of a stable@ tag to me.
Yes, end users also use this, especially on large systems, might want
to switch between SMT=1, SMT=4 and SMT=8.
And this is also usable for dynamic LPAR operations.
As Dave pointed out, this increase in time while manageable and just
noticeable on smaller systems, can be very clearly observed as the
systems become larger.
Oscar Salvador March 14, 2022, 3:13 p.m. UTC | #12
On Mon, Mar 14, 2022 at 09:03:59AM +0800, Huang, Ying wrote:
> Oscar Salvador <osalvador@suse.de> writes:
> On host machine, PMEM is always exposed via memory hotplug.  But later
> on, we found that for guest system it's possible for PMEM to be exposed
> as normal memory.

Could you please elaborate on that? How is it done? I would love to hear the
details.

Thanks
Dave Hansen March 14, 2022, 3:20 p.m. UTC | #13
On 3/14/22 08:13, Oscar Salvador wrote:
> On Mon, Mar 14, 2022 at 09:03:59AM +0800, Huang, Ying wrote:
>> Oscar Salvador <osalvador@suse.de> writes:
>> On host machine, PMEM is always exposed via memory hotplug.  But later
>> on, we found that for guest system it's possible for PMEM to be exposed
>> as normal memory.
> Could you please elaborate on that? How is it done? I would love to hear the
> details.

Qemu, for instance, has a "mem-path" argument.  It's typically used for
using hugetlbfs as guest memory.  But, there's nothing stopping you from
pointing it to a DAX device or a file on a DAX filesystem that's backed
by pmem.
Oscar Salvador March 15, 2022, 6:13 a.m. UTC | #14
On Mon, Mar 14, 2022 at 08:20:57AM -0700, Dave Hansen wrote:
> Qemu, for instance, has a "mem-path" argument.  It's typically used for
> using hugetlbfs as guest memory.  But, there's nothing stopping you from
> pointing it to a DAX device or a file on a DAX filesystem that's backed
> by pmem.

Thanks Dave.

But that is somehow different, is not it?
When you use pmem backed memory as a RAM for the guest, the guest is not
seeing that as PMEM, but just as a normal RAM, right?
IOW, the guest cannot use that memory for demotion, as we can use it in
the host when configured.

I might be missing something, I am using this qemu cmdline:

        $QEMU -enable-kvm -machine pc -smp 4 -cpu host -monitor pty -m 5G \
	-object memory-backend-file,id=pc.ram,size=5G,mem-path=/mnt/pmem,share=off -machine memory-backend=pc.ram \
	$IMAGE -boot c -vnc :0 

(/mnt/pmem was mounted with "mount -o dax /dev/pmem1 /mnt/pmem/")

My point is, if it is really true that the guest cannot use that memory for
demotion, then we would still need CONFIG_MEMORY_HOTPLUG, as that is the
only way to expose PMEM to any system to be used as a demotion option
(via add_memory_driver_managed() through kmem driver).

Or am I missing some qemu magic to use that memory as demotion in the
guest as well?
Huang, Ying March 15, 2022, 6:31 a.m. UTC | #15
Oscar Salvador <osalvador@suse.de> writes:

> On Mon, Mar 14, 2022 at 08:20:57AM -0700, Dave Hansen wrote:
>> Qemu, for instance, has a "mem-path" argument.  It's typically used for
>> using hugetlbfs as guest memory.  But, there's nothing stopping you from
>> pointing it to a DAX device or a file on a DAX filesystem that's backed
>> by pmem.
>
> Thanks Dave.
>
> But that is somehow different, is not it?
> When you use pmem backed memory as a RAM for the guest, the guest is not
> seeing that as PMEM, but just as a normal RAM, right?
> IOW, the guest cannot use that memory for demotion, as we can use it in
> the host when configured.
>
> I might be missing something, I am using this qemu cmdline:
>
>         $QEMU -enable-kvm -machine pc -smp 4 -cpu host -monitor pty -m 5G \
> 	-object memory-backend-file,id=pc.ram,size=5G,mem-path=/mnt/pmem,share=off -machine memory-backend=pc.ram \
> 	$IMAGE -boot c -vnc :0 
>
> (/mnt/pmem was mounted with "mount -o dax /dev/pmem1 /mnt/pmem/")
>
> My point is, if it is really true that the guest cannot use that memory for
> demotion, then we would still need CONFIG_MEMORY_HOTPLUG, as that is the
> only way to expose PMEM to any system to be used as a demotion option
> (via add_memory_driver_managed() through kmem driver).
>
> Or am I missing some qemu magic to use that memory as demotion in the
> guest as well?

You need to put PMEM to a NUMA node to use demotion, as follows,

qemu-system-x86_64 --enable-kvm \
-M pc,accel=kvm,nvdimm=on -smp 8 -m 160G,slots=18,maxmem=703G \
-object memory-backend-ram,id=mem0,size=32G \
-object memory-backend-file,id=mem1,share=on,mem-path=/dev/dax0.0,size=128G,align=2M \
-numa node,memdev=mem0,cpus=0-7,nodeid=0 \
-numa node,memdev=mem1,nodeid=1 \
$IMAGE

Best Regards,
Huang, Ying
diff mbox series

Patch

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index db96e10eb8da..90e75d5a54d6 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -48,7 +48,15 @@  int folio_migrate_mapping(struct address_space *mapping,
 		struct folio *newfolio, struct folio *folio, int extra_count);
 
 extern bool numa_demotion_enabled;
+extern void migrate_on_reclaim_init(void);
+#ifdef CONFIG_HOTPLUG_CPU
+extern void set_migration_target_nodes(void);
 #else
+static inline void set_migration_target_nodes(void) {}
+#endif
+#else
+
+static inline void set_migration_target_nodes(void) {}
 
 static inline void putback_movable_pages(struct list_head *l) {}
 static inline int migrate_pages(struct list_head *l, new_page_t new,
diff --git a/mm/migrate.c b/mm/migrate.c
index c7da064b4781..f9d5b6092a42 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -3190,7 +3190,7 @@  static void __set_migration_target_nodes(void)
 /*
  * For callers that do not hold get_online_mems() already.
  */
-static void set_migration_target_nodes(void)
+void set_migration_target_nodes(void)
 {
 	get_online_mems();
 	__set_migration_target_nodes();
@@ -3254,51 +3254,20 @@  static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
 	return notifier_from_errno(0);
 }
 
-/*
- * React to hotplug events that might affect the migration targets
- * like events that online or offline NUMA nodes.
- *
- * The ordering is also currently dependent on which nodes have
- * CPUs.  That means we need CPU on/offline notification too.
- */
-static int migration_online_cpu(unsigned int cpu)
+void __init migrate_on_reclaim_init(void)
 {
-	set_migration_target_nodes();
-	return 0;
-}
-
-static int migration_offline_cpu(unsigned int cpu)
-{
-	set_migration_target_nodes();
-	return 0;
-}
-
-static int __init migrate_on_reclaim_init(void)
-{
-	int ret;
-
 	node_demotion = kmalloc_array(nr_node_ids,
 				      sizeof(struct demotion_nodes),
 				      GFP_KERNEL);
 	WARN_ON(!node_demotion);
 
-	ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",
-					NULL, migration_offline_cpu);
 	/*
-	 * In the unlikely case that this fails, the automatic
-	 * migration targets may become suboptimal for nodes
-	 * where N_CPU changes.  With such a small impact in a
-	 * rare case, do not bother trying to do anything special.
+	 * At this point, all numa nodes with memory/CPus have their state
+	 * properly set, so we can build the demotion order now.
 	 */
-	WARN_ON(ret < 0);
-	ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE, "mm/demotion:online",
-				migration_online_cpu, NULL);
-	WARN_ON(ret < 0);
-
+	set_migration_target_nodes();
 	hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
-	return 0;
 }
-late_initcall(migrate_on_reclaim_init);
 #endif /* CONFIG_HOTPLUG_CPU */
 
 bool numa_demotion_enabled = false;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4057372745d0..9e9536df51b5 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -28,6 +28,7 @@ 
 #include <linux/mm_inline.h>
 #include <linux/page_ext.h>
 #include <linux/page_owner.h>
+#include <linux/migrate.h>
 
 #include "internal.h"
 
@@ -2043,7 +2044,12 @@  static void __init init_cpu_node_state(void)
 static int vmstat_cpu_online(unsigned int cpu)
 {
 	refresh_zone_stat_thresholds();
-	node_set_state(cpu_to_node(cpu), N_CPU);
+
+	if (!node_state(cpu_to_node(cpu), N_CPU)) {
+		node_set_state(cpu_to_node(cpu), N_CPU);
+		set_migration_target_nodes();
+	}
+
 	return 0;
 }
 
@@ -2066,6 +2072,8 @@  static int vmstat_cpu_dead(unsigned int cpu)
 		return 0;
 
 	node_clear_state(node, N_CPU);
+	set_migration_target_nodes();
+
 	return 0;
 }
 
@@ -2097,6 +2105,9 @@  void __init init_mm_internals(void)
 
 	start_shepherd_timer();
 #endif
+#if defined(CONFIG_MIGRATION) && defined(CONFIG_HOTPLUG_CPU)
+	migrate_on_reclaim_init();
+#endif
 #ifdef CONFIG_PROC_FS
 	proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
 	proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);