[v2,1/3] mm,slub: Do not special case N_NORMAL nodes for slab_nodes

Message ID 20250408084153.255762-2-osalvador@suse.de (mailing list archive)
State New
Series Implement numa node notifier

Commit Message

Oscar Salvador April 8, 2025, 8:41 a.m. UTC
Currently, slab_mem_going_online_callback() checks whether the node has
N_NORMAL_MEMORY in order to be set in slab_nodes.
While it is true that getting rid of that restriction would mean
ending up with movable-only nodes in slab_nodes, the memory waste that comes
with that is negligible.

So stop checking for status_change_nid_normal and just use status_change_nid
instead, which works for both types of memory.

Also, once we allocate the kmem_cache_node cache for a node in
slab_mem_going_online_callback(), we never deallocate it in
slab_mem_offline_callback() when the node goes memoryless, so we can just
get rid of that callback.

The only side effect is that we will stop clearing the node from slab_nodes.

Signed-off-by: Oscar Salvador <osalvador@suse.de>
---
 mm/slub.c | 30 +-----------------------------
 1 file changed, 1 insertion(+), 29 deletions(-)

Comments

David Hildenbrand April 8, 2025, 10:17 a.m. UTC | #1
On 08.04.25 10:41, Oscar Salvador wrote:
> Currently, slab_mem_going_going_callback() checks whether the node has
> N_NORMAL memory in order to be set in slab_nodes.
> While it is true that gettind rid of that enforcing would mean
> ending up with movables nodes in slab_nodes, the memory waste that comes
> with that is negligible.
> 
> So stop checking for status_change_nid_normal and just use status_change_nid
> instead which works for both types of memory.
> 
> Also, once we allocate the kmem_cache_node cache  for the node in
> slab_mem_online_callback(), we never deallocate it in
> slab_mem_off_callback() when the node goes memoryless, so we can just
> get rid of it.
> 
> The only side effect is that we will stop clearing the node from slab_nodes.
> 

Feel free to add a Suggested-by: if you think it applies.


Do we have to take care of the N_NORMAL_MEMORY check in kmem_cache_init()? Likely it
would have to be an N_MEMORY check.
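
Something like the following is what I have in mind (just a sketch, keeping
slab_nodes around for now and only widening the node state we look at):

	/*
	 * Sketch: in kmem_cache_init(), initialize slab_nodes from N_MEMORY
	 * instead of N_NORMAL_MEMORY, so that memoryless/movable-only nodes
	 * are covered as well.
	 */
	for_each_node_state(node, N_MEMORY)
		node_set(node, slab_nodes);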


But, I was wondering if we could get rid of the "slab_nodes" thingy as a first step?

 From 518a2b83a9c5bd85d74ddabbc36ce5d181a88ed6 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 8 Apr 2025 12:16:13 +0200
Subject: [PATCH] tmp

Signed-off-by: David Hildenbrand <david@redhat.com>
---
  mm/slub.c | 56 ++++---------------------------------------------------
  1 file changed, 4 insertions(+), 52 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index b46f87662e71d..afe31149e7f4e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -445,14 +445,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
  	for (__node = 0; __node < nr_node_ids; __node++) \
  		 if ((__n = get_node(__s, __node)))
  
-/*
- * Tracks for which NUMA nodes we have kmem_cache_nodes allocated.
- * Corresponds to node_state[N_NORMAL_MEMORY], but can temporarily
- * differ during memory hotplug/hotremove operations.
- * Protected by slab_mutex.
- */
-static nodemask_t slab_nodes;
-
  #ifndef CONFIG_SLUB_TINY
  /*
   * Workqueue used for flush_cpu_slab().
@@ -3706,10 +3698,9 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
  	if (!slab) {
  		/*
  		 * if the node is not online or has no normal memory, just
-		 * ignore the node constraint
+		 * ignore the node constraint.
  		 */
-		if (unlikely(node != NUMA_NO_NODE &&
-			     !node_isset(node, slab_nodes)))
+		if (unlikely(node != NUMA_NO_NODE && !node_state(node, N_NORMAL_MEMORY)))
  			node = NUMA_NO_NODE;
  		goto new_slab;
  	}
@@ -3719,7 +3710,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
  		 * same as above but node_match() being false already
  		 * implies node != NUMA_NO_NODE
  		 */
-		if (!node_isset(node, slab_nodes)) {
+		if (!node_state(node, N_NORMAL_MEMORY)) {
  			node = NUMA_NO_NODE;
  		} else {
  			stat(s, ALLOC_NODE_MISMATCH);
@@ -5623,7 +5614,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s)
  {
  	int node;
  
-	for_each_node_mask(node, slab_nodes) {
+	for_each_node_state(node, N_NORMAL_MEMORY) {
  		struct kmem_cache_node *n;
  
  		if (slab_state == DOWN) {
@@ -6164,30 +6155,6 @@ static int slab_mem_going_offline_callback(void *arg)
  	return 0;
  }
  
-static void slab_mem_offline_callback(void *arg)
-{
-	struct memory_notify *marg = arg;
-	int offline_node;
-
-	offline_node = marg->status_change_nid_normal;
-
-	/*
-	 * If the node still has available memory. we need kmem_cache_node
-	 * for it yet.
-	 */
-	if (offline_node < 0)
-		return;
-
-	mutex_lock(&slab_mutex);
-	node_clear(offline_node, slab_nodes);
-	/*
-	 * We no longer free kmem_cache_node structures here, as it would be
-	 * racy with all get_node() users, and infeasible to protect them with
-	 * slab_mutex.
-	 */
-	mutex_unlock(&slab_mutex);
-}
-
  static int slab_mem_going_online_callback(void *arg)
  {
  	struct kmem_cache_node *n;
@@ -6229,11 +6196,6 @@ static int slab_mem_going_online_callback(void *arg)
  		init_kmem_cache_node(n);
  		s->node[nid] = n;
  	}
-	/*
-	 * Any cache created after this point will also have kmem_cache_node
-	 * initialized for the new node.
-	 */
-	node_set(nid, slab_nodes);
  out:
  	mutex_unlock(&slab_mutex);
  	return ret;
@@ -6253,8 +6215,6 @@ static int slab_memory_callback(struct notifier_block *self,
  		break;
  	case MEM_OFFLINE:
  	case MEM_CANCEL_ONLINE:
-		slab_mem_offline_callback(arg);
-		break;
  	case MEM_ONLINE:
  	case MEM_CANCEL_OFFLINE:
  		break;
@@ -6309,7 +6269,6 @@ void __init kmem_cache_init(void)
  {
  	static __initdata struct kmem_cache boot_kmem_cache,
  		boot_kmem_cache_node;
-	int node;
  
  	if (debug_guardpage_minorder())
  		slub_max_order = 0;
@@ -6321,13 +6280,6 @@ void __init kmem_cache_init(void)
  	kmem_cache_node = &boot_kmem_cache_node;
  	kmem_cache = &boot_kmem_cache;
  
-	/*
-	 * Initialize the nodemask for which we will allocate per node
-	 * structures. Here we don't need taking slab_mutex yet.
-	 */
-	for_each_node_state(node, N_NORMAL_MEMORY)
-		node_set(node, slab_nodes);
-
  	create_boot_cache(kmem_cache_node, "kmem_cache_node",
  			sizeof(struct kmem_cache_node),
  			SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);
Oscar Salvador April 8, 2025, 12:49 p.m. UTC | #2
On Tue, Apr 08, 2025 at 12:17:52PM +0200, David Hildenbrand wrote:
> Feel free to add a Suggested-by: if you think it applies.

Sorry David, my bad, totally missed it.
I shall add it.

> Do we have to take care of the N_NORMAL_MEMORY check in kmem_cache_init() ? Likely it
> would have to be a N_MEMORY check.

Yes, should be N_MEMORY.

> But, I was wondering if we could get rid of the "slab_nodes" thingy as a first step?

I glanced over it and I did not see anything wrong with it, just a
question below.

So, if Vlastimil and Harry think this is fine, we can indeed do this.
If so, I would combine this with patch #1 of this series and add
your Signed-off-by as co-author. Is that fine with you?


> @@ -3706,10 +3698,9 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>  	if (!slab) {
>  		/*
>  		 * if the node is not online or has no normal memory, just
> -		 * ignore the node constraint
> +		 * ignore the node constraint.
>  		 */
> -		if (unlikely(node != NUMA_NO_NODE &&
> -			     !node_isset(node, slab_nodes)))
> +		if (unlikely(node != NUMA_NO_NODE && !node_state(node, N_NORMAL_MEMORY)))
>  			node = NUMA_NO_NODE;

After my first patch, slab_nodes will also contain N_MEMORY nodes, which
makes me wonder whether that check should be N_MEMORY?
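
For clarity, the variant I am asking about would be something like this
(sketch only, not part of the posted patch):

	/* Hypothetical: widen the ___slab_alloc() fallback check to N_MEMORY. */
	if (unlikely(node != NUMA_NO_NODE && !node_state(node, N_MEMORY)))
		node = NUMA_NO_NODE;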
Harry Yoo April 8, 2025, 2:18 p.m. UTC | #3
On Tue, Apr 08, 2025 at 12:17:52PM +0200, David Hildenbrand wrote:
> On 08.04.25 10:41, Oscar Salvador wrote:
> > Currently, slab_mem_going_going_callback() checks whether the node has
> > N_NORMAL memory in order to be set in slab_nodes.
> > While it is true that gettind rid of that enforcing would mean
> > ending up with movables nodes in slab_nodes, the memory waste that comes
> > with that is negligible.
> > 
> > So stop checking for status_change_nid_normal and just use status_change_nid
> > instead which works for both types of memory.
> > 
> > Also, once we allocate the kmem_cache_node cache  for the node in
> > slab_mem_online_callback(), we never deallocate it in
> > slab_mem_off_callback() when the node goes memoryless, so we can just
> > get rid of it.
> > 
> > The only side effect is that we will stop clearing the node from slab_nodes.
> > 
> 
> Feel free to add a Suggested-by: if you think it applies.
> 
> 
> Do we have to take care of the N_NORMAL_MEMORY check in kmem_cache_init() ? Likely it
> would have to be a N_MEMORY check.
> 
> 
> But, I was wondering if we could get rid of the "slab_nodes" thingy as a first step?

The following commit says that SLUB has the slab_nodes thingy for a reason...
kmem_cache_node might not be ready yet even when the N_NORMAL_MEMORY check
says the node now has normal memory.

@Vlastimil maybe a dumb question but why not check s->node[nid]
instead of having slab_nodes bitmask?

commit 7e1fa93deff44677a94dfc323ff629bbf5cf9360
Author: Vlastimil Babka <vbabka@suse.cz>
Date:   Wed Feb 24 12:01:12 2021 -0800

    mm, slab, slub: stop taking memory hotplug lock
    
    Since commit 03afc0e25f7f ("slab: get_online_mems for
    kmem_cache_{create,destroy,shrink}") we are taking memory hotplug lock for
    SLAB and SLUB when creating, destroying or shrinking a cache.  It is quite
    a heavy lock and it's best to avoid it if possible, as we had several
    issues with lockdep complaining about ordering in the past, see e.g.
    e4f8e513c3d3 ("mm/slub: fix a deadlock in show_slab_objects()").
    
    The problem scenario in 03afc0e25f7f (solved by the memory hotplug lock)
    can be summarized as follows: while there's slab_mutex synchronizing new
    kmem cache creation and SLUB's MEM_GOING_ONLINE callback
    slab_mem_going_online_callback(), we may miss creation of kmem_cache_node
    for the hotplugged node in the new kmem cache, because the hotplug
    callback doesn't yet see the new cache, and cache creation in
    init_kmem_cache_nodes() only inits kmem_cache_node for nodes in the
    N_NORMAL_MEMORY nodemask, which however may not yet include the new node,
    as that happens only later after the MEM_GOING_ONLINE callback.
    
    Instead of using get/put_online_mems(), the problem can be solved by SLUB
    maintaining its own nodemask of nodes for which it has allocated the
    per-node kmem_cache_node structures.  This nodemask would generally mirror
    the N_NORMAL_MEMORY nodemask, but would be updated only in under SLUB's
    control in its memory hotplug callbacks under the slab_mutex.  This patch
    adds such nodemask and its handling.
    
    Commit 03afc0e25f7f mentiones "issues like [the one above]", but there
    don't appear to be further issues.  All the paths (shared for SLAB and
    SLUB) taking the memory hotplug locks are also taking the slab_mutex,
    except kmem_cache_shrink() where 03afc0e25f7f replaced slab_mutex with
    get/put_online_mems().
    
    We however cannot simply restore slab_mutex in kmem_cache_shrink(), as
    SLUB can enters the function from a write to sysfs 'shrink' file, thus
    holding kernfs lock, and in kmem_cache_create() the kernfs lock is nested
    within slab_mutex.  But on closer inspection we don't actually need to
    protect kmem_cache_shrink() from hotplug callbacks: While SLUB's
    __kmem_cache_shrink() does for_each_kmem_cache_node(), missing a new node
    added in parallel hotplug is not fatal, and parallel hotremove does not
    free kmem_cache_node's anymore after the previous patch, so use-after free
    cannot happen.  The per-node shrinking itself is protected by
    n->list_lock.  Same is true for SLAB, and SLOB is no-op.
    
    SLAB also doesn't need the memory hotplug locking, which it only gained by
    03afc0e25f7f through the shared paths in slab_common.c.  Its memory
    hotplug callbacks are also protected by slab_mutex against races with
    these paths.  The problem of SLUB relying on N_NORMAL_MEMORY doesn't apply
    to SLAB, as its setup_kmem_cache_nodes relies on N_ONLINE, and the new
    node is already set there during the MEM_GOING_ONLINE callback, so no
    special care is needed for SLAB.
    
    As such, this patch removes all get/put_online_mems() usage by the slab
    subsystem.
    
    Link: https://lkml.kernel.org/r/20210113131634.3671-3-vbabka@suse.cz
    Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
    Cc: Christoph Lameter <cl@linux.com>
    Cc: David Hildenbrand <david@redhat.com>
    Cc: David Rientjes <rientjes@google.com>
    Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
    Cc: Michal Hocko <mhocko@kernel.org>
    Cc: Pekka Enberg <penberg@kernel.org>
    Cc: Qian Cai <cai@redhat.com>
    Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

> 
> From 518a2b83a9c5bd85d74ddabbc36ce5d181a88ed6 Mon Sep 17 00:00:00 2001
> From: David Hildenbrand <david@redhat.com>
> Date: Tue, 8 Apr 2025 12:16:13 +0200
> Subject: [PATCH] tmp
> 
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>  mm/slub.c | 56 ++++---------------------------------------------------
>  1 file changed, 4 insertions(+), 52 deletions(-)
> 
> diff --git a/mm/slub.c b/mm/slub.c
> index b46f87662e71d..afe31149e7f4e 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -445,14 +445,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
>  	for (__node = 0; __node < nr_node_ids; __node++) \
>  		 if ((__n = get_node(__s, __node)))
> -/*
> - * Tracks for which NUMA nodes we have kmem_cache_nodes allocated.
> - * Corresponds to node_state[N_NORMAL_MEMORY], but can temporarily
> - * differ during memory hotplug/hotremove operations.
> - * Protected by slab_mutex.
> - */
> -static nodemask_t slab_nodes;
> -
>  #ifndef CONFIG_SLUB_TINY
>  /*
>   * Workqueue used for flush_cpu_slab().
> @@ -3706,10 +3698,9 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>  	if (!slab) {
>  		/*
>  		 * if the node is not online or has no normal memory, just
> -		 * ignore the node constraint
> +		 * ignore the node constraint.
>  		 */
> -		if (unlikely(node != NUMA_NO_NODE &&
> -			     !node_isset(node, slab_nodes)))
> +		if (unlikely(node != NUMA_NO_NODE && !node_state(node, N_NORMAL_MEMORY)))
>  			node = NUMA_NO_NODE;
>  		goto new_slab;
>  	}
> @@ -3719,7 +3710,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>  		 * same as above but node_match() being false already
>  		 * implies node != NUMA_NO_NODE
>  		 */
> -		if (!node_isset(node, slab_nodes)) {
> +		if (!node_state(node, N_NORMAL_MEMORY)) {
>  			node = NUMA_NO_NODE;
>  		} else {
>  			stat(s, ALLOC_NODE_MISMATCH);
> @@ -5623,7 +5614,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s)
>  {
>  	int node;
> -	for_each_node_mask(node, slab_nodes) {
> +	for_each_node_state(node, N_NORMAL_MEMORY) {
>  		struct kmem_cache_node *n;
>  		if (slab_state == DOWN) {
> @@ -6164,30 +6155,6 @@ static int slab_mem_going_offline_callback(void *arg)
>  	return 0;
>  }
> -static void slab_mem_offline_callback(void *arg)
> -{
> -	struct memory_notify *marg = arg;
> -	int offline_node;
> -
> -	offline_node = marg->status_change_nid_normal;
> -
> -	/*
> -	 * If the node still has available memory. we need kmem_cache_node
> -	 * for it yet.
> -	 */
> -	if (offline_node < 0)
> -		return;
> -
> -	mutex_lock(&slab_mutex);
> -	node_clear(offline_node, slab_nodes);
> -	/*
> -	 * We no longer free kmem_cache_node structures here, as it would be
> -	 * racy with all get_node() users, and infeasible to protect them with
> -	 * slab_mutex.
> -	 */
> -	mutex_unlock(&slab_mutex);
> -}
> -
>  static int slab_mem_going_online_callback(void *arg)
>  {
>  	struct kmem_cache_node *n;
> @@ -6229,11 +6196,6 @@ static int slab_mem_going_online_callback(void *arg)
>  		init_kmem_cache_node(n);
>  		s->node[nid] = n;
>  	}
> -	/*
> -	 * Any cache created after this point will also have kmem_cache_node
> -	 * initialized for the new node.
> -	 */
> -	node_set(nid, slab_nodes);
>  out:
>  	mutex_unlock(&slab_mutex);
>  	return ret;
> @@ -6253,8 +6215,6 @@ static int slab_memory_callback(struct notifier_block *self,
>  		break;
>  	case MEM_OFFLINE:
>  	case MEM_CANCEL_ONLINE:
> -		slab_mem_offline_callback(arg);
> -		break;
>  	case MEM_ONLINE:
>  	case MEM_CANCEL_OFFLINE:
>  		break;
> @@ -6309,7 +6269,6 @@ void __init kmem_cache_init(void)
>  {
>  	static __initdata struct kmem_cache boot_kmem_cache,
>  		boot_kmem_cache_node;
> -	int node;
>  	if (debug_guardpage_minorder())
>  		slub_max_order = 0;
> @@ -6321,13 +6280,6 @@ void __init kmem_cache_init(void)
>  	kmem_cache_node = &boot_kmem_cache_node;
>  	kmem_cache = &boot_kmem_cache;
> -	/*
> -	 * Initialize the nodemask for which we will allocate per node
> -	 * structures. Here we don't need taking slab_mutex yet.
> -	 */
> -	for_each_node_state(node, N_NORMAL_MEMORY)
> -		node_set(node, slab_nodes);
> -
>  	create_boot_cache(kmem_cache_node, "kmem_cache_node",
>  			sizeof(struct kmem_cache_node),
>  			SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);
> -- 
> 2.48.1
> 
> 
> Not sure if there are any races to consider ... just an idea.
> 
> -- 
> Cheers,
> 
> David / dhildenb
>
David Hildenbrand April 8, 2025, 2:25 p.m. UTC | #4
On 08.04.25 16:18, Harry Yoo wrote:
> On Tue, Apr 08, 2025 at 12:17:52PM +0200, David Hildenbrand wrote:
>> On 08.04.25 10:41, Oscar Salvador wrote:
>>> Currently, slab_mem_going_going_callback() checks whether the node has
>>> N_NORMAL memory in order to be set in slab_nodes.
>>> While it is true that gettind rid of that enforcing would mean
>>> ending up with movables nodes in slab_nodes, the memory waste that comes
>>> with that is negligible.
>>>
>>> So stop checking for status_change_nid_normal and just use status_change_nid
>>> instead which works for both types of memory.
>>>
>>> Also, once we allocate the kmem_cache_node cache  for the node in
>>> slab_mem_online_callback(), we never deallocate it in
>>> slab_mem_off_callback() when the node goes memoryless, so we can just
>>> get rid of it.
>>>
>>> The only side effect is that we will stop clearing the node from slab_nodes.
>>>
>>
>> Feel free to add a Suggested-by: if you think it applies.
>>
>>
>> Do we have to take care of the N_NORMAL_MEMORY check in kmem_cache_init() ? Likely it
>> would have to be a N_MEMORY check.
>>
>>
>> But, I was wondering if we could get rid of the "slab_nodes" thingy as a first step?
> 
> The following commit says that SLUB has slab_nodes thingy for a reason...
> kmem_cache_node might not be ready yet even when N_NORMAL_MEMORY check
> says it now has normal memory.

node_states_set_node() is called from memory hotplug code after 
MEM_GOING_ONLINE and after online_pages_range().

Pages might be isolated at that point, but node_states_set_node() is set 
only after the memory notifier (MEM_GOING_ONLINE) was triggered.

So I don't immediately see the problem assuming that we never free the 
structures.
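
To illustrate, a rough outline of the ordering in online_pages() from my
reading of mm/memory_hotplug.c (simplified, error paths omitted):

	memory_notify(MEM_GOING_ONLINE, &arg);	/* SLUB allocates kmem_cache_node here */
	online_pages_range(pfn, nr_pages);	/* pages become usable */
	node_states_set_node(nid, &arg);	/* N_MEMORY/N_NORMAL_MEMORY set only now */
	...
	memory_notify(MEM_ONLINE, &arg);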

But yeah, this is what I raised below: "Not sure if there are any races 
to consider" :)

> 
> @Vlastimil maybe a dumb question but why not check s->node[nid]
> instead of having slab_nodes bitmask?
> 
> commit 7e1fa93deff44677a94dfc323ff629bbf5cf9360
> Author: Vlastimil Babka <vbabka@suse.cz>
> Date:   Wed Feb 24 12:01:12 2021 -0800
> 
>      mm, slab, slub: stop taking memory hotplug lock
>      
>      Since commit 03afc0e25f7f ("slab: get_online_mems for
>      kmem_cache_{create,destroy,shrink}") we are taking memory hotplug lock for
>      SLAB and SLUB when creating, destroying or shrinking a cache.  It is quite
>      a heavy lock and it's best to avoid it if possible, as we had several
>      issues with lockdep complaining about ordering in the past, see e.g.
>      e4f8e513c3d3 ("mm/slub: fix a deadlock in show_slab_objects()").
>      
>      The problem scenario in 03afc0e25f7f (solved by the memory hotplug lock)
>      can be summarized as follows: while there's slab_mutex synchronizing new
>      kmem cache creation and SLUB's MEM_GOING_ONLINE callback
>      slab_mem_going_online_callback(), we may miss creation of kmem_cache_node
>      for the hotplugged node in the new kmem cache, because the hotplug
>      callback doesn't yet see the new cache, and cache creation in
>      init_kmem_cache_nodes() only inits kmem_cache_node for nodes in the
>      N_NORMAL_MEMORY nodemask, which however may not yet include the new node,
>      as that happens only later after the MEM_GOING_ONLINE callback.
>      
>      Instead of using get/put_online_mems(), the problem can be solved by SLUB
>      maintaining its own nodemask of nodes for which it has allocated the
>      per-node kmem_cache_node structures.  This nodemask would generally mirror
>      the N_NORMAL_MEMORY nodemask, but would be updated only in under SLUB's
>      control in its memory hotplug callbacks under the slab_mutex.  This patch
>      adds such nodemask and its handling.
>      
>      Commit 03afc0e25f7f mentiones "issues like [the one above]", but there
>      don't appear to be further issues.  All the paths (shared for SLAB and
>      SLUB) taking the memory hotplug locks are also taking the slab_mutex,
>      except kmem_cache_shrink() where 03afc0e25f7f replaced slab_mutex with
>      get/put_online_mems().
>      
>      We however cannot simply restore slab_mutex in kmem_cache_shrink(), as
>      SLUB can enters the function from a write to sysfs 'shrink' file, thus
>      holding kernfs lock, and in kmem_cache_create() the kernfs lock is nested
>      within slab_mutex.  But on closer inspection we don't actually need to
>      protect kmem_cache_shrink() from hotplug callbacks: While SLUB's
>      __kmem_cache_shrink() does for_each_kmem_cache_node(), missing a new node
>      added in parallel hotplug is not fatal, and parallel hotremove does not
>      free kmem_cache_node's anymore after the previous patch, so use-after free
>      cannot happen.  The per-node shrinking itself is protected by
>      n->list_lock.  Same is true for SLAB, and SLOB is no-op.
>      
>      SLAB also doesn't need the memory hotplug locking, which it only gained by
>      03afc0e25f7f through the shared paths in slab_common.c.  Its memory
>      hotplug callbacks are also protected by slab_mutex against races with
>      these paths.  The problem of SLUB relying on N_NORMAL_MEMORY doesn't apply
>      to SLAB, as its setup_kmem_cache_nodes relies on N_ONLINE, and the new
>      node is already set there during the MEM_GOING_ONLINE callback, so no
>      special care is needed for SLAB.
>      
>      As such, this patch removes all get/put_online_mems() usage by the slab
>      subsystem.
>      
>      Link: https://lkml.kernel.org/r/20210113131634.3671-3-vbabka@suse.cz
>      Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
>      Cc: Christoph Lameter <cl@linux.com>
>      Cc: David Hildenbrand <david@redhat.com>
>      Cc: David Rientjes <rientjes@google.com>
>      Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
>      Cc: Michal Hocko <mhocko@kernel.org>
>      Cc: Pekka Enberg <penberg@kernel.org>
>      Cc: Qian Cai <cai@redhat.com>
>      Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
>      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
>      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
> 
>>
>>  From 518a2b83a9c5bd85d74ddabbc36ce5d181a88ed6 Mon Sep 17 00:00:00 2001
>> From: David Hildenbrand <david@redhat.com>
>> Date: Tue, 8 Apr 2025 12:16:13 +0200
>> Subject: [PATCH] tmp
>>
>> Signed-off-by: David Hildenbrand <david@redhat.com>
>> ---
>>   mm/slub.c | 56 ++++---------------------------------------------------
>>   1 file changed, 4 insertions(+), 52 deletions(-)
>>
>> diff --git a/mm/slub.c b/mm/slub.c
>> index b46f87662e71d..afe31149e7f4e 100644
>> --- a/mm/slub.c
>> +++ b/mm/slub.c
>> @@ -445,14 +445,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
>>   	for (__node = 0; __node < nr_node_ids; __node++) \
>>   		 if ((__n = get_node(__s, __node)))
>> -/*
>> - * Tracks for which NUMA nodes we have kmem_cache_nodes allocated.
>> - * Corresponds to node_state[N_NORMAL_MEMORY], but can temporarily
>> - * differ during memory hotplug/hotremove operations.
>> - * Protected by slab_mutex.
>> - */
>> -static nodemask_t slab_nodes;
>> -
>>   #ifndef CONFIG_SLUB_TINY
>>   /*
>>    * Workqueue used for flush_cpu_slab().
>> @@ -3706,10 +3698,9 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>>   	if (!slab) {
>>   		/*
>>   		 * if the node is not online or has no normal memory, just
>> -		 * ignore the node constraint
>> +		 * ignore the node constraint.
>>   		 */
>> -		if (unlikely(node != NUMA_NO_NODE &&
>> -			     !node_isset(node, slab_nodes)))
>> +		if (unlikely(node != NUMA_NO_NODE && !node_state(node, N_NORMAL_MEMORY)))
>>   			node = NUMA_NO_NODE;
>>   		goto new_slab;
>>   	}
>> @@ -3719,7 +3710,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>>   		 * same as above but node_match() being false already
>>   		 * implies node != NUMA_NO_NODE
>>   		 */
>> -		if (!node_isset(node, slab_nodes)) {
>> +		if (!node_state(node, N_NORMAL_MEMORY)) {
>>   			node = NUMA_NO_NODE;
>>   		} else {
>>   			stat(s, ALLOC_NODE_MISMATCH);
>> @@ -5623,7 +5614,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s)
>>   {
>>   	int node;
>> -	for_each_node_mask(node, slab_nodes) {
>> +	for_each_node_state(node, N_NORMAL_MEMORY) {
>>   		struct kmem_cache_node *n;
>>   		if (slab_state == DOWN) {
>> @@ -6164,30 +6155,6 @@ static int slab_mem_going_offline_callback(void *arg)
>>   	return 0;
>>   }
>> -static void slab_mem_offline_callback(void *arg)
>> -{
>> -	struct memory_notify *marg = arg;
>> -	int offline_node;
>> -
>> -	offline_node = marg->status_change_nid_normal;
>> -
>> -	/*
>> -	 * If the node still has available memory. we need kmem_cache_node
>> -	 * for it yet.
>> -	 */
>> -	if (offline_node < 0)
>> -		return;
>> -
>> -	mutex_lock(&slab_mutex);
>> -	node_clear(offline_node, slab_nodes);
>> -	/*
>> -	 * We no longer free kmem_cache_node structures here, as it would be
>> -	 * racy with all get_node() users, and infeasible to protect them with
>> -	 * slab_mutex.
>> -	 */
>> -	mutex_unlock(&slab_mutex);
>> -}
>> -
>>   static int slab_mem_going_online_callback(void *arg)
>>   {
>>   	struct kmem_cache_node *n;
>> @@ -6229,11 +6196,6 @@ static int slab_mem_going_online_callback(void *arg)
>>   		init_kmem_cache_node(n);
>>   		s->node[nid] = n;
>>   	}
>> -	/*
>> -	 * Any cache created after this point will also have kmem_cache_node
>> -	 * initialized for the new node.
>> -	 */
>> -	node_set(nid, slab_nodes);
>>   out:
>>   	mutex_unlock(&slab_mutex);
>>   	return ret;
>> @@ -6253,8 +6215,6 @@ static int slab_memory_callback(struct notifier_block *self,
>>   		break;
>>   	case MEM_OFFLINE:
>>   	case MEM_CANCEL_ONLINE:
>> -		slab_mem_offline_callback(arg);
>> -		break;
>>   	case MEM_ONLINE:
>>   	case MEM_CANCEL_OFFLINE:
>>   		break;
>> @@ -6309,7 +6269,6 @@ void __init kmem_cache_init(void)
>>   {
>>   	static __initdata struct kmem_cache boot_kmem_cache,
>>   		boot_kmem_cache_node;
>> -	int node;
>>   	if (debug_guardpage_minorder())
>>   		slub_max_order = 0;
>> @@ -6321,13 +6280,6 @@ void __init kmem_cache_init(void)
>>   	kmem_cache_node = &boot_kmem_cache_node;
>>   	kmem_cache = &boot_kmem_cache;
>> -	/*
>> -	 * Initialize the nodemask for which we will allocate per node
>> -	 * structures. Here we don't need taking slab_mutex yet.
>> -	 */
>> -	for_each_node_state(node, N_NORMAL_MEMORY)
>> -		node_set(node, slab_nodes);
>> -
>>   	create_boot_cache(kmem_cache_node, "kmem_cache_node",
>>   			sizeof(struct kmem_cache_node),
>>   			SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);
>> -- 
>> 2.48.1
>>
>>
>> Not sure if there are any races to consider ... just an idea.
>>
>> -- 
>> Cheers,
>>
>> David / dhildenb
>>
>
Harry Yoo April 8, 2025, 2:54 p.m. UTC | #5
On Tue, Apr 08, 2025 at 04:25:33PM +0200, David Hildenbrand wrote:
> On 08.04.25 16:18, Harry Yoo wrote:
> > On Tue, Apr 08, 2025 at 12:17:52PM +0200, David Hildenbrand wrote:
> > > On 08.04.25 10:41, Oscar Salvador wrote:
> > > > Currently, slab_mem_going_going_callback() checks whether the node has
> > > > N_NORMAL memory in order to be set in slab_nodes.
> > > > While it is true that gettind rid of that enforcing would mean
> > > > ending up with movables nodes in slab_nodes, the memory waste that comes
> > > > with that is negligible.
> > > > 
> > > > So stop checking for status_change_nid_normal and just use status_change_nid
> > > > instead which works for both types of memory.
> > > > 
> > > > Also, once we allocate the kmem_cache_node cache  for the node in
> > > > slab_mem_online_callback(), we never deallocate it in
> > > > slab_mem_off_callback() when the node goes memoryless, so we can just
> > > > get rid of it.
> > > > 
> > > > The only side effect is that we will stop clearing the node from slab_nodes.
> > > > 
> > > 
> > > Feel free to add a Suggested-by: if you think it applies.
> > > 
> > > 
> > > Do we have to take care of the N_NORMAL_MEMORY check in kmem_cache_init() ? Likely it
> > > would have to be a N_MEMORY check.
> > > 
> > > 
> > > But, I was wondering if we could get rid of the "slab_nodes" thingy as a first step?
> > 
> > The following commit says that SLUB has slab_nodes thingy for a reason...
> > kmem_cache_node might not be ready yet even when N_NORMAL_MEMORY check
> > says it now has normal memory.
> 
> node_states_set_node() is called from memory hotplug code after
> MEM_GOING_ONLINE and after online_pages_range().
> 
> Pages might be isolated at that point, but node_states_set_node() is set
> only after the memory notifier (MEM_GOING_ONLINE) was triggered.

Oh right, didn't realize that. Thanks.

> So I don't immediately see the problem assuming that we never free the
> structures.
> 
> But yeah, this is what I raised below: "Not sure if there are any races to
> consider" :)

At least on the slab side I don't see any races that need to be
addressed. Hopefully I didn't overlook anything :)

> > @Vlastimil maybe a dumb question but why not check s->node[nid]
> > instead of having slab_nodes bitmask?
> > 
> > commit 7e1fa93deff44677a94dfc323ff629bbf5cf9360
> > Author: Vlastimil Babka <vbabka@suse.cz>
> > Date:   Wed Feb 24 12:01:12 2021 -0800
> > 
> >      mm, slab, slub: stop taking memory hotplug lock
> >      Since commit 03afc0e25f7f ("slab: get_online_mems for
> >      kmem_cache_{create,destroy,shrink}") we are taking memory hotplug lock for
> >      SLAB and SLUB when creating, destroying or shrinking a cache.  It is quite
> >      a heavy lock and it's best to avoid it if possible, as we had several
> >      issues with lockdep complaining about ordering in the past, see e.g.
> >      e4f8e513c3d3 ("mm/slub: fix a deadlock in show_slab_objects()").
> >      The problem scenario in 03afc0e25f7f (solved by the memory hotplug lock)
> >      can be summarized as follows: while there's slab_mutex synchronizing new
> >      kmem cache creation and SLUB's MEM_GOING_ONLINE callback
> >      slab_mem_going_online_callback(), we may miss creation of kmem_cache_node
> >      for the hotplugged node in the new kmem cache, because the hotplug
> >      callback doesn't yet see the new cache, and cache creation in
> >      init_kmem_cache_nodes() only inits kmem_cache_node for nodes in the
> >      N_NORMAL_MEMORY nodemask, which however may not yet include the new node,
> >      as that happens only later after the MEM_GOING_ONLINE callback.
> >      Instead of using get/put_online_mems(), the problem can be solved by SLUB
> >      maintaining its own nodemask of nodes for which it has allocated the
> >      per-node kmem_cache_node structures.  This nodemask would generally mirror
> >      the N_NORMAL_MEMORY nodemask, but would be updated only in under SLUB's
> >      control in its memory hotplug callbacks under the slab_mutex.  This patch
> >      adds such nodemask and its handling.
> >      Commit 03afc0e25f7f mentiones "issues like [the one above]", but there
> >      don't appear to be further issues.  All the paths (shared for SLAB and
> >      SLUB) taking the memory hotplug locks are also taking the slab_mutex,
> >      except kmem_cache_shrink() where 03afc0e25f7f replaced slab_mutex with
> >      get/put_online_mems().
> >      We however cannot simply restore slab_mutex in kmem_cache_shrink(), as
> >      SLUB can enters the function from a write to sysfs 'shrink' file, thus
> >      holding kernfs lock, and in kmem_cache_create() the kernfs lock is nested
> >      within slab_mutex.  But on closer inspection we don't actually need to
> >      protect kmem_cache_shrink() from hotplug callbacks: While SLUB's
> >      __kmem_cache_shrink() does for_each_kmem_cache_node(), missing a new node
> >      added in parallel hotplug is not fatal, and parallel hotremove does not
> >      free kmem_cache_node's anymore after the previous patch, so use-after free
> >      cannot happen.  The per-node shrinking itself is protected by
> >      n->list_lock.  Same is true for SLAB, and SLOB is no-op.
> >      SLAB also doesn't need the memory hotplug locking, which it only gained by
> >      03afc0e25f7f through the shared paths in slab_common.c.  Its memory
> >      hotplug callbacks are also protected by slab_mutex against races with
> >      these paths.  The problem of SLUB relying on N_NORMAL_MEMORY doesn't apply
> >      to SLAB, as its setup_kmem_cache_nodes relies on N_ONLINE, and the new
> >      node is already set there during the MEM_GOING_ONLINE callback, so no
> >      special care is needed for SLAB.
> >      As such, this patch removes all get/put_online_mems() usage by the slab
> >      subsystem.
> >      Link: https://lkml.kernel.org/r/20210113131634.3671-3-vbabka@suse.cz
> >      Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> >      Cc: Christoph Lameter <cl@linux.com>
> >      Cc: David Hildenbrand <david@redhat.com>
> >      Cc: David Rientjes <rientjes@google.com>
> >      Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> >      Cc: Michal Hocko <mhocko@kernel.org>
> >      Cc: Pekka Enberg <penberg@kernel.org>
> >      Cc: Qian Cai <cai@redhat.com>
> >      Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
> >      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
> >      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
> > 
> > > 
> > >  From 518a2b83a9c5bd85d74ddabbc36ce5d181a88ed6 Mon Sep 17 00:00:00 2001
> > > From: David Hildenbrand <david@redhat.com>
> > > Date: Tue, 8 Apr 2025 12:16:13 +0200
> > > Subject: [PATCH] tmp
> > > 
> > > Signed-off-by: David Hildenbrand <david@redhat.com>
> > > ---
> > >   mm/slub.c | 56 ++++---------------------------------------------------
> > >   1 file changed, 4 insertions(+), 52 deletions(-)
> > > 
> > > diff --git a/mm/slub.c b/mm/slub.c
> > > index b46f87662e71d..afe31149e7f4e 100644
> > > --- a/mm/slub.c
> > > +++ b/mm/slub.c
> > > @@ -445,14 +445,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
> > >   	for (__node = 0; __node < nr_node_ids; __node++) \
> > >   		 if ((__n = get_node(__s, __node)))
> > > -/*
> > > - * Tracks for which NUMA nodes we have kmem_cache_nodes allocated.
> > > - * Corresponds to node_state[N_NORMAL_MEMORY], but can temporarily
> > > - * differ during memory hotplug/hotremove operations.
> > > - * Protected by slab_mutex.
> > > - */
> > > -static nodemask_t slab_nodes;
> > > -
> > >   #ifndef CONFIG_SLUB_TINY
> > >   /*
> > >    * Workqueue used for flush_cpu_slab().
> > > @@ -3706,10 +3698,9 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> > >   	if (!slab) {
> > >   		/*
> > >   		 * if the node is not online or has no normal memory, just
> > > -		 * ignore the node constraint
> > > +		 * ignore the node constraint.
> > >   		 */
> > > -		if (unlikely(node != NUMA_NO_NODE &&
> > > -			     !node_isset(node, slab_nodes)))
> > > +		if (unlikely(node != NUMA_NO_NODE && !node_state(node, N_NORMAL_MEMORY)))
> > >   			node = NUMA_NO_NODE;
> > >   		goto new_slab;
> > >   	}
> > > @@ -3719,7 +3710,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> > >   		 * same as above but node_match() being false already
> > >   		 * implies node != NUMA_NO_NODE
> > >   		 */
> > > -		if (!node_isset(node, slab_nodes)) {
> > > +		if (!node_state(node, N_NORMAL_MEMORY)) {
> > >   			node = NUMA_NO_NODE;
> > >   		} else {
> > >   			stat(s, ALLOC_NODE_MISMATCH);
> > > @@ -5623,7 +5614,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s)
> > >   {
> > >   	int node;
> > > -	for_each_node_mask(node, slab_nodes) {
> > > +	for_each_node_state(node, N_NORMAL_MEMORY) {
> > >   		struct kmem_cache_node *n;
> > >   		if (slab_state == DOWN) {
> > > @@ -6164,30 +6155,6 @@ static int slab_mem_going_offline_callback(void *arg)
> > >   	return 0;
> > >   }
> > > -static void slab_mem_offline_callback(void *arg)
> > > -{
> > > -	struct memory_notify *marg = arg;
> > > -	int offline_node;
> > > -
> > > -	offline_node = marg->status_change_nid_normal;
> > > -
> > > -	/*
> > > -	 * If the node still has available memory. we need kmem_cache_node
> > > -	 * for it yet.
> > > -	 */
> > > -	if (offline_node < 0)
> > > -		return;
> > > -
> > > -	mutex_lock(&slab_mutex);
> > > -	node_clear(offline_node, slab_nodes);
> > > -	/*
> > > -	 * We no longer free kmem_cache_node structures here, as it would be
> > > -	 * racy with all get_node() users, and infeasible to protect them with
> > > -	 * slab_mutex.
> > > -	 */
> > > -	mutex_unlock(&slab_mutex);
> > > -}
> > > -
> > >   static int slab_mem_going_online_callback(void *arg)
> > >   {
> > >   	struct kmem_cache_node *n;
> > > @@ -6229,11 +6196,6 @@ static int slab_mem_going_online_callback(void *arg)
> > >   		init_kmem_cache_node(n);
> > >   		s->node[nid] = n;
> > >   	}
> > > -	/*
> > > -	 * Any cache created after this point will also have kmem_cache_node
> > > -	 * initialized for the new node.
> > > -	 */
> > > -	node_set(nid, slab_nodes);
> > >   out:
> > >   	mutex_unlock(&slab_mutex);
> > >   	return ret;
> > > @@ -6253,8 +6215,6 @@ static int slab_memory_callback(struct notifier_block *self,
> > >   		break;
> > >   	case MEM_OFFLINE:
> > >   	case MEM_CANCEL_ONLINE:
> > > -		slab_mem_offline_callback(arg);
> > > -		break;
> > >   	case MEM_ONLINE:
> > >   	case MEM_CANCEL_OFFLINE:
> > >   		break;
> > > @@ -6309,7 +6269,6 @@ void __init kmem_cache_init(void)
> > >   {
> > >   	static __initdata struct kmem_cache boot_kmem_cache,
> > >   		boot_kmem_cache_node;
> > > -	int node;
> > >   	if (debug_guardpage_minorder())
> > >   		slub_max_order = 0;
> > > @@ -6321,13 +6280,6 @@ void __init kmem_cache_init(void)
> > >   	kmem_cache_node = &boot_kmem_cache_node;
> > >   	kmem_cache = &boot_kmem_cache;
> > > -	/*
> > > -	 * Initialize the nodemask for which we will allocate per node
> > > -	 * structures. Here we don't need taking slab_mutex yet.
> > > -	 */
> > > -	for_each_node_state(node, N_NORMAL_MEMORY)
> > > -		node_set(node, slab_nodes);
> > > -
> > >   	create_boot_cache(kmem_cache_node, "kmem_cache_node",
> > >   			sizeof(struct kmem_cache_node),
> > >   			SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);
> > > -- 
> > > 2.48.1
> > > 
> > > 
> > > Not sure if there are any races to consider ... just an idea.
> > > 
> > > -- 
> > > Cheers,
> > > 
> > > David / dhildenb
> > > 
> > 
> -- 
> Cheers,
> 
> David / dhildenb
> 
>
Harry Yoo April 8, 2025, 3:15 p.m. UTC | #6
On Tue, Apr 08, 2025 at 02:49:43PM +0200, Oscar Salvador wrote:
> On Tue, Apr 08, 2025 at 12:17:52PM +0200, David Hildenbrand wrote:
> > Feel free to add a Suggested-by: if you think it applies.
> 
> Sorry David, my bad, totally missed it.
> I shall add it.
> 
> > Do we have to take care of the N_NORMAL_MEMORY check in kmem_cache_init() ? Likely it
> > would have to be a N_MEMORY check.
> 
> Yes, should be N_MEMORY.
> 
> > But, I was wondering if we could get rid of the "slab_nodes" thingy as a first step?
> 
> I glanced over it and I did not see anything wrong with it, just a
> question below.
> 
> So, if Vlastimil and Harry think this is fine, we can indeed do this.

Looks fine to me.

> If so, I would combine this and the #1 first of this series and add
> your Signed-off-by as co-autor. Is that fine by you?

> > @@ -3706,10 +3698,9 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> >  	if (!slab) {
> >  		/*
> >  		 * if the node is not online or has no normal memory, just
> > -		 * ignore the node constraint
> > +		 * ignore the node constraint.
> >  		 */
> > -		if (unlikely(node != NUMA_NO_NODE &&
> > -			     !node_isset(node, slab_nodes)))
> > +		if (unlikely(node != NUMA_NO_NODE && !node_state(node, N_NORMAL_MEMORY)))
> >  			node = NUMA_NO_NODE;
> 
> After my first patch, slab_nodes will also contain N_MEMORY nodes, which
> makes me think whether that check should be N_MEMORY?

Assuming that it's not frequent to call kmem_cache_alloc_node()/kmalloc_node()
with a nid where the node has no normal memory, either way looks fine to
me.

If the N_MEMORY check says the node has some memory but no normal memory,
the allocation will fail anyway because the kernel can't allocate pages from
the buddy allocator.

But why not fall back to other nodes early when the kernel knows
it has no normal memory?

> -- 
> Oscar Salvador
> SUSE Labs
Vlastimil Babka April 8, 2025, 5:55 p.m. UTC | #7
On 4/8/25 16:25, David Hildenbrand wrote:
> On 08.04.25 16:18, Harry Yoo wrote:
>> On Tue, Apr 08, 2025 at 12:17:52PM +0200, David Hildenbrand wrote:
>>> On 08.04.25 10:41, Oscar Salvador wrote:
>>>> Currently, slab_mem_going_going_callback() checks whether the node has
>>>> N_NORMAL memory in order to be set in slab_nodes.
>>>> While it is true that gettind rid of that enforcing would mean
>>>> ending up with movables nodes in slab_nodes, the memory waste that comes
>>>> with that is negligible.
>>>>
>>>> So stop checking for status_change_nid_normal and just use status_change_nid
>>>> instead which works for both types of memory.
>>>>
>>>> Also, once we allocate the kmem_cache_node cache  for the node in
>>>> slab_mem_online_callback(), we never deallocate it in
>>>> slab_mem_off_callback() when the node goes memoryless, so we can just
>>>> get rid of it.
>>>>
>>>> The only side effect is that we will stop clearing the node from slab_nodes.
>>>>
>>>
>>> Feel free to add a Suggested-by: if you think it applies.
>>>
>>>
>>> Do we have to take care of the N_NORMAL_MEMORY check in kmem_cache_init() ? Likely it
>>> would have to be a N_MEMORY check.
>>>
>>>
>>> But, I was wondering if we could get rid of the "slab_nodes" thingy as a first step?
>> 
>> The following commit says that SLUB has slab_nodes thingy for a reason...
>> kmem_cache_node might not be ready yet even when N_NORMAL_MEMORY check
>> says it now has normal memory.
> 
> node_states_set_node() is called from memory hotplug code after 
> MEM_GOING_ONLINE and after online_pages_range().
> 
> Pages might be isolated at that point, but node_states_set_node() is set 
> only after the memory notifier (MEM_GOING_ONLINE) was triggered.
> 
> So I don't immediately see the problem assuming that we never free the 
> structures.

Hm, I think it still exists. So consider:

- slab_mem_going_online_callback() creates kmem_cache_node for the new node
for existing caches under the mutex

Then a cache creation races with "node_states_set_node() is set only after
the memory notifier (MEM_GOING_ONLINE) was triggered" in a way that it
doesn't see the node set yet - kmem_cache_node for the new node on the new
cache will be missing.

The root issue is that node_states_set_node() doesn't happen under slab_mutex;
the slab_nodes update happens under slab_mutex precisely to avoid this race.

And once we have slab_nodes for this cache-creation vs hotplug
synchronization, it's natural to also use it for the ___slab_alloc() checks,
although those could indeed now use node_state(node, N_NORMAL_MEMORY) once we
create the node structures and set bits in slab_nodes for N_MEMORY.

Maybe it could be solved at the memory hotplug level if we had a new
state, e.g. N_GOING_ONLINE, that was set before calling the MEM_GOING_ONLINE
callbacks and was never reset. Then init_kmem_cache_nodes() could iterate on
that. But I'm not sure if slab_mutex would provide the necessary
synchronization guarantees here so that the addition of a node in
MEM_GOING_ONLINE can't be missed, hm.
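
Roughly (N_GOING_ONLINE is purely hypothetical, just to illustrate the idea):

	/*
	 * Hypothetical: N_GOING_ONLINE would be set by the hotplug core before
	 * the MEM_GOING_ONLINE notifiers run and never cleared, so a concurrent
	 * cache creation could not miss a node that is coming online.
	 */
	for_each_node_state(node, N_GOING_ONLINE) {
		/* allocate and init kmem_cache_node as init_kmem_cache_nodes() does today */
	}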

> But yeah, this is what I raised below: "Not sure if there are any races 
> to consider" :)
> 
>> 
>> @Vlastimil maybe a dumb question but why not check s->node[nid]
>> instead of having slab_nodes bitmask?

You mean checking that, e.g. in ___slab_alloc()? We'd still be prone to
missing a kmem_cache_node creation and degrading the particular cache that
way; we'd just avoid a NULL pointer dereference.
David Hildenbrand April 8, 2025, 6:18 p.m. UTC | #8
On 08.04.25 19:55, Vlastimil Babka wrote:
> On 4/8/25 16:25, David Hildenbrand wrote:
>> On 08.04.25 16:18, Harry Yoo wrote:
>>> On Tue, Apr 08, 2025 at 12:17:52PM +0200, David Hildenbrand wrote:
>>>> On 08.04.25 10:41, Oscar Salvador wrote:
>>>>> Currently, slab_mem_going_going_callback() checks whether the node has
>>>>> N_NORMAL memory in order to be set in slab_nodes.
>>>>> While it is true that gettind rid of that enforcing would mean
>>>>> ending up with movables nodes in slab_nodes, the memory waste that comes
>>>>> with that is negligible.
>>>>>
>>>>> So stop checking for status_change_nid_normal and just use status_change_nid
>>>>> instead which works for both types of memory.
>>>>>
>>>>> Also, once we allocate the kmem_cache_node cache  for the node in
>>>>> slab_mem_online_callback(), we never deallocate it in
>>>>> slab_mem_off_callback() when the node goes memoryless, so we can just
>>>>> get rid of it.
>>>>>
>>>>> The only side effect is that we will stop clearing the node from slab_nodes.
>>>>>
>>>>
>>>> Feel free to add a Suggested-by: if you think it applies.
>>>>
>>>>
>>>> Do we have to take care of the N_NORMAL_MEMORY check in kmem_cache_init() ? Likely it
>>>> would have to be a N_MEMORY check.
>>>>
>>>>
>>>> But, I was wondering if we could get rid of the "slab_nodes" thingy as a first step?
>>>
>>> The following commit says that SLUB has slab_nodes thingy for a reason...
>>> kmem_cache_node might not be ready yet even when N_NORMAL_MEMORY check
>>> says it now has normal memory.
>>
>> node_states_set_node() is called from memory hotplug code after
>> MEM_GOING_ONLINE and after online_pages_range().
>>
>> Pages might be isolated at that point, but node_states_set_node() is set
>> only after the memory notifier (MEM_GOING_ONLINE) was triggered.
>>
>> So I don't immediately see the problem assuming that we never free the
>> structures.
> 
> Hm, I think it still exists. So consider:

Ah, now I understand the problem, thanks for pointing that out.

> 
> - slab_mem_going_online_callback() creates kmem_cache_node for the new node
> for existing caches under the mutex
> 
> Then a cache creation races with "node_states_set_node() is set only after
> the memory notifier (MEM_GOING_ONLINE) was triggered" in a way that it
> doesn't see the node set yet - kmem_cache_node for the new node on the new
> cache will be missing.
> 
> The root issue is node_states_set_node() doesn't happen under slab_mutex.
> slab_nodes update happens under the slab_mutex to avoid this race.

We could, by grabbing the mutex in MEM_GOING_ONLINE and releasing it in 
MEM_CANCEL_ONLINE / MEM_ONLINE when a node is going online.

We use something similar in virtio-mem code to sync over the whole 
onlining period.

Of course, if someone tried to create a cache from inside a notifier, 
it would be problematic (unlikely, but worth pointing out).

Onlining a node is not a particularly common operation, so maybe ... 
good enough.

Let me think about other alternatives that require minimal adjustments.
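
Roughly something like this (sketch only; assumes the mutex_lock()/unlock()
currently inside slab_mem_going_online_callback() would move out here,
otherwise we'd deadlock):

	static int slab_memory_callback(struct notifier_block *self,
					unsigned long action, void *arg)
	{
		struct memory_notify *marg = arg;
		int ret = 0;

		switch (action) {
		case MEM_GOING_ONLINE:
			/* Hold slab_mutex across the whole onlining window. */
			if (marg->status_change_nid >= 0)
				mutex_lock(&slab_mutex);
			ret = slab_mem_going_online_callback(arg);
			break;
		case MEM_ONLINE:
		case MEM_CANCEL_ONLINE:
			if (marg->status_change_nid >= 0)
				mutex_unlock(&slab_mutex);
			break;
		case MEM_GOING_OFFLINE:
			ret = slab_mem_going_offline_callback(arg);
			break;
		default:
			break;
		}
		return notifier_from_errno(ret);
	}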

Patch

diff --git a/mm/slub.c b/mm/slub.c
index b46f87662e71..e716b4cb2d0e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -6164,36 +6164,12 @@  static int slab_mem_going_offline_callback(void *arg)
 	return 0;
 }
 
-static void slab_mem_offline_callback(void *arg)
-{
-	struct memory_notify *marg = arg;
-	int offline_node;
-
-	offline_node = marg->status_change_nid_normal;
-
-	/*
-	 * If the node still has available memory. we need kmem_cache_node
-	 * for it yet.
-	 */
-	if (offline_node < 0)
-		return;
-
-	mutex_lock(&slab_mutex);
-	node_clear(offline_node, slab_nodes);
-	/*
-	 * We no longer free kmem_cache_node structures here, as it would be
-	 * racy with all get_node() users, and infeasible to protect them with
-	 * slab_mutex.
-	 */
-	mutex_unlock(&slab_mutex);
-}
-
 static int slab_mem_going_online_callback(void *arg)
 {
 	struct kmem_cache_node *n;
 	struct kmem_cache *s;
 	struct memory_notify *marg = arg;
-	int nid = marg->status_change_nid_normal;
+	int nid = marg->status_change_nid;
 	int ret = 0;
 
 	/*
@@ -6251,10 +6227,6 @@  static int slab_memory_callback(struct notifier_block *self,
 	case MEM_GOING_OFFLINE:
 		ret = slab_mem_going_offline_callback(arg);
 		break;
-	case MEM_OFFLINE:
-	case MEM_CANCEL_ONLINE:
-		slab_mem_offline_callback(arg);
-		break;
 	case MEM_ONLINE:
 	case MEM_CANCEL_OFFLINE:
 		break;