diff mbox series

[v2,2/4] mm/mempolicy: Support memory hotplug in weighted interleave

Message ID 20250312075628.648-2-rakie.kim@sk.com
State New
Headers show
Series [v2,1/4] mm/mempolicy: Fix memory leaks in mempolicy_sysfs_init() | expand

Commit Message

Rakie Kim March 12, 2025, 7:56 a.m. UTC
The weighted interleave policy distributes page allocations across multiple
NUMA nodes based on their performance weight, thereby optimizing memory
bandwidth utilization. The weight values for each node are configured
through sysfs.

Previously, the sysfs entries for configuring weighted interleave were only
created during initialization. This approach had several limitations:
- Sysfs entries were generated for all possible nodes at boot time,
  including nodes without memory, leading to unnecessary sysfs creation.
- Some memory devices transition to an online state after initialization,
  but the existing implementation failed to create sysfs entries for
  these dynamically added nodes. As a result, memory hotplugged nodes
  were not properly recognized by the weighed interleave mechanism.

To resolve these issues, this patch introduces two key improvements:
1) At initialization, only nodes that are online and have memory are
   recognized, preventing the creation of unnecessary sysfs entries.
2) Nodes that become available after initialization are dynamically
   detected and integrated through the memory hotplug mechanism.

With this enhancement, the weighted interleave policy now properly supports
memory hotplug, ensuring that newly added nodes are recognized and sysfs
entries are created accordingly.

Signed-off-by: Rakie Kim <rakie.kim@sk.com>
---
 mm/mempolicy.c | 47 ++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 42 insertions(+), 5 deletions(-)

Comments

Gregory Price March 12, 2025, 4:03 p.m. UTC | #1
On Wed, Mar 12, 2025 at 04:56:25PM +0900, Rakie Kim wrote:
> The weighted interleave policy distributes page allocations across multiple
> NUMA nodes based on their performance weight, thereby optimizing memory
> bandwidth utilization. The weight values for each node are configured
> through sysfs.
> 
> Previously, the sysfs entries for configuring weighted interleave were only
> created during initialization. This approach had several limitations:
> - Sysfs entries were generated for all possible nodes at boot time,
>   including nodes without memory, leading to unnecessary sysfs creation.

It's not that it's unnecessary, it's that it allowed for configuration
of nodes which may not have memory now but may have memory in the
future.  This was not well documented.

> - Some memory devices transition to an online state after initialization,
>   but the existing implementation failed to create sysfs entries for
>   these dynamically added nodes. As a result, memory hotplugged nodes
>   were not properly recognized by the weighed interleave mechanism.
> 

The current system creates 1 node per N_POSSIBLE nodes, and since nodes
can't transition between possible and not-possible your claims here are
contradictory.

I think you mean that simply switching from N_POSSIBLE to N_MEMORY is
insufficient since nodes may transition in and out of the N_MEMORY
state.  Therefore this patch utilizes a hotplug callback to add and
remove sysfs entries based on whether a node is in the N_MEMORY set.

> To resolve these issues, this patch introduces two key improvements:
> 1) At initialization, only nodes that are online and have memory are
>    recognized, preventing the creation of unnecessary sysfs entries.
> 2) Nodes that become available after initialization are dynamically
>    detected and integrated through the memory hotplug mechanism.
> 
> With this enhancement, the weighted interleave policy now properly supports
> memory hotplug, ensuring that newly added nodes are recognized and sysfs
> entries are created accordingly.
>

It doesn't "support memory hotplug" so much as it "Minimizes weighted
interleave to exclude memoryless nodes".  

> Signed-off-by: Rakie Kim <rakie.kim@sk.com>
> ---
>  mm/mempolicy.c | 47 ++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 42 insertions(+), 5 deletions(-)
> 
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 1691748badb2..94efff89e0be 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -113,6 +113,7 @@
>  #include <asm/tlbflush.h>
>  #include <asm/tlb.h>
>  #include <linux/uaccess.h>
> +#include <linux/memory.h>
>  
>  #include "internal.h"
>  
> @@ -3489,9 +3490,38 @@ static int add_weight_node(int nid, struct kobject *wi_kobj)
>  	return 0;
>  }
>  
> +struct kobject *wi_kobj;
> +
> +static int wi_node_notifier(struct notifier_block *nb,
> +			       unsigned long action, void *data)
> +{
> +	int err;
> +	struct memory_notify *arg = data;
> +	int nid = arg->status_change_nid;
> +
> +	if (nid < 0)
> +		goto notifier_end;
> +
> +	switch(action) {
> +	case MEM_ONLINE:
> +		err = add_weight_node(nid, wi_kobj);
> +		if (err) {
> +			pr_err("failed to add sysfs [node%d]\n", nid);
> +			kobject_put(wi_kobj);
> +			return NOTIFY_BAD;
> +		}
> +		break;
> +	case MEM_OFFLINE:
> +		sysfs_wi_node_release(node_attrs[nid], wi_kobj);
> +		break;
> +	}

I'm fairly certain this logic is wrong.  If I add two memory blocks and
then remove one, would this logic not remove the sysfs entries despite
there being a block remaining?

> +
> +notifier_end:
> +	return NOTIFY_OK;
> +}
> +
>  static int add_weighted_interleave_group(struct kobject *root_kobj)
>  {
> -	struct kobject *wi_kobj;
>  	int nid, err;
>  
>  	wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
> @@ -3505,16 +3535,23 @@ static int add_weighted_interleave_group(struct kobject *root_kobj)
>  		return err;
>  	}
>  
> -	for_each_node_state(nid, N_POSSIBLE) {
> +	for_each_online_node(nid) {
> +		if (!node_state(nid, N_MEMORY))

Rather than online node, why not just add for each N_MEMORY node -
regardless of if its memory is online or not?  If the memory is offline,
then it will be excluded from the weighted interleave mechanism by
nature of the node being invalid for allocations anyway.

> +			continue;
> +
>  		err = add_weight_node(nid, wi_kobj);
>  		if (err) {
>  			pr_err("failed to add sysfs [node%d]\n", nid);
> -			break;
> +			goto err_out;
>  		}
>  	}
> -	if (err)
> -		kobject_put(wi_kobj);
> +
> +	hotplug_memory_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
>  	return 0;
> +
> +err_out:
> +	kobject_put(wi_kobj);
> +	return err;
>  }
>  
>  static void mempolicy_kobj_release(struct kobject *kobj)
> -- 
> 2.34.1
>
Rakie Kim March 13, 2025, 6:33 a.m. UTC | #2
On Wed, 12 Mar 2025 12:03:01 -0400 Gregory Price <gourry@gourry.net> wrote:
> On Wed, Mar 12, 2025 at 04:56:25PM +0900, Rakie Kim wrote:
> > The weighted interleave policy distributes page allocations across multiple
> > NUMA nodes based on their performance weight, thereby optimizing memory
> > bandwidth utilization. The weight values for each node are configured
> > through sysfs.
> > 
> > Previously, the sysfs entries for configuring weighted interleave were only
> > created during initialization. This approach had several limitations:
> > - Sysfs entries were generated for all possible nodes at boot time,
> >   including nodes without memory, leading to unnecessary sysfs creation.
> 
> It's not that it's unnecessary, it's that it allowed for configuration
> of nodes which may not have memory now but may have memory in the
> future.  This was not well documented.

I will update the commit message to reflect your feedback.

> 
> > - Some memory devices transition to an online state after initialization,
> >   but the existing implementation failed to create sysfs entries for
> >   these dynamically added nodes. As a result, memory hotplugged nodes
> >   were not properly recognized by the weighed interleave mechanism.
> > 
> 
> The current system creates 1 node per N_POSSIBLE nodes, and since nodes
> can't transition between possible and not-possible your claims here are
> contradictory.
> 
> I think you mean that simply switching from N_POSSIBLE to N_MEMORY is
> insufficient since nodes may transition in and out of the N_MEMORY
> state.  Therefore this patch utilizes a hotplug callback to add and
> remove sysfs entries based on whether a node is in the N_MEMORY set.

I will update the commit message to reflect your feedback.

> 
> > To resolve these issues, this patch introduces two key improvements:
> > 1) At initialization, only nodes that are online and have memory are
> >    recognized, preventing the creation of unnecessary sysfs entries.
> > 2) Nodes that become available after initialization are dynamically
> >    detected and integrated through the memory hotplug mechanism.
> > 
> > With this enhancement, the weighted interleave policy now properly supports
> > memory hotplug, ensuring that newly added nodes are recognized and sysfs
> > entries are created accordingly.
> >
> 
> It doesn't "support memory hotplug" so much as it "Minimizes weighted
> interleave to exclude memoryless nodes".

I will update the commit message to reflect your feedback.

> 
> > Signed-off-by: Rakie Kim <rakie.kim@sk.com>
> > ---
> >  mm/mempolicy.c | 47 ++++++++++++++++++++++++++++++++++++++++++-----
> >  1 file changed, 42 insertions(+), 5 deletions(-)
> > 
> > diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> > index 1691748badb2..94efff89e0be 100644
> > --- a/mm/mempolicy.c
> > +++ b/mm/mempolicy.c
> > @@ -113,6 +113,7 @@
> >  #include <asm/tlbflush.h>
> >  #include <asm/tlb.h>
> >  #include <linux/uaccess.h>
> > +#include <linux/memory.h>
> >  
> >  #include "internal.h"
> >  
> > @@ -3489,9 +3490,38 @@ static int add_weight_node(int nid, struct kobject *wi_kobj)
> >  	return 0;
> >  }
> >  
> > +struct kobject *wi_kobj;
> > +
> > +static int wi_node_notifier(struct notifier_block *nb,
> > +			       unsigned long action, void *data)
> > +{
> > +	int err;
> > +	struct memory_notify *arg = data;
> > +	int nid = arg->status_change_nid;
> > +
> > +	if (nid < 0)
> > +		goto notifier_end;
> > +
> > +	switch(action) {
> > +	case MEM_ONLINE:
> > +		err = add_weight_node(nid, wi_kobj);
> > +		if (err) {
> > +			pr_err("failed to add sysfs [node%d]\n", nid);
> > +			kobject_put(wi_kobj);
> > +			return NOTIFY_BAD;
> > +		}
> > +		break;
> > +	case MEM_OFFLINE:
> > +		sysfs_wi_node_release(node_attrs[nid], wi_kobj);
> > +		break;
> > +	}
> 
> I'm fairly certain this logic is wrong.  If I add two memory blocks and
> then remove one, would this logic not remove the sysfs entries despite
> there being a block remaining?

Regarding the assumption about node configuration:
Are you assuming that a node has two memory blocks and that
MEM_OFFLINE is triggered when one of them is offlined? If so, then
you are correct that this logic would need modification.

I performed a simple test by offlining a single memory block:
# echo 0 > /sys/devices/system/node/node2/memory100/online

In this case, MEM_OFFLINE was not triggered. However, I need to
conduct further analysis to confirm this behavior under different
conditions. I will review this in more detail and share my
findings, including the test methodology and results.

> 
> > +
> > +notifier_end:
> > +	return NOTIFY_OK;
> > +}
> > +
> >  static int add_weighted_interleave_group(struct kobject *root_kobj)
> >  {
> > -	struct kobject *wi_kobj;
> >  	int nid, err;
> >  
> >  	wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
> > @@ -3505,16 +3535,23 @@ static int add_weighted_interleave_group(struct kobject *root_kobj)
> >  		return err;
> >  	}
> >  
> > -	for_each_node_state(nid, N_POSSIBLE) {
> > +	for_each_online_node(nid) {
> > +		if (!node_state(nid, N_MEMORY))
> 
> Rather than online node, why not just add for each N_MEMORY node -
> regardless of if its memory is online or not?  If the memory is offline,
> then it will be excluded from the weighted interleave mechanism by
> nature of the node being invalid for allocations anyway.

Regarding the decision to check both N_MEMORY and N_ONLINE:
This was done to ensure consistency with the conditions under which
`wi_node_notifier` is triggered. Specifically, `MEM_ONLINE` is called
only when a node is in both the N_MEMORY and N_ONLINE states.

I will review this logic further. If my understanding is correct,
keeping the current implementation is the appropriate approach.
However, I will conduct additional testing to validate this and
provide further updates accordingly.

> 
> > +			continue;
> > +
> >  		err = add_weight_node(nid, wi_kobj);
> >  		if (err) {
> >  			pr_err("failed to add sysfs [node%d]\n", nid);
> > -			break;
> > +			goto err_out;
> >  		}
> >  	}
> > -	if (err)
> > -		kobject_put(wi_kobj);
> > +
> > +	hotplug_memory_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
> >  	return 0;
> > +
> > +err_out:
> > +	kobject_put(wi_kobj);
> > +	return err;
> >  }
> >  
> >  static void mempolicy_kobj_release(struct kobject *kobj)
> > -- 
> > 2.34.1
> >
Gregory Price March 13, 2025, 4:23 p.m. UTC | #3
On Thu, Mar 13, 2025 at 03:33:37PM +0900, Rakie Kim wrote:
> > I'm fairly certain this logic is wrong.  If I add two memory blocks and
> > then remove one, would this logic not remove the sysfs entries despite
> > there being a block remaining?
> 
> Regarding the assumption about node configuration:
> Are you assuming that a node has two memory blocks and that
> MEM_OFFLINE is triggered when one of them is offlined? If so, then
> you are correct that this logic would need modification.
> 
> I performed a simple test by offlining a single memory block:
> # echo 0 > /sys/devices/system/node/node2/memory100/online
> 
> In this case, MEM_OFFLINE was not triggered. However, I need to
> conduct further analysis to confirm this behavior under different
> conditions. I will review this in more detail and share my
> findings, including the test methodology and results.
>

+David - might have a quick answer to this.  I would have expected a
single memory block going offline to cause a notification.


I think the logic we care about is here:

static void node_states_check_changes_online(unsigned long nr_pages,
        struct zone *zone, struct memory_notify *arg)
{
        int nid = zone_to_nid(zone);

        arg->status_change_nid = NUMA_NO_NODE;
        arg->status_change_nid_normal = NUMA_NO_NODE;

        if (!node_state(nid, N_MEMORY))
                arg->status_change_nid = nid;
        if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
                arg->status_change_nid_normal = nid;
}

static void node_states_set_node(int node, struct memory_notify *arg)
{
        if (arg->status_change_nid_normal >= 0)
                node_set_state(node, N_NORMAL_MEMORY);

        if (arg->status_change_nid >= 0)
                node_set_state(node, N_MEMORY);
}

int online_pages(unsigned long pfn, unsigned long nr_pages,
                       struct zone *zone, struct memory_group *group)
{
	...
        node_states_check_changes_online(nr_pages, zone, &arg);
	...
        node_states_set_node(nid, &arg);
	...
        memory_notify(MEM_ONLINE, &arg);
}

In the callback i think you want to check whether N_MEMORY is set

+	case MEM_OFFLINE:
++              if (node is !N_MEMORY)
++			sysfs_wi_node_release(node_attrs[nid], wi_kobj);
+		break;
+	}

Similar with online (don't want to double-add).


also from what I can tell, N_MEMORY implies N_ONLINE because N_ONLINE
occurs when memory blocks are added (regardless of state).

~Gregory
David Hildenbrand March 13, 2025, 10:36 p.m. UTC | #4
On 13.03.25 17:23, Gregory Price wrote:
> On Thu, Mar 13, 2025 at 03:33:37PM +0900, Rakie Kim wrote:
>>> I'm fairly certain this logic is wrong.  If I add two memory blocks and
>>> then remove one, would this logic not remove the sysfs entries despite
>>> there being a block remaining?
>>
>> Regarding the assumption about node configuration:
>> Are you assuming that a node has two memory blocks and that
>> MEM_OFFLINE is triggered when one of them is offlined? If so, then
>> you are correct that this logic would need modification.
>>
>> I performed a simple test by offlining a single memory block:
>> # echo 0 > /sys/devices/system/node/node2/memory100/online
>>
>> In this case, MEM_OFFLINE was not triggered. However, I need to
>> conduct further analysis to confirm this behavior under different
>> conditions. I will review this in more detail and share my
>> findings, including the test methodology and results.
>>
> 
> +David - might have a quick answer to this.  I would have expected a
> single memory block going offline to cause a notification.

Yes. Unless offlining failed, or the block was already offline :)

If it doesn't happen for an actual online memory block that is offline 
after the call, we would have a bug.
Rakie Kim March 14, 2025, 6 a.m. UTC | #5
On Thu, 13 Mar 2025 23:36:47 +0100 David Hildenbrand <david@redhat.com> wrote:
> On 13.03.25 17:23, Gregory Price wrote:
> > On Thu, Mar 13, 2025 at 03:33:37PM +0900, Rakie Kim wrote:
> >>> I'm fairly certain this logic is wrong.  If I add two memory blocks and
> >>> then remove one, would this logic not remove the sysfs entries despite
> >>> there being a block remaining?
> >>
> >> Regarding the assumption about node configuration:
> >> Are you assuming that a node has two memory blocks and that
> >> MEM_OFFLINE is triggered when one of them is offlined? If so, then
> >> you are correct that this logic would need modification.
> >>
> >> I performed a simple test by offlining a single memory block:
> >> # echo 0 > /sys/devices/system/node/node2/memory100/online
> >>
> >> In this case, MEM_OFFLINE was not triggered. However, I need to
> >> conduct further analysis to confirm this behavior under different
> >> conditions. I will review this in more detail and share my
> >> findings, including the test methodology and results.
> >>
> > 
> > +David - might have a quick answer to this.  I would have expected a
> > single memory block going offline to cause a notification.
> 
> Yes. Unless offlining failed, or the block was already offline :)
> 
> If it doesn't happen for an actual online memory block that is offline 
> after the call, we would have a bug.
> 
> 
> -- 
> Cheers,
> 
> David / dhildenb
>

Hi David,

I am currently working on adding memory hotplug-related functionality
to the weighted interleave feature. While discussing this with Gregory,
a question came up. If you happen to know the answer to the following,
I would greatly appreciate your input.

I have added the following logic to call add_weight_node when a node
transitions to the N_MEMORY state to create a sysfs entry. Conversely,
when all memory blocks of a node go offline (!N_MEMORY),
I call sysfs_wi_node_release to remove the corresponding sysfs entry.

+static int wi_node_notifier(struct notifier_block *nb,
+                              unsigned long action, void *data)
+{
+       int err;
+       struct memory_notify *arg = data;
+       int nid = arg->status_change_nid;
+
+       if (nid < 0)
+               goto notifier_end;
+
+       switch(action) {
+       case MEM_ONLINE:
+               err = add_weight_node(nid, wi_kobj);
+               if (err) {
+                       pr_err("failed to add sysfs [node%d]\n", nid);
+                       kobject_put(wi_kobj);
+                       return NOTIFY_BAD;
+               }
+               break;
+       case MEM_OFFLINE:
+               sysfs_wi_node_release(node_attrs[nid], wi_kobj);
+               break;
+       }
+
+notifier_end:
+       return NOTIFY_OK;
+}

One question I have is whether the MEM_OFFLINE action in the code
below will be triggered when a node that consists of multiple memory
blocks has only one of its memory blocks transitioning to the offline state.

+       int nid = arg->status_change_nid;
+
+       if (nid < 0)
+               goto notifier_end;

Based on my analysis, wi_node_notifier should function as expected
because arg->status_change_nid only holds a non-negative value
under the following conditions:

1) !N_MEMORY -> N_MEMORY
   When the first memory block of a node transitions to the online state,
   it holds a non-negative value.
   In all other cases, it remains -1 (NUMA_NO_NODE).

2) N_MEMORY -> !N_MEMORY
   When all memory blocks of a node transition to the offline state,
   it holds a non-negative value.
   In all other cases, it remains -1 (NUMA_NO_NODE).

I would truly appreciate it if you could confirm whether this analysis is correct.
Below is a more detailed explanation of my findings.

<memory block online>
- The callback function registered in hotplug_memory_notifier
  receives the MEM_ONLINE action in online_pages.
int online_pages(unsigned long pfn, unsigned long nr_pages,
		       struct zone *zone, struct memory_group *group)
{
	struct memory_notify arg;
...
	node_states_check_changes_online(nr_pages, zone, &arg);
	ret = memory_notify(MEM_GOING_ONLINE, &arg);
...
	node_states_set_node(nid, &arg);
...
	memory_notify(MEM_ONLINE, &arg);
}

- If the node is in the !N_MEMORY state,
  arg->status_change_nid is set to the node ID.
static void node_states_check_changes_online(unsigned long nr_pages,
	struct zone *zone, struct memory_notify *arg)
{
	int nid = zone_to_nid(zone);

	arg->status_change_nid = NUMA_NO_NODE;
	arg->status_change_nid_normal = NUMA_NO_NODE;

	if (!node_state(nid, N_MEMORY))
		arg->status_change_nid = nid;
...
}
- If arg->status_change_nid >= 0, the node transitions to the N_MEMORY state.
static void node_states_set_node(int node, struct memory_notify *arg)
{
...
	if (arg->status_change_nid >= 0)
		node_set_state(node, N_MEMORY);
}


<memory block offline>
- The callback function registered in hotplug_memory_notifier
  receives the MEM_OFFLINE action in offline_pages.
int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
			struct zone *zone, struct memory_group *group)
{
	struct memory_notify arg;
...
	node_states_check_changes_offline(nr_pages, zone, &arg);
	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
...
	node_states_clear_node(node, &arg);
...
	memory_notify(MEM_OFFLINE, &arg);
}

- If the node becomes empty,
  arg->status_change_nid is set to the node ID.
static void node_states_check_changes_offline(unsigned long nr_pages,
		struct zone *zone, struct memory_notify *arg)
{
...
	/*
	 * We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM
	 * does not apply as we don't support 32bit.
	 * Here we count the possible pages from ZONE_MOVABLE.
	 * If after having accounted all the pages, we see that the nr_pages
	 * to be offlined is over or equal to the accounted pages,
	 * we know that the node will become empty, and so, we can clear
	 * it for N_MEMORY as well.
	 */
	present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages;

	if (nr_pages >= present_pages)
		arg->status_change_nid = zone_to_nid(zone);
}

- If arg->status_change_nid >= 0,
  the node transitions to the !N_MEMORY state.
static void node_states_clear_node(int node, struct memory_notify *arg)
{
...
	if (arg->status_change_nid >= 0)
		node_clear_state(node, N_MEMORY);
}

Thank you very much for taking the time to read this lengthy email.

Rakie
David Hildenbrand March 14, 2025, 9:17 a.m. UTC | #6
> Hi David,

Hi :)

> 
> I am currently working on adding memory hotplug-related functionality
> to the weighted interleave feature. While discussing this with Gregory,
> a question came up. If you happen to know the answer to the following,
> I would greatly appreciate your input.
> 
> I have added the following logic to call add_weight_node when a node
> transitions to the N_MEMORY state to create a sysfs entry. Conversely,
> when all memory blocks of a node go offline (!N_MEMORY),
> I call sysfs_wi_node_release to remove the corresponding sysfs entry.
> 

As a spoiler: I don't like how we squeezed the status_change_nid / 
status_change_nid_normal stuff into the memory notifier that works on a 
single memory block -> single zone. But decoupling it is not as easy, 
because we have this status_change_nid vs. status_change_nid_normal thing.

For the basic "node going offline / node going online", a separate 
notifier (or separate callbacks) would make at least your use case much 
clearer.

The whole "status_change_nid_normal" is only used by slab. I wonder if 
we could get rid of it, and simply let slab check the relevant 
zone->present pages when notified about onlining/offlining of a singe 
memory block.

Then, we would only have status_change_nid and could just convert that 
to a NODE_MEM_ONLINE / NODE_MEM_OFFLINE notifier or sth like that.

Hmmm, if I wouldn't be on PTO, I would prototype that real quick :)

> +static int wi_node_notifier(struct notifier_block *nb,
> +                              unsigned long action, void *data)
> +{
> +       int err;
> +       struct memory_notify *arg = data;
> +       int nid = arg->status_change_nid;
> +
> +       if (nid < 0)
> +               goto notifier_end;
> +
> +       switch(action) {
> +       case MEM_ONLINE:
> +               err = add_weight_node(nid, wi_kobj);
> +               if (err) {
> +                       pr_err("failed to add sysfs [node%d]\n", nid);
> +                       kobject_put(wi_kobj);
> +                       return NOTIFY_BAD;
> +               }
> +               break;
> +       case MEM_OFFLINE:
> +               sysfs_wi_node_release(node_attrs[nid], wi_kobj);
> +               break;
> +       }
> +
> +notifier_end:
> +       return NOTIFY_OK;
> +}
> 
> One question I have is whether the MEM_OFFLINE action in the code
> below will be triggered when a node that consists of multiple memory
> blocks has only one of its memory blocks transitioning to the offline state.
> 

node_states_check_changes_offline() should be making sure that that is 
the case.

Only if offlining the current block will make the node (all zones) have 
no present pages any more, then we'll set status_change_nid to >= 0.


> +       int nid = arg->status_change_nid;
> +
> +       if (nid < 0)
> +               goto notifier_end;
> 
> Based on my analysis, wi_node_notifier should function as expected
> because arg->status_change_nid only holds a non-negative value
> under the following conditions:
> 
> 1) !N_MEMORY -> N_MEMORY
>     When the first memory block of a node transitions to the online state,
>     it holds a non-negative value.
>     In all other cases, it remains -1 (NUMA_NO_NODE).
> 
> 2) N_MEMORY -> !N_MEMORY
>     When all memory blocks of a node transition to the offline state,
>     it holds a non-negative value.
>     In all other cases, it remains -1 (NUMA_NO_NODE).
> 
> I would truly appreciate it if you could confirm whether this analysis is correct.
> Below is a more detailed explanation of my findings.
> 

Yes, that's at least how it is supposed to work (-bugs, but I am not 
aware of any) :)
Rakie Kim March 17, 2025, 8:23 a.m. UTC | #7
On Fri, 14 Mar 2025 10:17:26 +0100 David Hildenbrand <david@redhat.com> wrote:
> > Hi David,
> 
> Hi :)
> 
> > 
> > I am currently working on adding memory hotplug-related functionality
> > to the weighted interleave feature. While discussing this with Gregory,
> > a question came up. If you happen to know the answer to the following,
> > I would greatly appreciate your input.
> > 
> > I have added the following logic to call add_weight_node when a node
> > transitions to the N_MEMORY state to create a sysfs entry. Conversely,
> > when all memory blocks of a node go offline (!N_MEMORY),
> > I call sysfs_wi_node_release to remove the corresponding sysfs entry.
> > 
> 
> As a spoiler: I don't like how we squeezed the status_change_nid / 
> status_change_nid_normal stuff into the memory notifier that works on a 
> single memory block -> single zone. But decoupling it is not as easy, 
> because we have this status_change_nid vs. status_change_nid_normal thing.
> 
> For the basic "node going offline / node going online", a separate 
> notifier (or separate callbacks) would make at least your use case much 
> clearer.
> 
> The whole "status_change_nid_normal" is only used by slab. I wonder if 
> we could get rid of it, and simply let slab check the relevant 
> zone->present pages when notified about onlining/offlining of a singe 
> memory block.
> 
> Then, we would only have status_change_nid and could just convert that 
> to a NODE_MEM_ONLINE / NODE_MEM_OFFLINE notifier or sth like that.
> 
> Hmmm, if I wouldn't be on PTO, I would prototype that real quick :)

Hi David :)

I completely agree with your perspective on this. Having separate callbacks
for "node going offline/node going online" would certainly lead to clearer
code. For now, I shall proceed with developing the code based on the current
structure. I will also continue monitoring updates related to "node online/
offline" and plan on revising the code once those are integrated.
Thank you for your valuable input on this matter.

> 
> > +static int wi_node_notifier(struct notifier_block *nb,
> > +                              unsigned long action, void *data)
> > +{
> > +       int err;
> > +       struct memory_notify *arg = data;
> > +       int nid = arg->status_change_nid;
> > +
> > +       if (nid < 0)
> > +               goto notifier_end;
> > +
> > +       switch(action) {
> > +       case MEM_ONLINE:
> > +               err = add_weight_node(nid, wi_kobj);
> > +               if (err) {
> > +                       pr_err("failed to add sysfs [node%d]\n", nid);
> > +                       kobject_put(wi_kobj);
> > +                       return NOTIFY_BAD;
> > +               }
> > +               break;
> > +       case MEM_OFFLINE:
> > +               sysfs_wi_node_release(node_attrs[nid], wi_kobj);
> > +               break;
> > +       }
> > +
> > +notifier_end:
> > +       return NOTIFY_OK;
> > +}
> > 
> > One question I have is whether the MEM_OFFLINE action in the code
> > below will be triggered when a node that consists of multiple memory
> > blocks has only one of its memory blocks transitioning to the offline state.
> > 
> 
> node_states_check_changes_offline() should be making sure that that is 
> the case.
> 
> Only if offlining the current block will make the node (all zones) have 
> no present pages any more, then we'll set status_change_nid to >= 0.
> 

Thank you for reviewing this matter.

> 
> > +       int nid = arg->status_change_nid;
> > +
> > +       if (nid < 0)
> > +               goto notifier_end;
> > 
> > Based on my analysis, wi_node_notifier should function as expected
> > because arg->status_change_nid only holds a non-negative value
> > under the following conditions:
> > 
> > 1) !N_MEMORY -> N_MEMORY
> >     When the first memory block of a node transitions to the online state,
> >     it holds a non-negative value.
> >     In all other cases, it remains -1 (NUMA_NO_NODE).
> > 
> > 2) N_MEMORY -> !N_MEMORY
> >     When all memory blocks of a node transition to the offline state,
> >     it holds a non-negative value.
> >     In all other cases, it remains -1 (NUMA_NO_NODE).
> > 
> > I would truly appreciate it if you could confirm whether this analysis is correct.
> > Below is a more detailed explanation of my findings.
> > 
> 
> Yes, that's at least how it is supposed to work (-bugs, but I am not 
> aware of any) :)
> 

Thank you once again for reviewing this matter. Your insightful feedback has been
instrumental in crafting a more robust structure.

Rakie

> -- 
> Cheers,
> 
> David / dhildenb
>
diff mbox series

Patch

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 1691748badb2..94efff89e0be 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -113,6 +113,7 @@ 
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
 #include <linux/uaccess.h>
+#include <linux/memory.h>
 
 #include "internal.h"
 
@@ -3489,9 +3490,38 @@  static int add_weight_node(int nid, struct kobject *wi_kobj)
 	return 0;
 }
 
+struct kobject *wi_kobj;
+
+static int wi_node_notifier(struct notifier_block *nb,
+			       unsigned long action, void *data)
+{
+	int err;
+	struct memory_notify *arg = data;
+	int nid = arg->status_change_nid;
+
+	if (nid < 0)
+		goto notifier_end;
+
+	switch(action) {
+	case MEM_ONLINE:
+		err = add_weight_node(nid, wi_kobj);
+		if (err) {
+			pr_err("failed to add sysfs [node%d]\n", nid);
+			kobject_put(wi_kobj);
+			return NOTIFY_BAD;
+		}
+		break;
+	case MEM_OFFLINE:
+		sysfs_wi_node_release(node_attrs[nid], wi_kobj);
+		break;
+	}
+
+notifier_end:
+	return NOTIFY_OK;
+}
+
 static int add_weighted_interleave_group(struct kobject *root_kobj)
 {
-	struct kobject *wi_kobj;
 	int nid, err;
 
 	wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
@@ -3505,16 +3535,23 @@  static int add_weighted_interleave_group(struct kobject *root_kobj)
 		return err;
 	}
 
-	for_each_node_state(nid, N_POSSIBLE) {
+	for_each_online_node(nid) {
+		if (!node_state(nid, N_MEMORY))
+			continue;
+
 		err = add_weight_node(nid, wi_kobj);
 		if (err) {
 			pr_err("failed to add sysfs [node%d]\n", nid);
-			break;
+			goto err_out;
 		}
 	}
-	if (err)
-		kobject_put(wi_kobj);
+
+	hotplug_memory_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
 	return 0;
+
+err_out:
+	kobject_put(wi_kobj);
+	return err;
 }
 
 static void mempolicy_kobj_release(struct kobject *kobj)