Message ID | 20250404074623.1179-4-rakie.kim@sk.com |
---|---|
State | Superseded |
Headers | show |
Series | Enhance sysfs handling for memory hotplug in weighted interleave | expand |
On Fri, Apr 04, 2025 at 04:46:21PM +0900, Rakie Kim wrote: > The weighted interleave policy distributes page allocations across multiple > NUMA nodes based on their performance weight, thereby improving memory > bandwidth utilization. The weight values for each node are configured > through sysfs. > > Previously, sysfs entries for configuring weighted interleave were created > for all possible nodes (N_POSSIBLE) at initialization, including nodes that > might not have memory. However, not all nodes in N_POSSIBLE are usable at > runtime, as some may remain memoryless or offline. > This led to sysfs entries being created for unusable nodes, causing > potential misconfiguration issues. > > To address this issue, this patch modifies the sysfs creation logic to: > 1) Limit sysfs entries to nodes that are online and have memory, avoiding > the creation of sysfs entries for nodes that cannot be used. > 2) Support memory hotplug by dynamically adding and removing sysfs entries > based on whether a node transitions into or out of the N_MEMORY state. > > Additionally, the patch ensures that sysfs attributes are properly managed > when nodes go offline, preventing stale or redundant entries from persisting > in the system. > > By making these changes, the weighted interleave policy now manages its > sysfs entries more efficiently, ensuring that only relevant nodes are > considered for interleaving, and dynamically adapting to memory hotplug > events. > > Signed-off-by: Rakie Kim <rakie.kim@sk.com> > Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> > Signed-off-by: Yunjeong Mun <yunjeong.mun@sk.com> For the memory-hotplug bits: Reviewed-by: Oscar Salvador <osalvador@suse.de> Just one thing that caught my eye: Cannot add_weighted_interleave_group be __init? AFAICS, it only gets called at boot time?
On 04.04.25 09:46, Rakie Kim wrote: > The weighted interleave policy distributes page allocations across multiple > NUMA nodes based on their performance weight, thereby improving memory > bandwidth utilization. The weight values for each node are configured > through sysfs. > > Previously, sysfs entries for configuring weighted interleave were created > for all possible nodes (N_POSSIBLE) at initialization, including nodes that > might not have memory. However, not all nodes in N_POSSIBLE are usable at > runtime, as some may remain memoryless or offline. > This led to sysfs entries being created for unusable nodes, causing > potential misconfiguration issues. > > To address this issue, this patch modifies the sysfs creation logic to: > 1) Limit sysfs entries to nodes that are online and have memory, avoiding > the creation of sysfs entries for nodes that cannot be used. > 2) Support memory hotplug by dynamically adding and removing sysfs entries > based on whether a node transitions into or out of the N_MEMORY state. > > Additionally, the patch ensures that sysfs attributes are properly managed > when nodes go offline, preventing stale or redundant entries from persisting > in the system. > > By making these changes, the weighted interleave policy now manages its > sysfs entries more efficiently, ensuring that only relevant nodes are > considered for interleaving, and dynamically adapting to memory hotplug > events. 
> > Signed-off-by: Rakie Kim <rakie.kim@sk.com> > Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> > Signed-off-by: Yunjeong Mun <yunjeong.mun@sk.com> > --- > mm/mempolicy.c | 109 ++++++++++++++++++++++++++++++++++++++----------- > 1 file changed, 86 insertions(+), 23 deletions(-) > > diff --git a/mm/mempolicy.c b/mm/mempolicy.c > index 73a9405ff352..f25c2c7f8fcf 100644 > --- a/mm/mempolicy.c > +++ b/mm/mempolicy.c > @@ -113,6 +113,7 @@ > #include <asm/tlbflush.h> > #include <asm/tlb.h> > #include <linux/uaccess.h> > +#include <linux/memory.h> > > #include "internal.h" > > @@ -3390,6 +3391,7 @@ struct iw_node_attr { > > struct sysfs_wi_group { > struct kobject wi_kobj; > + struct mutex kobj_lock; > struct iw_node_attr *nattrs[]; > }; > > @@ -3439,13 +3441,24 @@ static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr, > > static void sysfs_wi_node_delete(int nid) > { > - if (!wi_group->nattrs[nid]) > + struct iw_node_attr *attr; > + > + if (nid < 0 || nid >= nr_node_ids) > + return; > + > + mutex_lock(&wi_group->kobj_lock); > + attr = wi_group->nattrs[nid]; > + if (!attr) { > + mutex_unlock(&wi_group->kobj_lock); > return; > + } > + > + wi_group->nattrs[nid] = NULL; > + mutex_unlock(&wi_group->kobj_lock); > > - sysfs_remove_file(&wi_group->wi_kobj, > - &wi_group->nattrs[nid]->kobj_attr.attr); > - kfree(wi_group->nattrs[nid]->kobj_attr.attr.name); > - kfree(wi_group->nattrs[nid]); > + sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr); > + kfree(attr->kobj_attr.attr.name); > + kfree(attr); > } > > static void sysfs_wi_release(struct kobject *wi_kobj) > @@ -3464,35 +3477,80 @@ static const struct kobj_type wi_ktype = { > > static int sysfs_wi_node_add(int nid) > { > - struct iw_node_attr *node_attr; > + int ret = 0; > char *name; > + struct iw_node_attr *new_attr = NULL; > > - node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL); > - if (!node_attr) > + if (nid < 0 || nid >= nr_node_ids) { > + pr_err("Invalid node id: %d\n", nid); > + 
return -EINVAL; > + } > + > + new_attr = kzalloc(sizeof(struct iw_node_attr), GFP_KERNEL); > + if (!new_attr) > return -ENOMEM; > > name = kasprintf(GFP_KERNEL, "node%d", nid); > if (!name) { > - kfree(node_attr); > + kfree(new_attr); > return -ENOMEM; > } > > - sysfs_attr_init(&node_attr->kobj_attr.attr); > - node_attr->kobj_attr.attr.name = name; > - node_attr->kobj_attr.attr.mode = 0644; > - node_attr->kobj_attr.show = node_show; > - node_attr->kobj_attr.store = node_store; > - node_attr->nid = nid; > + mutex_lock(&wi_group->kobj_lock); > + if (wi_group->nattrs[nid]) { > + mutex_unlock(&wi_group->kobj_lock); > + pr_info("Node [%d] already exists\n", nid); > + kfree(new_attr); > + kfree(name); > + return 0; > + } > + wi_group->nattrs[nid] = new_attr; > > - if (sysfs_create_file(&wi_group->wi_kobj, &node_attr->kobj_attr.attr)) { > - kfree(node_attr->kobj_attr.attr.name); > - kfree(node_attr); > - pr_err("failed to add attribute to weighted_interleave\n"); > - return -ENOMEM; > + sysfs_attr_init(&wi_group->nattrs[nid]->kobj_attr.attr); > + wi_group->nattrs[nid]->kobj_attr.attr.name = name; > + wi_group->nattrs[nid]->kobj_attr.attr.mode = 0644; > + wi_group->nattrs[nid]->kobj_attr.show = node_show; > + wi_group->nattrs[nid]->kobj_attr.store = node_store; > + wi_group->nattrs[nid]->nid = nid; > + > + ret = sysfs_create_file(&wi_group->wi_kobj, > + &wi_group->nattrs[nid]->kobj_attr.attr); > + if (ret) { > + kfree(wi_group->nattrs[nid]->kobj_attr.attr.name); > + kfree(wi_group->nattrs[nid]); > + wi_group->nattrs[nid] = NULL; > + pr_err("Failed to add attribute to weighted_interleave: %d\n", ret); > } > + mutex_unlock(&wi_group->kobj_lock); > > - wi_group->nattrs[nid] = node_attr; > - return 0; > + return ret; > +} > + > +static int wi_node_notifier(struct notifier_block *nb, > + unsigned long action, void *data) > +{ > + int err; > + struct memory_notify *arg = data; > + int nid = arg->status_change_nid; > + > + if (nid < 0) > + goto notifier_end; > + > + 
switch(action) { > + case MEM_ONLINE: MEM_ONLINE is too late, we cannot fail hotplug at that point. Would MEM_GOING_ONLINE / MEM_CANCEL_ONLINE be better? > + err = sysfs_wi_node_add(nid); > + if (err) { > + pr_err("failed to add sysfs [node%d]\n", nid); > + return NOTIFY_BAD; Note that NOTIFY_BAD includes NOTIFY_STOP_MASK. So you wouldn't call other notifiers, but the overall memory onlining would succeed, which is bad. If we don't care about the error (not prevent hotplug) we could only pr_warn() and continue. Maybe this (unlikely) case is not a good reason to stop memory from getting onlined. OTOH, it will barely ever trigger in practice ...
On Fri, 4 Apr 2025 10:43:18 +0200 Oscar Salvador <osalvador@suse.de> wrote: > On Fri, Apr 04, 2025 at 04:46:21PM +0900, Rakie Kim wrote: > > The weighted interleave policy distributes page allocations across multiple > > NUMA nodes based on their performance weight, thereby improving memory > > bandwidth utilization. The weight values for each node are configured > > through sysfs. > > > > Previously, sysfs entries for configuring weighted interleave were created > > for all possible nodes (N_POSSIBLE) at initialization, including nodes that > > might not have memory. However, not all nodes in N_POSSIBLE are usable at > > runtime, as some may remain memoryless or offline. > > This led to sysfs entries being created for unusable nodes, causing > > potential misconfiguration issues. > > > > To address this issue, this patch modifies the sysfs creation logic to: > > 1) Limit sysfs entries to nodes that are online and have memory, avoiding > > the creation of sysfs entries for nodes that cannot be used. > > 2) Support memory hotplug by dynamically adding and removing sysfs entries > > based on whether a node transitions into or out of the N_MEMORY state. > > > > Additionally, the patch ensures that sysfs attributes are properly managed > > when nodes go offline, preventing stale or redundant entries from persisting > > in the system. > > > > By making these changes, the weighted interleave policy now manages its > > sysfs entries more efficiently, ensuring that only relevant nodes are > > considered for interleaving, and dynamically adapting to memory hotplug > > events. > > > > Signed-off-by: Rakie Kim <rakie.kim@sk.com> > > Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> > > Signed-off-by: Yunjeong Mun <yunjeong.mun@sk.com> > > For the memory-hotplug bits: Reviewed-by: Oscar Salvador > <osalvador@suse.de< > > Just one thing that caught my eye: > > Cannot add_weighted_interleave_group be __init? AFAICS, it only gets > called at boot time? 
> > > -- > Oscar Salvador > SUSE Labs Thank you for your response regarding this patch. I agree that `add_weighted_interleave_group` can be marked with __init, as it is only called during system boot. I will make this change in the next revision. Rakie
On Fri, 4 Apr 2025 22:45:00 +0200 David Hildenbrand <david@redhat.com> wrote: > On 04.04.25 09:46, Rakie Kim wrote: > > The weighted interleave policy distributes page allocations across multiple > > NUMA nodes based on their performance weight, thereby improving memory > > bandwidth utilization. The weight values for each node are configured > > through sysfs. > > > > Previously, sysfs entries for configuring weighted interleave were created > > for all possible nodes (N_POSSIBLE) at initialization, including nodes that > > might not have memory. However, not all nodes in N_POSSIBLE are usable at > > runtime, as some may remain memoryless or offline. > > This led to sysfs entries being created for unusable nodes, causing > > potential misconfiguration issues. > > > > To address this issue, this patch modifies the sysfs creation logic to: > > 1) Limit sysfs entries to nodes that are online and have memory, avoiding > > the creation of sysfs entries for nodes that cannot be used. > > 2) Support memory hotplug by dynamically adding and removing sysfs entries > > based on whether a node transitions into or out of the N_MEMORY state. > > > > Additionally, the patch ensures that sysfs attributes are properly managed > > when nodes go offline, preventing stale or redundant entries from persisting > > in the system. > > > > By making these changes, the weighted interleave policy now manages its > > sysfs entries more efficiently, ensuring that only relevant nodes are > > considered for interleaving, and dynamically adapting to memory hotplug > > events. 
> > > > Signed-off-by: Rakie Kim <rakie.kim@sk.com> > > Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> > > Signed-off-by: Yunjeong Mun <yunjeong.mun@sk.com> > > --- > > mm/mempolicy.c | 109 ++++++++++++++++++++++++++++++++++++++----------- > > 1 file changed, 86 insertions(+), 23 deletions(-) > > > > diff --git a/mm/mempolicy.c b/mm/mempolicy.c > > index 73a9405ff352..f25c2c7f8fcf 100644 > > --- a/mm/mempolicy.c > > +++ b/mm/mempolicy.c > > @@ -113,6 +113,7 @@ > > #include <asm/tlbflush.h> > > #include <asm/tlb.h> > > #include <linux/uaccess.h> > > +#include <linux/memory.h> > > > > #include "internal.h" > > > > @@ -3390,6 +3391,7 @@ struct iw_node_attr { > > > > struct sysfs_wi_group { > > struct kobject wi_kobj; > > + struct mutex kobj_lock; > > struct iw_node_attr *nattrs[]; > > }; > > > > @@ -3439,13 +3441,24 @@ static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr, > > > > static void sysfs_wi_node_delete(int nid) > > { > > - if (!wi_group->nattrs[nid]) > > + struct iw_node_attr *attr; > > + > > + if (nid < 0 || nid >= nr_node_ids) > > + return; > > + > > + mutex_lock(&wi_group->kobj_lock); > > + attr = wi_group->nattrs[nid]; > > + if (!attr) { > > + mutex_unlock(&wi_group->kobj_lock); > > return; > > + } > > + > > + wi_group->nattrs[nid] = NULL; > > + mutex_unlock(&wi_group->kobj_lock); > > > > - sysfs_remove_file(&wi_group->wi_kobj, > > - &wi_group->nattrs[nid]->kobj_attr.attr); > > - kfree(wi_group->nattrs[nid]->kobj_attr.attr.name); > > - kfree(wi_group->nattrs[nid]); > > + sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr); > > + kfree(attr->kobj_attr.attr.name); > > + kfree(attr); > > } > > > > static void sysfs_wi_release(struct kobject *wi_kobj) > > @@ -3464,35 +3477,80 @@ static const struct kobj_type wi_ktype = { > > > > static int sysfs_wi_node_add(int nid) > > { > > - struct iw_node_attr *node_attr; > > + int ret = 0; > > char *name; > > + struct iw_node_attr *new_attr = NULL; > > > > - node_attr = 
kzalloc(sizeof(*node_attr), GFP_KERNEL); > > - if (!node_attr) > > + if (nid < 0 || nid >= nr_node_ids) { > > + pr_err("Invalid node id: %d\n", nid); > > + return -EINVAL; > > + } > > + > > + new_attr = kzalloc(sizeof(struct iw_node_attr), GFP_KERNEL); > > + if (!new_attr) > > return -ENOMEM; > > > > name = kasprintf(GFP_KERNEL, "node%d", nid); > > if (!name) { > > - kfree(node_attr); > > + kfree(new_attr); > > return -ENOMEM; > > } > > > > - sysfs_attr_init(&node_attr->kobj_attr.attr); > > - node_attr->kobj_attr.attr.name = name; > > - node_attr->kobj_attr.attr.mode = 0644; > > - node_attr->kobj_attr.show = node_show; > > - node_attr->kobj_attr.store = node_store; > > - node_attr->nid = nid; > > + mutex_lock(&wi_group->kobj_lock); > > + if (wi_group->nattrs[nid]) { > > + mutex_unlock(&wi_group->kobj_lock); > > + pr_info("Node [%d] already exists\n", nid); > > + kfree(new_attr); > > + kfree(name); > > + return 0; > > + } > > + wi_group->nattrs[nid] = new_attr; > > > > - if (sysfs_create_file(&wi_group->wi_kobj, &node_attr->kobj_attr.attr)) { > > - kfree(node_attr->kobj_attr.attr.name); > > - kfree(node_attr); > > - pr_err("failed to add attribute to weighted_interleave\n"); > > - return -ENOMEM; > > + sysfs_attr_init(&wi_group->nattrs[nid]->kobj_attr.attr); > > + wi_group->nattrs[nid]->kobj_attr.attr.name = name; > > + wi_group->nattrs[nid]->kobj_attr.attr.mode = 0644; > > + wi_group->nattrs[nid]->kobj_attr.show = node_show; > > + wi_group->nattrs[nid]->kobj_attr.store = node_store; > > + wi_group->nattrs[nid]->nid = nid; > > + > > + ret = sysfs_create_file(&wi_group->wi_kobj, > > + &wi_group->nattrs[nid]->kobj_attr.attr); > > + if (ret) { > > + kfree(wi_group->nattrs[nid]->kobj_attr.attr.name); > > + kfree(wi_group->nattrs[nid]); > > + wi_group->nattrs[nid] = NULL; > > + pr_err("Failed to add attribute to weighted_interleave: %d\n", ret); > > } > > + mutex_unlock(&wi_group->kobj_lock); > > > > - wi_group->nattrs[nid] = node_attr; > > - return 0; > > + return ret; 
> > +} > > + > > +static int wi_node_notifier(struct notifier_block *nb, > > + unsigned long action, void *data) > > +{ > > + int err; > > + struct memory_notify *arg = data; > > + int nid = arg->status_change_nid; > > + > > + if (nid < 0) > > + goto notifier_end; > > + > > + switch(action) { > > + case MEM_ONLINE: > > MEM_ONLINE is too late, we cannot fail hotplug at that point. > > Would MEM_GOING_ONLINE / MEM_CANCEL_ONLINE be better? Hi David, Thank you for raising these points. I would appreciate your clarification on the following: Issue1: I want to invoke sysfs_wi_node_add() after a node with memory has been fully transitioned to the online state. Does replacing MEM_ONLINE with MEM_GOING_ONLINE or MEM_CANCEL_ONLINE still ensure that the node is considered online and usable by that point? > > > > + err = sysfs_wi_node_add(nid); > > + if (err) { > > + pr_err("failed to add sysfs [node%d]\n", nid); > > + return NOTIFY_BAD; > > Note that NOTIFY_BAD includes NOTIFY_STOP_MASK. So you wouldn't call > other notifiers, but the overall memory onlining would succeed, which is > bad. > > If we don't care about the error (not prevent hotplug) we could only > pr_warn() and continue. Maybe this (unlikely) case is not a good reason > to stop memory from getting onlined. OTOH, it will barely ever trigger > in practice ... > Issue2: Regarding your note about NOTIFY_BAD — are you saying that if sysfs_wi_node_add() returns NOTIFY_BAD, it will trigger NOTIFY_STOP_MASK, preventing other notifiers from running, while still allowing the memory hotplug operation to complete? If so, then I'm thinking of resolving both issues as follows: - For Issue1: I keep using MEM_ONLINE, assuming it is safe and sufficient to ensure the node is fully online. - For Issue2: I avoid returning NOTIFY_BAD from the notifier. Instead, I log the error using pr_err() and continue the operation. 
This would result in the following code: if (nid < 0) return NOTIFY_OK; switch (action) { case MEM_ONLINE: // Issue1: keeping this unchanged err = sysfs_wi_node_add(nid); if (err) { pr_err("failed to add sysfs [node%d]\n", nid); // Issue2: Do not return NOTIFY_BAD } break; case MEM_OFFLINE: sysfs_wi_node_delete(nid); break; } // Always return NOTIFY_OK return NOTIFY_OK; Please let me know if this approach is acceptable. Rakie > -- > Cheers, > > David / dhildenb >
On 07.04.25 11:39, Rakie Kim wrote: > On Fri, 4 Apr 2025 22:45:00 +0200 David Hildenbrand <david@redhat.com> wrote: >> On 04.04.25 09:46, Rakie Kim wrote: >>> The weighted interleave policy distributes page allocations across multiple >>> NUMA nodes based on their performance weight, thereby improving memory >>> bandwidth utilization. The weight values for each node are configured >>> through sysfs. >>> >>> Previously, sysfs entries for configuring weighted interleave were created >>> for all possible nodes (N_POSSIBLE) at initialization, including nodes that >>> might not have memory. However, not all nodes in N_POSSIBLE are usable at >>> runtime, as some may remain memoryless or offline. >>> This led to sysfs entries being created for unusable nodes, causing >>> potential misconfiguration issues. >>> >>> To address this issue, this patch modifies the sysfs creation logic to: >>> 1) Limit sysfs entries to nodes that are online and have memory, avoiding >>> the creation of sysfs entries for nodes that cannot be used. >>> 2) Support memory hotplug by dynamically adding and removing sysfs entries >>> based on whether a node transitions into or out of the N_MEMORY state. >>> >>> Additionally, the patch ensures that sysfs attributes are properly managed >>> when nodes go offline, preventing stale or redundant entries from persisting >>> in the system. >>> >>> By making these changes, the weighted interleave policy now manages its >>> sysfs entries more efficiently, ensuring that only relevant nodes are >>> considered for interleaving, and dynamically adapting to memory hotplug >>> events. 
>>> >>> Signed-off-by: Rakie Kim <rakie.kim@sk.com> >>> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> >>> Signed-off-by: Yunjeong Mun <yunjeong.mun@sk.com> >>> --- >>> mm/mempolicy.c | 109 ++++++++++++++++++++++++++++++++++++++----------- >>> 1 file changed, 86 insertions(+), 23 deletions(-) >>> >>> diff --git a/mm/mempolicy.c b/mm/mempolicy.c >>> index 73a9405ff352..f25c2c7f8fcf 100644 >>> --- a/mm/mempolicy.c >>> +++ b/mm/mempolicy.c >>> @@ -113,6 +113,7 @@ >>> #include <asm/tlbflush.h> >>> #include <asm/tlb.h> >>> #include <linux/uaccess.h> >>> +#include <linux/memory.h> >>> >>> #include "internal.h" >>> >>> @@ -3390,6 +3391,7 @@ struct iw_node_attr { >>> >>> struct sysfs_wi_group { >>> struct kobject wi_kobj; >>> + struct mutex kobj_lock; >>> struct iw_node_attr *nattrs[]; >>> }; >>> >>> @@ -3439,13 +3441,24 @@ static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr, >>> >>> static void sysfs_wi_node_delete(int nid) >>> { >>> - if (!wi_group->nattrs[nid]) >>> + struct iw_node_attr *attr; >>> + >>> + if (nid < 0 || nid >= nr_node_ids) >>> + return; >>> + >>> + mutex_lock(&wi_group->kobj_lock); >>> + attr = wi_group->nattrs[nid]; >>> + if (!attr) { >>> + mutex_unlock(&wi_group->kobj_lock); >>> return; >>> + } >>> + >>> + wi_group->nattrs[nid] = NULL; >>> + mutex_unlock(&wi_group->kobj_lock); >>> >>> - sysfs_remove_file(&wi_group->wi_kobj, >>> - &wi_group->nattrs[nid]->kobj_attr.attr); >>> - kfree(wi_group->nattrs[nid]->kobj_attr.attr.name); >>> - kfree(wi_group->nattrs[nid]); >>> + sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr); >>> + kfree(attr->kobj_attr.attr.name); >>> + kfree(attr); >>> } >>> >>> static void sysfs_wi_release(struct kobject *wi_kobj) >>> @@ -3464,35 +3477,80 @@ static const struct kobj_type wi_ktype = { >>> >>> static int sysfs_wi_node_add(int nid) >>> { >>> - struct iw_node_attr *node_attr; >>> + int ret = 0; >>> char *name; >>> + struct iw_node_attr *new_attr = NULL; >>> >>> - node_attr = 
kzalloc(sizeof(*node_attr), GFP_KERNEL); >>> - if (!node_attr) >>> + if (nid < 0 || nid >= nr_node_ids) { >>> + pr_err("Invalid node id: %d\n", nid); >>> + return -EINVAL; >>> + } >>> + >>> + new_attr = kzalloc(sizeof(struct iw_node_attr), GFP_KERNEL); >>> + if (!new_attr) >>> return -ENOMEM; >>> >>> name = kasprintf(GFP_KERNEL, "node%d", nid); >>> if (!name) { >>> - kfree(node_attr); >>> + kfree(new_attr); >>> return -ENOMEM; >>> } >>> >>> - sysfs_attr_init(&node_attr->kobj_attr.attr); >>> - node_attr->kobj_attr.attr.name = name; >>> - node_attr->kobj_attr.attr.mode = 0644; >>> - node_attr->kobj_attr.show = node_show; >>> - node_attr->kobj_attr.store = node_store; >>> - node_attr->nid = nid; >>> + mutex_lock(&wi_group->kobj_lock); >>> + if (wi_group->nattrs[nid]) { >>> + mutex_unlock(&wi_group->kobj_lock); >>> + pr_info("Node [%d] already exists\n", nid); >>> + kfree(new_attr); >>> + kfree(name); >>> + return 0; >>> + } >>> + wi_group->nattrs[nid] = new_attr; >>> >>> - if (sysfs_create_file(&wi_group->wi_kobj, &node_attr->kobj_attr.attr)) { >>> - kfree(node_attr->kobj_attr.attr.name); >>> - kfree(node_attr); >>> - pr_err("failed to add attribute to weighted_interleave\n"); >>> - return -ENOMEM; >>> + sysfs_attr_init(&wi_group->nattrs[nid]->kobj_attr.attr); >>> + wi_group->nattrs[nid]->kobj_attr.attr.name = name; >>> + wi_group->nattrs[nid]->kobj_attr.attr.mode = 0644; >>> + wi_group->nattrs[nid]->kobj_attr.show = node_show; >>> + wi_group->nattrs[nid]->kobj_attr.store = node_store; >>> + wi_group->nattrs[nid]->nid = nid; >>> + >>> + ret = sysfs_create_file(&wi_group->wi_kobj, >>> + &wi_group->nattrs[nid]->kobj_attr.attr); >>> + if (ret) { >>> + kfree(wi_group->nattrs[nid]->kobj_attr.attr.name); >>> + kfree(wi_group->nattrs[nid]); >>> + wi_group->nattrs[nid] = NULL; >>> + pr_err("Failed to add attribute to weighted_interleave: %d\n", ret); >>> } >>> + mutex_unlock(&wi_group->kobj_lock); >>> >>> - wi_group->nattrs[nid] = node_attr; >>> - return 0; >>> + return ret; 
>>> +} >>> + >>> +static int wi_node_notifier(struct notifier_block *nb, >>> + unsigned long action, void *data) >>> +{ >>> + int err; >>> + struct memory_notify *arg = data; >>> + int nid = arg->status_change_nid; >>> + >>> + if (nid < 0) >>> + goto notifier_end; >>> + >>> + switch(action) { >>> + case MEM_ONLINE: >> >> MEM_ONLINE is too late, we cannot fail hotplug at that point. >> >> Would MEM_GOING_ONLINE / MEM_CANCEL_ONLINE be better? > > Hi David, Hi, > > Thank you for raising these points. I would appreciate your clarification > on the following: > > Issue1: I want to invoke sysfs_wi_node_add() after a node with memory > has been fully transitioned to the online state. Does replacing > MEM_ONLINE with MEM_GOING_ONLINE or MEM_CANCEL_ONLINE still ensure > that the node is considered online and usable by that point? After MEM_GOING_ONLINE nothing can go wrong except that some other notifier fails MEM_GOING_ONLINE. That happens rarely -- only if some other allocation, like for kasan, fails. In MEM_CANCEL_ONLINE you have to undo what you did (remove the node again). > >> >> >>> + err = sysfs_wi_node_add(nid); >>> + if (err) { >>> + pr_err("failed to add sysfs [node%d]\n", nid); >>> + return NOTIFY_BAD; >> >> Note that NOTIFY_BAD includes NOTIFY_STOP_MASK. So you wouldn't call >> other notifiers, but the overall memory onlining would succeed, which is >> bad. >> >> If we don't care about the error (not prevent hotplug) we could only >> pr_warn() and continue. Maybe this (unlikely) case is not a good reason >> to stop memory from getting onlined. OTOH, it will barely ever trigger >> in practice ... >> > > Issue2: Regarding your note about NOTIFY_BAD ? are you saying that > if sysfs_wi_node_add() returns NOTIFY_BAD, it will trigger > NOTIFY_STOP_MASK, preventing other notifiers from running, while > still allowing the memory hotplug operation to complete? Yes. 
> > If so, then I'm thinking of resolving both issues as follows: > - For Issue1: I keep using MEM_ONLINE, assuming it is safe and > sufficient to ensure the node is fully online. > - For Issue2: I avoid returning NOTIFY_BAD from the notifier. > Instead, I log the error using pr_err() and continue the operation. > > This would result in the following code: > > if (nid < 0) > return NOTIFY_OK; > > switch (action) { > case MEM_ONLINE: // Issue1: keeping this unchanged > err = sysfs_wi_node_add(nid); > if (err) { > pr_err("failed to add sysfs [node%d]\n", nid); > // Issue2: Do not return NOTIFY_BAD > } > break; > case MEM_OFFLINE: > sysfs_wi_node_delete(nid); > break; > } > > // Always return NOTIFY_OK > return NOTIFY_OK; > > Please let me know if this approach is acceptable. That would work. The alternative is failing during MEM_GOING_ONLINE if the allocation failed, and deleting the node during MEM_CANCEL_ONLINE.
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 73a9405ff352..f25c2c7f8fcf 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -113,6 +113,7 @@ #include <asm/tlbflush.h> #include <asm/tlb.h> #include <linux/uaccess.h> +#include <linux/memory.h> #include "internal.h" @@ -3390,6 +3391,7 @@ struct iw_node_attr { struct sysfs_wi_group { struct kobject wi_kobj; + struct mutex kobj_lock; struct iw_node_attr *nattrs[]; }; @@ -3439,13 +3441,24 @@ static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr, static void sysfs_wi_node_delete(int nid) { - if (!wi_group->nattrs[nid]) + struct iw_node_attr *attr; + + if (nid < 0 || nid >= nr_node_ids) + return; + + mutex_lock(&wi_group->kobj_lock); + attr = wi_group->nattrs[nid]; + if (!attr) { + mutex_unlock(&wi_group->kobj_lock); return; + } + + wi_group->nattrs[nid] = NULL; + mutex_unlock(&wi_group->kobj_lock); - sysfs_remove_file(&wi_group->wi_kobj, - &wi_group->nattrs[nid]->kobj_attr.attr); - kfree(wi_group->nattrs[nid]->kobj_attr.attr.name); - kfree(wi_group->nattrs[nid]); + sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr); + kfree(attr->kobj_attr.attr.name); + kfree(attr); } static void sysfs_wi_release(struct kobject *wi_kobj) @@ -3464,35 +3477,80 @@ static const struct kobj_type wi_ktype = { static int sysfs_wi_node_add(int nid) { - struct iw_node_attr *node_attr; + int ret = 0; char *name; + struct iw_node_attr *new_attr = NULL; - node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL); - if (!node_attr) + if (nid < 0 || nid >= nr_node_ids) { + pr_err("Invalid node id: %d\n", nid); + return -EINVAL; + } + + new_attr = kzalloc(sizeof(struct iw_node_attr), GFP_KERNEL); + if (!new_attr) return -ENOMEM; name = kasprintf(GFP_KERNEL, "node%d", nid); if (!name) { - kfree(node_attr); + kfree(new_attr); return -ENOMEM; } - sysfs_attr_init(&node_attr->kobj_attr.attr); - node_attr->kobj_attr.attr.name = name; - node_attr->kobj_attr.attr.mode = 0644; - node_attr->kobj_attr.show = node_show; - 
node_attr->kobj_attr.store = node_store; - node_attr->nid = nid; + mutex_lock(&wi_group->kobj_lock); + if (wi_group->nattrs[nid]) { + mutex_unlock(&wi_group->kobj_lock); + pr_info("Node [%d] already exists\n", nid); + kfree(new_attr); + kfree(name); + return 0; + } + wi_group->nattrs[nid] = new_attr; - if (sysfs_create_file(&wi_group->wi_kobj, &node_attr->kobj_attr.attr)) { - kfree(node_attr->kobj_attr.attr.name); - kfree(node_attr); - pr_err("failed to add attribute to weighted_interleave\n"); - return -ENOMEM; + sysfs_attr_init(&wi_group->nattrs[nid]->kobj_attr.attr); + wi_group->nattrs[nid]->kobj_attr.attr.name = name; + wi_group->nattrs[nid]->kobj_attr.attr.mode = 0644; + wi_group->nattrs[nid]->kobj_attr.show = node_show; + wi_group->nattrs[nid]->kobj_attr.store = node_store; + wi_group->nattrs[nid]->nid = nid; + + ret = sysfs_create_file(&wi_group->wi_kobj, + &wi_group->nattrs[nid]->kobj_attr.attr); + if (ret) { + kfree(wi_group->nattrs[nid]->kobj_attr.attr.name); + kfree(wi_group->nattrs[nid]); + wi_group->nattrs[nid] = NULL; + pr_err("Failed to add attribute to weighted_interleave: %d\n", ret); } + mutex_unlock(&wi_group->kobj_lock); - wi_group->nattrs[nid] = node_attr; - return 0; + return ret; +} + +static int wi_node_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + int err; + struct memory_notify *arg = data; + int nid = arg->status_change_nid; + + if (nid < 0) + goto notifier_end; + + switch(action) { + case MEM_ONLINE: + err = sysfs_wi_node_add(nid); + if (err) { + pr_err("failed to add sysfs [node%d]\n", nid); + return NOTIFY_BAD; + } + break; + case MEM_OFFLINE: + sysfs_wi_node_delete(nid); + break; + } + +notifier_end: + return NOTIFY_OK; } static int add_weighted_interleave_group(struct kobject *mempolicy_kobj) @@ -3503,13 +3561,17 @@ static int add_weighted_interleave_group(struct kobject *mempolicy_kobj) GFP_KERNEL); if (!wi_group) return -ENOMEM; + mutex_init(&wi_group->kobj_lock); err = 
kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj, "weighted_interleave"); if (err) goto err_out; - for_each_node_state(nid, N_POSSIBLE) { + for_each_online_node(nid) { + if (!node_state(nid, N_MEMORY)) + continue; + err = sysfs_wi_node_add(nid); if (err) { pr_err("failed to add sysfs [node%d]\n", nid); @@ -3517,6 +3579,7 @@ static int add_weighted_interleave_group(struct kobject *mempolicy_kobj) } } + hotplug_memory_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI); return 0; err_del: