diff mbox series

[v7,2/3] mm/mempolicy: Prepare weighted interleave sysfs for memory hotplug

Message ID 20250408073243.488-3-rakie.kim@sk.com (mailing list archive)
State New
Headers show
Series Enhance sysfs handling for memory hotplug in weighted interleave | expand

Commit Message

Rakie Kim April 8, 2025, 7:32 a.m. UTC
Previously, the weighted interleave sysfs structure was statically
managed during initialization. This prevented new nodes from being
recognized when memory hotplug events occurred, limiting the ability
to update or extend sysfs entries dynamically at runtime.

To address this, this patch refactors the sysfs infrastructure and
encapsulates it within a new structure, `sysfs_wi_group`, which holds
both the kobject and an array of node attribute pointers.

By allocating this group structure globally, the per-node sysfs
attributes can be managed beyond initialization time, enabling
external modules to insert or remove node entries in response to
events such as memory hotplug or node online/offline transitions.

Instead of allocating all per-node sysfs attributes at once, the
initialization path now uses the existing sysfs_wi_node_add() and
sysfs_wi_node_delete() helpers. This refactoring makes it possible
to modularly manage per-node sysfs entries and ensures the
infrastructure is ready for runtime extension.

Signed-off-by: Rakie Kim <rakie.kim@sk.com>
Signed-off-by: Honggyu Kim <honggyu.kim@sk.com>
Signed-off-by: Yunjeong Mun <yunjeong.mun@sk.com>
Reviewed-by: Gregory Price <gourry@gourry.net>
---
 mm/mempolicy.c | 61 ++++++++++++++++++++++++--------------------------
 1 file changed, 29 insertions(+), 32 deletions(-)

Comments

Joshua Hahn April 8, 2025, 1:49 p.m. UTC | #1
On Tue,  8 Apr 2025 16:32:41 +0900 Rakie Kim <rakie.kim@sk.com> wrote:

Hi Rakie,
This also looks good to me!

Reviewed-by: Joshua Hahn <joshua.hahnjy@gmail.com>

> Previously, the weighted interleave sysfs structure was statically
> managed during initialization. This prevented new nodes from being
> recognized when memory hotplug events occurred, limiting the ability
> to update or extend sysfs entries dynamically at runtime.
> 
> To address this, this patch refactors the sysfs infrastructure and
> encapsulates it within a new structure, `sysfs_wi_group`, which holds
> both the kobject and an array of node attribute pointers.
> 
> By allocating this group structure globally, the per-node sysfs
> attributes can be managed beyond initialization time, enabling
> external modules to insert or remove node entries in response to
> events such as memory hotplug or node online/offline transitions.
> 
> Instead of allocating all per-node sysfs attributes at once, the
> initialization path now uses the existing sysfs_wi_node_add() and
> sysfs_wi_node_delete() helpers. This refactoring makes it possible
> to modularly manage per-node sysfs entries and ensures the
> infrastructure is ready for runtime extension.
> 
> Signed-off-by: Rakie Kim <rakie.kim@sk.com>
> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com>
> Signed-off-by: Yunjeong Mun <yunjeong.mun@sk.com>
> Reviewed-by: Gregory Price <gourry@gourry.net>
> ---
>  mm/mempolicy.c | 61 ++++++++++++++++++++++++--------------------------
>  1 file changed, 29 insertions(+), 32 deletions(-)
> 
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 0da102aa1cfc..988575f29c53 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -3419,6 +3419,13 @@ struct iw_node_attr {
>  	int nid;
>  };
>  
> +struct sysfs_wi_group {
> +	struct kobject wi_kobj;
> +	struct iw_node_attr *nattrs[];
> +};
> +
> +static struct sysfs_wi_group *wi_group;
> +
>  static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
>  			 char *buf)
>  {
> @@ -3461,27 +3468,24 @@ static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
>  	return count;
>  }
>  
> -static struct iw_node_attr **node_attrs;
> -
> -static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
> -				  struct kobject *parent)
> +static void sysfs_wi_node_delete(int nid)
>  {
> -	if (!node_attr)
> +	if (!wi_group->nattrs[nid])
>  		return;
> -	sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
> -	kfree(node_attr->kobj_attr.attr.name);
> -	kfree(node_attr);
> +
> +	sysfs_remove_file(&wi_group->wi_kobj,
> +			  &wi_group->nattrs[nid]->kobj_attr.attr);
> +	kfree(wi_group->nattrs[nid]->kobj_attr.attr.name);
> +	kfree(wi_group->nattrs[nid]);
>  }
>  
>  static void sysfs_wi_release(struct kobject *wi_kobj)
>  {
> -	int i;
> -
> -	for (i = 0; i < nr_node_ids; i++)
> -		sysfs_wi_node_release(node_attrs[i], wi_kobj);
> +	int nid;
>  
> -	kfree(node_attrs);
> -	kfree(wi_kobj);
> +	for (nid = 0; nid < nr_node_ids; nid++)
> +		sysfs_wi_node_delete(nid);
> +	kfree(wi_group);
>  }
>  
>  static const struct kobj_type wi_ktype = {
> @@ -3489,7 +3493,7 @@ static const struct kobj_type wi_ktype = {
>  	.release = sysfs_wi_release,
>  };
>  
> -static int add_weight_node(int nid, struct kobject *wi_kobj)
> +static int sysfs_wi_node_add(int nid)
>  {
>  	struct iw_node_attr *node_attr;
>  	char *name;
> @@ -3511,40 +3515,33 @@ static int add_weight_node(int nid, struct kobject *wi_kobj)
>  	node_attr->kobj_attr.store = node_store;
>  	node_attr->nid = nid;
>  
> -	if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) {
> +	if (sysfs_create_file(&wi_group->wi_kobj, &node_attr->kobj_attr.attr)) {
>  		kfree(node_attr->kobj_attr.attr.name);
>  		kfree(node_attr);
>  		pr_err("failed to add attribute to weighted_interleave\n");
>  		return -ENOMEM;
>  	}
>  
> -	node_attrs[nid] = node_attr;
> +	wi_group->nattrs[nid] = node_attr;
>  	return 0;
>  }
>  
> -static int add_weighted_interleave_group(struct kobject *root_kobj)
> +static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
>  {
> -	struct kobject *wi_kobj;
>  	int nid, err;
>  
> -	node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *),
> -			     GFP_KERNEL);
> -	if (!node_attrs)
> +	wi_group = kzalloc(struct_size(wi_group, nattrs, nr_node_ids),
> +			   GFP_KERNEL);
> +	if (!wi_group)
>  		return -ENOMEM;
>  
> -	wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
> -	if (!wi_kobj) {
> -		kfree(node_attrs);
> -		return -ENOMEM;
> -	}
> -
> -	err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj,
> +	err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj,
>  				   "weighted_interleave");
>  	if (err)
>  		goto err_put_kobj;
>  
>  	for_each_node_state(nid, N_POSSIBLE) {
> -		err = add_weight_node(nid, wi_kobj);
> +		err = sysfs_wi_node_add(nid);
>  		if (err) {
>  			pr_err("failed to add sysfs [node%d]\n", nid);
>  			goto err_del_kobj;
> @@ -3554,9 +3551,9 @@ static int add_weighted_interleave_group(struct kobject *root_kobj)
>  	return 0;
>  
>  err_del_kobj:
> -	kobject_del(wi_kobj);
> +	kobject_del(&wi_group->wi_kobj);
>  err_put_kobj:
> -	kobject_put(wi_kobj);
> +	kobject_put(&wi_group->wi_kobj);
>  	return err;
>  }
>  
> -- 
> 2.34.1

Sent using hkml (https://github.com/sjp38/hackermail)
Dan Williams April 9, 2025, 3:43 a.m. UTC | #2
Rakie Kim wrote:
> Previously, the weighted interleave sysfs structure was statically
> managed during initialization. This prevented new nodes from being
> recognized when memory hotplug events occurred, limiting the ability
> to update or extend sysfs entries dynamically at runtime.
> 
> To address this, this patch refactors the sysfs infrastructure and
> encapsulates it within a new structure, `sysfs_wi_group`, which holds
> both the kobject and an array of node attribute pointers.
> 
> By allocating this group structure globally, the per-node sysfs
> attributes can be managed beyond initialization time, enabling
> external modules to insert or remove node entries in response to
> events such as memory hotplug or node online/offline transitions.
> 
> Instead of allocating all per-node sysfs attributes at once, the
> initialization path now uses the existing sysfs_wi_node_add() and
> sysfs_wi_node_delete() helpers. This refactoring makes it possible
> to modularly manage per-node sysfs entries and ensures the
> infrastructure is ready for runtime extension.
> 
> Signed-off-by: Rakie Kim <rakie.kim@sk.com>
> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com>
> Signed-off-by: Yunjeong Mun <yunjeong.mun@sk.com>
> Reviewed-by: Gregory Price <gourry@gourry.net>
> ---
>  mm/mempolicy.c | 61 ++++++++++++++++++++++++--------------------------
>  1 file changed, 29 insertions(+), 32 deletions(-)
> 
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 0da102aa1cfc..988575f29c53 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -3419,6 +3419,13 @@ struct iw_node_attr {
>  	int nid;
>  };
>  
> +struct sysfs_wi_group {
> +	struct kobject wi_kobj;
> +	struct iw_node_attr *nattrs[];
> +};
> +
> +static struct sysfs_wi_group *wi_group;
> +
>  static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
>  			 char *buf)
>  {
> @@ -3461,27 +3468,24 @@ static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
>  	return count;
>  }
>  
> -static struct iw_node_attr **node_attrs;
> -
> -static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
> -				  struct kobject *parent)
> +static void sysfs_wi_node_delete(int nid)
>  {
> -	if (!node_attr)
> +	if (!wi_group->nattrs[nid])
>  		return;
> -	sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
> -	kfree(node_attr->kobj_attr.attr.name);
> -	kfree(node_attr);
> +
> +	sysfs_remove_file(&wi_group->wi_kobj,
> +			  &wi_group->nattrs[nid]->kobj_attr.attr);

This still looks broken to me, but I think this is more of a problem that
was already present in the original code.

At this point @wi_group's reference count is zero because
sysfs_wi_release() has been called. However, it can only be zero if it has
properly transitioned through kobject_del() and final kobject_put(). It
follows that kobject_del() arranges for kobj->sd to be NULL. That means
that this *should* be hitting the WARN() in kernfs_remove_by_name_ns()
for the !parent case.

So, either you are not triggering that path, or not testing that path, but
sysfs_remove_file() of the child attributes should be happening *before*
sysfs_wi_release().

Did I miss something?
Dan Williams April 9, 2025, 3:54 a.m. UTC | #3
Dan Williams wrote:
> Rakie Kim wrote:
> > Previously, the weighted interleave sysfs structure was statically
> > managed during initialization. This prevented new nodes from being
> > recognized when memory hotplug events occurred, limiting the ability
> > to update or extend sysfs entries dynamically at runtime.
> > 
> > To address this, this patch refactors the sysfs infrastructure and
> > encapsulates it within a new structure, `sysfs_wi_group`, which holds
> > both the kobject and an array of node attribute pointers.
> > 
> > By allocating this group structure globally, the per-node sysfs
> > attributes can be managed beyond initialization time, enabling
> > external modules to insert or remove node entries in response to
> > events such as memory hotplug or node online/offline transitions.
> > 
> > Instead of allocating all per-node sysfs attributes at once, the
> > initialization path now uses the existing sysfs_wi_node_add() and
> > sysfs_wi_node_delete() helpers. This refactoring makes it possible
> > to modularly manage per-node sysfs entries and ensures the
> > infrastructure is ready for runtime extension.
> > 
> > Signed-off-by: Rakie Kim <rakie.kim@sk.com>
> > Signed-off-by: Honggyu Kim <honggyu.kim@sk.com>
> > Signed-off-by: Yunjeong Mun <yunjeong.mun@sk.com>
> > Reviewed-by: Gregory Price <gourry@gourry.net>
> > ---
> >  mm/mempolicy.c | 61 ++++++++++++++++++++++++--------------------------
> >  1 file changed, 29 insertions(+), 32 deletions(-)
> > 
> > diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> > index 0da102aa1cfc..988575f29c53 100644
> > --- a/mm/mempolicy.c
> > +++ b/mm/mempolicy.c
> > @@ -3419,6 +3419,13 @@ struct iw_node_attr {
> >  	int nid;
> >  };
> >  
> > +struct sysfs_wi_group {
> > +	struct kobject wi_kobj;
> > +	struct iw_node_attr *nattrs[];
> > +};
> > +
> > +static struct sysfs_wi_group *wi_group;
> > +
> >  static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
> >  			 char *buf)
> >  {
> > @@ -3461,27 +3468,24 @@ static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
> >  	return count;
> >  }
> >  
> > -static struct iw_node_attr **node_attrs;
> > -
> > -static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
> > -				  struct kobject *parent)
> > +static void sysfs_wi_node_delete(int nid)
> >  {
> > -	if (!node_attr)
> > +	if (!wi_group->nattrs[nid])
> >  		return;
> > -	sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
> > -	kfree(node_attr->kobj_attr.attr.name);
> > -	kfree(node_attr);
> > +
> > +	sysfs_remove_file(&wi_group->wi_kobj,
> > +			  &wi_group->nattrs[nid]->kobj_attr.attr);
> 
> This still looks broken to me, but I think this is more a problem that
> was present in the original code.
> 
> At this point @wi_group's reference count is zero because
> sysfs_wi_release() has been called. However, it can only be zero if it has
> properly transitioned through kobject_del() and final kobject_put(). It
> follows that kobject_del() arranges for kobj->sd to be NULL. That means
> that this *should* be hitting the WARN() in kernfs_remove_by_name_ns()
> for the !parent case.
> 
> So, either you are not triggering that path, or testing that path, but
> sys_remove_file() of the child attributes should be happening *before*
> sysfs_wi_release().
> 
> Did I miss something?

I think the missing change is that sysfs_wi_node_add() failures need to
be handled with a sysfs_wi_node_delete() of the added attrs *before* the
kobject_del() of @wi_group.
Rakie Kim April 9, 2025, 5:56 a.m. UTC | #4
On Tue, 8 Apr 2025 20:54:48 -0700 Dan Williams <dan.j.williams@intel.com> wrote:
> Dan Williams wrote:
> > Rakie Kim wrote:
> > > Previously, the weighted interleave sysfs structure was statically
> > > managed during initialization. This prevented new nodes from being
> > > recognized when memory hotplug events occurred, limiting the ability
> > > to update or extend sysfs entries dynamically at runtime.
> > > 
> > > To address this, this patch refactors the sysfs infrastructure and
> > > encapsulates it within a new structure, `sysfs_wi_group`, which holds
> > > both the kobject and an array of node attribute pointers.
> > > 
> > > By allocating this group structure globally, the per-node sysfs
> > > attributes can be managed beyond initialization time, enabling
> > > external modules to insert or remove node entries in response to
> > > events such as memory hotplug or node online/offline transitions.
> > > 
> > > Instead of allocating all per-node sysfs attributes at once, the
> > > initialization path now uses the existing sysfs_wi_node_add() and
> > > sysfs_wi_node_delete() helpers. This refactoring makes it possible
> > > to modularly manage per-node sysfs entries and ensures the
> > > infrastructure is ready for runtime extension.
> > > 
> > > Signed-off-by: Rakie Kim <rakie.kim@sk.com>
> > > Signed-off-by: Honggyu Kim <honggyu.kim@sk.com>
> > > Signed-off-by: Yunjeong Mun <yunjeong.mun@sk.com>
> > > Reviewed-by: Gregory Price <gourry@gourry.net>
> > > ---
> > >  mm/mempolicy.c | 61 ++++++++++++++++++++++++--------------------------
> > >  1 file changed, 29 insertions(+), 32 deletions(-)
> > > 
> > > diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> > > index 0da102aa1cfc..988575f29c53 100644
> > > --- a/mm/mempolicy.c
> > > +++ b/mm/mempolicy.c
> > > @@ -3419,6 +3419,13 @@ struct iw_node_attr {
> > >  	int nid;
> > >  };
> > >  
> > > +struct sysfs_wi_group {
> > > +	struct kobject wi_kobj;
> > > +	struct iw_node_attr *nattrs[];
> > > +};
> > > +
> > > +static struct sysfs_wi_group *wi_group;
> > > +
> > >  static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
> > >  			 char *buf)
> > >  {
> > > @@ -3461,27 +3468,24 @@ static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
> > >  	return count;
> > >  }
> > >  
> > > -static struct iw_node_attr **node_attrs;
> > > -
> > > -static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
> > > -				  struct kobject *parent)
> > > +static void sysfs_wi_node_delete(int nid)
> > >  {
> > > -	if (!node_attr)
> > > +	if (!wi_group->nattrs[nid])
> > >  		return;
> > > -	sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
> > > -	kfree(node_attr->kobj_attr.attr.name);
> > > -	kfree(node_attr);
> > > +
> > > +	sysfs_remove_file(&wi_group->wi_kobj,
> > > +			  &wi_group->nattrs[nid]->kobj_attr.attr);
> > 
> > This still looks broken to me, but I think this is more a problem that
> > was present in the original code.
> > 
> > At this point @wi_group's reference count is zero because
> > sysfs_wi_release() has been called. However, it can only be zero if it has
> > properly transitioned through kobject_del() and final kobject_put(). It
> > follows that kobject_del() arranges for kobj->sd to be NULL. That means
> > that this *should* be hitting the WARN() in kernfs_remove_by_name_ns()
> > for the !parent case.
> > 
> > So, either you are not triggering that path, or testing that path, but
> > sys_remove_file() of the child attributes should be happening *before*
> > sysfs_wi_release().
> > 
> > Did I miss something?
> 
> I think the missing change is that sysfs_wi_node_add() failures need to
> be done with a sysfs_wi_node_delete() of the added attrs *before* the
> kobject_del() of @wi_group.

Hi Dan Williams

Thank you very much for identifying this potential issue in the code.

As you pointed out, this seems to be a problem that was already present in
the original implementation, and I agree that it needs to be addressed.

However, since this issue existed prior to the changes in this patch
series, I believe it would be more appropriate to fix it in a separate
follow-up patch rather than include it here.

I will start preparing a new patch to address this problem, and I would
greatly appreciate it if you could review it once it's ready.

Rakie
Dan Williams April 9, 2025, 6:51 p.m. UTC | #5
Rakie Kim wrote:
> On Tue, 8 Apr 2025 20:54:48 -0700 Dan Williams <dan.j.williams@intel.com> wrote:
> > Dan Williams wrote:
> > > Rakie Kim wrote:
> > > > Previously, the weighted interleave sysfs structure was statically
> > > > managed during initialization. This prevented new nodes from being
> > > > recognized when memory hotplug events occurred, limiting the ability
> > > > to update or extend sysfs entries dynamically at runtime.
> > > > 
> > > > To address this, this patch refactors the sysfs infrastructure and
> > > > encapsulates it within a new structure, `sysfs_wi_group`, which holds
> > > > both the kobject and an array of node attribute pointers.
> > > > 
> > > > By allocating this group structure globally, the per-node sysfs
> > > > attributes can be managed beyond initialization time, enabling
> > > > external modules to insert or remove node entries in response to
> > > > events such as memory hotplug or node online/offline transitions.
> > > > 
> > > > Instead of allocating all per-node sysfs attributes at once, the
> > > > initialization path now uses the existing sysfs_wi_node_add() and
> > > > sysfs_wi_node_delete() helpers. This refactoring makes it possible
> > > > to modularly manage per-node sysfs entries and ensures the
> > > > infrastructure is ready for runtime extension.
> > > > 
> > > > Signed-off-by: Rakie Kim <rakie.kim@sk.com>
> > > > Signed-off-by: Honggyu Kim <honggyu.kim@sk.com>
> > > > Signed-off-by: Yunjeong Mun <yunjeong.mun@sk.com>
> > > > Reviewed-by: Gregory Price <gourry@gourry.net>
> > > > ---
> > > >  mm/mempolicy.c | 61 ++++++++++++++++++++++++--------------------------
> > > >  1 file changed, 29 insertions(+), 32 deletions(-)
> > > > 
> > > > diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> > > > index 0da102aa1cfc..988575f29c53 100644
> > > > --- a/mm/mempolicy.c
> > > > +++ b/mm/mempolicy.c
> > > > @@ -3419,6 +3419,13 @@ struct iw_node_attr {
> > > >  	int nid;
> > > >  };
> > > >  
> > > > +struct sysfs_wi_group {
> > > > +	struct kobject wi_kobj;
> > > > +	struct iw_node_attr *nattrs[];
> > > > +};
> > > > +
> > > > +static struct sysfs_wi_group *wi_group;
> > > > +
> > > >  static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
> > > >  			 char *buf)
> > > >  {
> > > > @@ -3461,27 +3468,24 @@ static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
> > > >  	return count;
> > > >  }
> > > >  
> > > > -static struct iw_node_attr **node_attrs;
> > > > -
> > > > -static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
> > > > -				  struct kobject *parent)
> > > > +static void sysfs_wi_node_delete(int nid)
> > > >  {
> > > > -	if (!node_attr)
> > > > +	if (!wi_group->nattrs[nid])
> > > >  		return;
> > > > -	sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
> > > > -	kfree(node_attr->kobj_attr.attr.name);
> > > > -	kfree(node_attr);
> > > > +
> > > > +	sysfs_remove_file(&wi_group->wi_kobj,
> > > > +			  &wi_group->nattrs[nid]->kobj_attr.attr);
> > > 
> > > This still looks broken to me, but I think this is more a problem that
> > > was present in the original code.
> > > 
> > > At this point @wi_group's reference count is zero because
> > > sysfs_wi_release() has been called. However, it can only be zero if it has
> > > properly transitioned through kobject_del() and final kobject_put(). It
> > > follows that kobject_del() arranges for kobj->sd to be NULL. That means
> > > that this *should* be hitting the WARN() in kernfs_remove_by_name_ns()
> > > for the !parent case.
> > > 
> > > So, either you are not triggering that path, or testing that path, but
> > > sys_remove_file() of the child attributes should be happening *before*
> > > sysfs_wi_release().
> > > 
> > > Did I miss something?
> > 
> > I think the missing change is that sysfs_wi_node_add() failures need to
> > be done with a sysfs_wi_node_delete() of the added attrs *before* the
> > kobject_del() of @wi_group.
> 
> Hi Dan Williams
> 
> Thank you very much for identifying this potential issue in the code.
> 
> As you pointed out, this seems to be a problem that was already present in
> the original implementation, and I agree that it needs to be addressed.
> 
> However, since this issue existed prior to the changes in this patch
> series, I believe it would be more appropriate to fix it in a separate
> follow-up patch rather than include it here.

I tend to disagree. The whole motivation of this series is to get the
kobject lifetime handling correct in order to add the new dynamic
capability. The claimed correctness fixups are incomplete. There is time
to respin this (we are only at -rc1) and get it right before landing the
new dynamic capability.

One of the outcomes of the "MM Process" topic at LSF/MM was that Andrew
wanted more feedback on when patches are not quite ready for prime-time
and I think this is an example of a patch set that deserves another spin
to meet the stated goals.

> I will start preparing a new patch to address this problem, and I would
> greatly appreciate it if you could review it once it's ready.

Will definitely review it. I will leave it to Andrew whether he wants an
incremental fixup on top of this series, or a rebase on top of a fully
fixed baseline. My preference is to finish fixing all the old kobject
issues and then rebase the new dynamic work on top. Either way, do not
be afraid to ask Andrew to replace a series in -mm, that's a sign of the
process working as expected.
Rakie Kim April 10, 2025, 7:53 a.m. UTC | #6
On Wed, 9 Apr 2025 11:51:36 -0700 Dan Williams <dan.j.williams@intel.com> wrote:
> Rakie Kim wrote:
> > On Tue, 8 Apr 2025 20:54:48 -0700 Dan Williams <dan.j.williams@intel.com> wrote:
> > > Dan Williams wrote:
> > > > Rakie Kim wrote:
> > > > > Previously, the weighted interleave sysfs structure was statically
> > > > > managed during initialization. This prevented new nodes from being
> > > > > recognized when memory hotplug events occurred, limiting the ability
> > > > > to update or extend sysfs entries dynamically at runtime.
> > > > > 
> > > > > To address this, this patch refactors the sysfs infrastructure and
> > > > > encapsulates it within a new structure, `sysfs_wi_group`, which holds
> > > > > both the kobject and an array of node attribute pointers.
> > > > > 
> > > > > By allocating this group structure globally, the per-node sysfs
> > > > > attributes can be managed beyond initialization time, enabling
> > > > > external modules to insert or remove node entries in response to
> > > > > events such as memory hotplug or node online/offline transitions.
> > > > > 
> > > > > Instead of allocating all per-node sysfs attributes at once, the
> > > > > initialization path now uses the existing sysfs_wi_node_add() and
> > > > > sysfs_wi_node_delete() helpers. This refactoring makes it possible
> > > > > to modularly manage per-node sysfs entries and ensures the
> > > > > infrastructure is ready for runtime extension.
> > > > > 
> > > > > Signed-off-by: Rakie Kim <rakie.kim@sk.com>
> > > > > Signed-off-by: Honggyu Kim <honggyu.kim@sk.com>
> > > > > Signed-off-by: Yunjeong Mun <yunjeong.mun@sk.com>
> > > > > Reviewed-by: Gregory Price <gourry@gourry.net>
> > > > > ---
> > > > >  mm/mempolicy.c | 61 ++++++++++++++++++++++++--------------------------
> > > > >  1 file changed, 29 insertions(+), 32 deletions(-)
> > > > > 
> > > > > diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> > > > > index 0da102aa1cfc..988575f29c53 100644
> > > > > --- a/mm/mempolicy.c
> > > > > +++ b/mm/mempolicy.c
> > > > > @@ -3419,6 +3419,13 @@ struct iw_node_attr {
> > > > >  	int nid;
> > > > >  };
> > > > >  
> > > > > +struct sysfs_wi_group {
> > > > > +	struct kobject wi_kobj;
> > > > > +	struct iw_node_attr *nattrs[];
> > > > > +};
> > > > > +
> > > > > +static struct sysfs_wi_group *wi_group;
> > > > > +
> > > > >  static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
> > > > >  			 char *buf)
> > > > >  {
> > > > > @@ -3461,27 +3468,24 @@ static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
> > > > >  	return count;
> > > > >  }
> > > > >  
> > > > > -static struct iw_node_attr **node_attrs;
> > > > > -
> > > > > -static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
> > > > > -				  struct kobject *parent)
> > > > > +static void sysfs_wi_node_delete(int nid)
> > > > >  {
> > > > > -	if (!node_attr)
> > > > > +	if (!wi_group->nattrs[nid])
> > > > >  		return;
> > > > > -	sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
> > > > > -	kfree(node_attr->kobj_attr.attr.name);
> > > > > -	kfree(node_attr);
> > > > > +
> > > > > +	sysfs_remove_file(&wi_group->wi_kobj,
> > > > > +			  &wi_group->nattrs[nid]->kobj_attr.attr);
> > > > 
> > > > This still looks broken to me, but I think this is more a problem that
> > > > was present in the original code.
> > > > 
> > > > At this point @wi_group's reference count is zero because
> > > > sysfs_wi_release() has been called. However, it can only be zero if it has
> > > > properly transitioned through kobject_del() and final kobject_put(). It
> > > > follows that kobject_del() arranges for kobj->sd to be NULL. That means
> > > > that this *should* be hitting the WARN() in kernfs_remove_by_name_ns()
> > > > for the !parent case.
> > > > 
> > > > So, either you are not triggering that path, or testing that path, but
> > > > sys_remove_file() of the child attributes should be happening *before*
> > > > sysfs_wi_release().
> > > > 
> > > > Did I miss something?
> > > 
> > > I think the missing change is that sysfs_wi_node_add() failures need to
> > > be done with a sysfs_wi_node_delete() of the added attrs *before* the
> > > kobject_del() of @wi_group.
> > 
> > Hi Dan Williams
> > 
> > Thank you very much for identifying this potential issue in the code.
> > 
> > As you pointed out, this seems to be a problem that was already present in
> > the original implementation, and I agree that it needs to be addressed.
> > 
> > However, since this issue existed prior to the changes in this patch
> > series, I believe it would be more appropriate to fix it in a separate
> > follow-up patch rather than include it here.
> 
> I tend to disagree. The whole motivation of this series is to get the
> kobject lifetime handling correct in order to add the new dynamic
> capability. The claimed correctness fixups are incomplete. There is time
> to respin this (we are only at -rc1) and get it right before landing the
> new dynamic capability.
> 
> One of the outcomes of the "MM Process" topic at LSF/MM was that Andrew
> wanted more feedback on when patches are not quite ready for prime-time
> and I think this is an example of a patch set that deserves another spin
> to meet the stated goals.
> 
> > I will start preparing a new patch to address this problem, and I would
> > greatly appreciate it if you could review it once it's ready.
> 
> Will definitely review it. I will leave to Andrew if he wants an
> incremental fixup on top of this series, or rebase on top of a fully
> fixed baseline. My preference is finish fixing all the old kobject()
> issues and then rebase the new dynamic work on top. Either way, do not
> be afraid to ask Andrew to replace a series in -mm, that's a sign of the
> process working as expected.

Thank you very much for your advice, and I completely agree with your
recommendation. I will immediately ask Andrew to remove this patch series
from -mm. Then, I will prepare a new version, v8, which properly addresses
the kobject-related issues you pointed out.

Once again, I sincerely appreciate your thoughtful and detailed feedback.

Rakie
Rakie Kim April 10, 2025, 8:06 a.m. UTC | #7
On Thu, 10 Apr 2025 16:53:33 +0900 Rakie Kim <rakie.kim@sk.com> wrote:
> On Wed, 9 Apr 2025 11:51:36 -0700 Dan Williams <dan.j.williams@intel.com> wrote:
> > Rakie Kim wrote:
> > > > > > +static void sysfs_wi_node_delete(int nid)
> > > > > >  {
> > > > > > -	if (!node_attr)
> > > > > > +	if (!wi_group->nattrs[nid])
> > > > > >  		return;
> > > > > > -	sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
> > > > > > -	kfree(node_attr->kobj_attr.attr.name);
> > > > > > -	kfree(node_attr);
> > > > > > +
> > > > > > +	sysfs_remove_file(&wi_group->wi_kobj,
> > > > > > +			  &wi_group->nattrs[nid]->kobj_attr.attr);
> > > > > 
> > > > > This still looks broken to me, but I think this is more a problem that
> > > > > was present in the original code.
> > > > > 
> > > > > At this point @wi_group's reference count is zero because
> > > > > sysfs_wi_release() has been called. However, it can only be zero if it has
> > > > > properly transitioned through kobject_del() and final kobject_put(). It
> > > > > follows that kobject_del() arranges for kobj->sd to be NULL. That means
> > > > > that this *should* be hitting the WARN() in kernfs_remove_by_name_ns()
> > > > > for the !parent case.
> > > > > 
> > > > > So, either you are not triggering that path, or testing that path, but
> > > > > sys_remove_file() of the child attributes should be happening *before*
> > > > > sysfs_wi_release().
> > > > > 
> > > > > Did I miss something?
> > > > 
> > > > I think the missing change is that sysfs_wi_node_add() failures need to
> > > > be done with a sysfs_wi_node_delete() of the added attrs *before* the
> > > > kobject_del() of @wi_group.
> > > 
> > > Hi Dan Williams
> > > 
> > > Thank you very much for identifying this potential issue in the code.
> > > 
> > > As you pointed out, this seems to be a problem that was already present in
> > > the original implementation, and I agree that it needs to be addressed.
> > > 
> > > However, since this issue existed prior to the changes in this patch
> > > series, I believe it would be more appropriate to fix it in a separate
> > > follow-up patch rather than include it here.
> > 
> > I tend to disagree. The whole motivation of this series is to get the
> > kobject lifetime handling correct in order to add the new dynamic
> > capability. The claimed correctness fixups are incomplete. There is time
> > to respin this (we are only at -rc1) and get it right before landing the
> > new dynamic capability.
> > 
> > One of the outcomes of the "MM Process" topic at LSF/MM was that Andrew
> > wanted more feedback on when patches are not quite ready for prime-time
> > and I think this is an example of a patch set that deserves another spin
> > to meet the stated goals.
> > 
> > > I will start preparing a new patch to address this problem, and I would
> > > greatly appreciate it if you could review it once it's ready.
> > 
> > Will definitely review it. I will leave to Andrew if he wants an
> > incremental fixup on top of this series, or rebase on top of a fully
> > fixed baseline. My preference is finish fixing all the old kobject()
> > issues and then rebase the new dynamic work on top. Either way, do not
> > be afraid to ask Andrew to replace a series in -mm, that's a sign of the
> > process working as expected.
> 
> Thank you very much for your advice, and I completely agree with your
> recommendation. I will immediately ask Andrew to remove this patch series
> from -mm. Then, I will prepare a new version, v8, which properly addresses
> the kobject-related issues you pointed out.
> 
> Once again, I sincerely appreciate your thoughtful and detailed feedback.
> 
> Rakie
> 

To Andrew

I sincerely apologize for the inconvenience. It appears that this commit still
requires additional corrections. I would appreciate it if you could drop the
changes you merged into -mm, mm-new branch yesterday.

<1>
The patch titled
     Subject: mm/mempolicy: fix memory leaks in weighted interleave sysfs
has been added to the -mm mm-new branch.  Its filename is
     mm-mempolicy-fix-memory-leaks-in-weighted-interleave-sysfs.patch
This patch will shortly appear at
     https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patches/mm-mempolicy-fix-memory-leaks-in-weighted-interleave-sysfs.patch

<2>
The patch titled
     Subject: mm/mempolicy: prepare weighted interleave sysfs for memory hotplug
has been added to the -mm mm-new branch.  Its filename is
     mm-mempolicy-prepare-weighted-interleave-sysfs-for-memory-hotplug.patch
This patch will shortly appear at
     https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patches/mm-mempolicy-prepare-weighted-interleave-sysfs-for-memory-hotplug.patch

<3>
The patch titled
     Subject: mm/mempolicy: support memory hotplug in weighted interleave
has been added to the -mm mm-new branch.  Its filename is
     mm-mempolicy-support-memory-hotplug-in-weighted-interleave.patch
This patch will shortly appear at
     https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patches/mm-mempolicy-support-memory-hotplug-in-weighted-interleave.patch

Rakie
Andrew Morton April 11, 2025, 3:11 a.m. UTC | #8
On Thu, 10 Apr 2025 17:06:19 +0900 Rakie Kim <rakie.kim@sk.com> wrote:

> I sincerely apologize for the inconvenience. It appears that this commit still
> requires additional corrections. I would appreciate it if you could drop the
> changes you merged into -mm, mm-new branch yesterday.

No problems, it happens, glad to be of service.  Dropped.
Rakie Kim April 11, 2025, 7:21 a.m. UTC | #9
On Tue, 8 Apr 2025 20:54:48 -0700 Dan Williams <dan.j.williams@intel.com> wrote:
> Dan Williams wrote:
> > >  
> > > +struct sysfs_wi_group {
> > > +	struct kobject wi_kobj;
> > > +	struct iw_node_attr *nattrs[];
> > > +};
> > > +
> > > +static struct sysfs_wi_group *wi_group;
> > > +
> > >  static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
> > >  			 char *buf)
> > >  {
> > > @@ -3461,27 +3468,24 @@ static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
> > >  	return count;
> > >  }
> > >  
> > > -static struct iw_node_attr **node_attrs;
> > > -
> > > -static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
> > > -				  struct kobject *parent)
> > > +static void sysfs_wi_node_delete(int nid)
> > >  {
> > > -	if (!node_attr)
> > > +	if (!wi_group->nattrs[nid])
> > >  		return;
> > > -	sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
> > > -	kfree(node_attr->kobj_attr.attr.name);
> > > -	kfree(node_attr);
> > > +
> > > +	sysfs_remove_file(&wi_group->wi_kobj,
> > > +			  &wi_group->nattrs[nid]->kobj_attr.attr);
> > 
> > This still looks broken to me, but I think this is more a problem that
> > was present in the original code.
> > 
> > At this point @wi_group's reference count is zero because
> > sysfs_wi_release() has been called. However, it can only be zero if it has
> > properly transitioned through kobject_del() and final kobject_put(). It
> > follows that kobject_del() arranges for kobj->sd to be NULL. That means
> > that this *should* be hitting the WARN() in kernfs_remove_by_name_ns()
> > for the !parent case.
> > 
> > So, either you are not triggering that path, or testing that path, but
> > sys_remove_file() of the child attributes should be happening *before*
> > sysfs_wi_release().
> > 
> > Did I miss something?
> 
> I think the missing change is that sysfs_wi_node_add() failures need to
> be done with a sysfs_wi_node_delete() of the added attrs *before* the
> kobject_del() of @wi_group.

Hi Dan,

Thank you for pointing out this issue.

As you suggested, I believe the most appropriate way to handle this is
to incorporate your feedback into Patch 1 
(mm/mempolicy: Fix memory leaks in weighted interleave sysfs).

To ensure that sysfs_remove_file() is called before kobject_del(), I
have restructured the code as follows:

<Previously>
static void sysfs_wi_release(struct kobject *wi_kobj)
{
	int nid;

	for (nid = 0; nid < nr_node_ids; nid++)
		sysfs_wi_node_delete(node_attrs[nid], wi_kobj);
		-> ERROR: sysfs_remove_file called here
	kfree(node_attrs);
	kfree(wi_kobj);
}

<Now>
static void sysfs_wi_node_delete_all(struct kobject *wi_kobj)
{
	int nid;

	for (nid = 0; nid < nr_node_ids; nid++)
		sysfs_wi_node_delete(node_attrs[nid], wi_kobj);
		-> sysfs_remove_file called here
}

static void sysfs_wi_release(struct kobject *wi_kobj)
{
	kfree(node_attrs);
	kfree(wi_kobj);
}

In addition, I call sysfs_wi_node_delete_all() before kobject_del()
during error handling:

+err_cleanup_kobj:
+	sysfs_wi_node_delete_all(wi_kobj);
	kobject_del(wi_kobj);

I believe this resolves the issue you raised.

That said, I have a follow-up question. With this structure, when the
system is shutting down, sysfs_remove_file() will not be called. Based
on my review of other kernel subsystems, it seems that sysfs_remove_file()
is only called during module_exit() in driver code, and not in other
built-in subsystems.

Is this an acceptable practice? If you happen to know the expected
behavior in such cases, I would appreciate your insights.

Below is the full content of the updated Patch 1.
@@ -3463,8 +3463,8 @@ static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
 
 static struct iw_node_attr **node_attrs;
 
-static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
-                                 struct kobject *parent)
+static void sysfs_wi_node_delete(struct iw_node_attr *node_attr,
+                                struct kobject *parent)
 {
        if (!node_attr)
                return;
@@ -3473,13 +3473,16 @@ static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
        kfree(node_attr);
 }
 
-static void sysfs_wi_release(struct kobject *wi_kobj)
+static void sysfs_wi_node_delete_all(struct kobject *wi_kobj)
 {
-       int i;
+       int nid;
 
-       for (i = 0; i < nr_node_ids; i++)
-               sysfs_wi_node_release(node_attrs[i], wi_kobj);
+       for (nid = 0; nid < nr_node_ids; nid++)
+               sysfs_wi_node_delete(node_attrs[nid], wi_kobj);
+}
 
+static void sysfs_wi_release(struct kobject *wi_kobj)
+{
        kfree(node_attrs);
        kfree(wi_kobj);
 }
@@ -3547,13 +3550,14 @@ static int add_weighted_interleave_group(struct kobject *root_kobj)
                err = add_weight_node(nid, wi_kobj);
                if (err) {
                        pr_err("failed to add sysfs [node%d]\n", nid);
-                       goto err_del_kobj;
+                       goto err_cleanup_kobj;
                }
        }
 
        return 0;
 
-err_del_kobj:
+err_cleanup_kobj:
+       sysfs_wi_node_delete_all(wi_kobj);
        kobject_del(wi_kobj);
 err_put_kobj:
        kobject_put(wi_kobj);

Thank you again for your helpful feedback.

Rakie
Dan Williams April 11, 2025, 10:24 p.m. UTC | #10
Rakie Kim wrote:
> On Tue, 8 Apr 2025 20:54:48 -0700 Dan Williams <dan.j.williams@intel.com> wrote:
> > Dan Williams wrote:
> > > >  
> > > > +struct sysfs_wi_group {
> > > > +	struct kobject wi_kobj;
> > > > +	struct iw_node_attr *nattrs[];
> > > > +};
> > > > +
> > > > +static struct sysfs_wi_group *wi_group;
> > > > +
> > > >  static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
> > > >  			 char *buf)
> > > >  {
> > > > @@ -3461,27 +3468,24 @@ static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
> > > >  	return count;
> > > >  }
> > > >  
> > > > -static struct iw_node_attr **node_attrs;
> > > > -
> > > > -static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
> > > > -				  struct kobject *parent)
> > > > +static void sysfs_wi_node_delete(int nid)
> > > >  {
> > > > -	if (!node_attr)
> > > > +	if (!wi_group->nattrs[nid])
> > > >  		return;
> > > > -	sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
> > > > -	kfree(node_attr->kobj_attr.attr.name);
> > > > -	kfree(node_attr);
> > > > +
> > > > +	sysfs_remove_file(&wi_group->wi_kobj,
> > > > +			  &wi_group->nattrs[nid]->kobj_attr.attr);
> > > 
> > > This still looks broken to me, but I think this is more a problem that
> > > was present in the original code.
> > > 
> > > At this point @wi_group's reference count is zero because
> > > sysfs_wi_release() has been called. However, it can only be zero if it has
> > > properly transitioned through kobject_del() and final kobject_put(). It
> > > follows that kobject_del() arranges for kobj->sd to be NULL. That means
> > > that this *should* be hitting the WARN() in kernfs_remove_by_name_ns()
> > > for the !parent case.
> > > 
> > > So, either you are not triggering that path, or testing that path, but
> > > sys_remove_file() of the child attributes should be happening *before*
> > > sysfs_wi_release().
> > > 
> > > Did I miss something?
> > 
> > I think the missing change is that sysfs_wi_node_add() failures need to
> > be done with a sysfs_wi_node_delete() of the added attrs *before* the
> > kobject_del() of @wi_group.
> 
> Hi Dan,
> 
> Thank you for pointing out this issue.
> 
> As you suggested, I believe the most appropriate way to handle this is
> to incorporate your feedback into Patch 1 
> (mm/mempolicy: Fix memory leaks in weighted interleave sysfs).
> 
> To ensure that sysfs_remove_file() is called before kobject_del(), I
> have restructured the code as follows:
> 
> <Previously>
> static void sysfs_wi_release(struct kobject *wi_kobj)
> {
> 	int nid;
> 
> 	for (nid = 0; nid < nr_node_ids; nid++)
> 		sysfs_wi_node_delete(node_attrs[nid], wi_kobj);
> 		-> ERROR: sysfs_remove_file called here
> 	kfree(node_attrs);
> 	kfree(wi_kobj);
> }
> 
> <Now>
> static void sysfs_wi_node_delete_all(struct kobject *wi_kobj)
> {
> 	int nid;
> 
> 	for (nid = 0; nid < nr_node_ids; nid++)
> 		sysfs_wi_node_delete(node_attrs[nid], wi_kobj);

At this point the nodes were live which means userspace could have
triggered an iw_table update. So I would expect that after all node
files have been deleted then this function frees the iw_table.

> 		-> sysfs_remove_file called here

Call iw_table_free() after the loop, where that is something like below
(untested!):

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b28a1e6ae096..88538f23c7d4 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -3430,6 +3430,28 @@ static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
 	return sysfs_emit(buf, "%d\n", weight);
 }
 
+static void iw_table_install(u8 *new, struct iw_node_attr *node_attr, u8 weight)
+{
+	u8 *old;
+
+	mutex_lock(&iw_table_lock);
+	old = rcu_dereference_protected(iw_table,
+					lockdep_is_held(&iw_table_lock));
+	if (old && new)
+		memcpy(new, old, nr_node_ids);
+	if (new)
+		new[node_attr->nid] = weight;
+	rcu_assign_pointer(iw_table, new);
+	mutex_unlock(&iw_table_lock);
+	synchronize_rcu();
+	kfree(old);
+}
+
+static void iw_table_free(void)
+{
+	iw_table_install(NULL, NULL, 0);
+}
+
 static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
 			  const char *buf, size_t count)
 {
@@ -3447,17 +3469,8 @@ static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
 	new = kzalloc(nr_node_ids, GFP_KERNEL);
 	if (!new)
 		return -ENOMEM;
+	iw_table_install(new, node_attr, weight);
 
-	mutex_lock(&iw_table_lock);
-	old = rcu_dereference_protected(iw_table,
-					lockdep_is_held(&iw_table_lock));
-	if (old)
-		memcpy(new, old, nr_node_ids);
-	new[node_attr->nid] = weight;
-	rcu_assign_pointer(iw_table, new);
-	mutex_unlock(&iw_table_lock);
-	synchronize_rcu();
-	kfree(old);
 	return count;
 }
 
@@ -3550,15 +3563,6 @@ static int add_weighted_interleave_group(struct kobject *root_kobj)
 
 static void mempolicy_kobj_release(struct kobject *kobj)
 {
-	u8 *old;
-
-	mutex_lock(&iw_table_lock);
-	old = rcu_dereference_protected(iw_table,
-					lockdep_is_held(&iw_table_lock));
-	rcu_assign_pointer(iw_table, NULL);
-	mutex_unlock(&iw_table_lock);
-	synchronize_rcu();
-	kfree(old);
 	kfree(node_attrs);
 	kfree(kobj);
 }

> }
> 
> static void sysfs_wi_release(struct kobject *wi_kobj)
> {
> 	kfree(node_attrs);
> 	kfree(wi_kobj);
> }
> 
> In addition, I call sysfs_wi_node_delete_all() before kobject_del()
> during error handling:
> 
> +err_cleanup_kobj:
> +	sysfs_wi_node_delete_all(wi_kobj);
> 	kobject_del(wi_kobj);
> 
> I believe this resolves the issue you raised.

Yes, along with the iw_table_free() change because while it is not a
leak, it is awkward that mempolicy_kobj_release arranges to keep
iw_table allocated long past the time the node attributes have been
deleted and shutdown in sysfs.

> That said, I have a follow-up question. With this structure, when the
> system is shutting down, sysfs_remove_file() will not be called. Based
> on my review of other kernel subsystems, it seems that sysfs_remove_file()
> is only called during module_exit() in driver code, and not in other
> built-in subsystems.

Correct.

> Is this an acceptable practice? If you happen to know the expected
> behavior in such cases, I would appreciate your insights.

Yes, there are plenty of examples of sysfs infrastructure that gets set
up, but never torn down for the life of the kernel. The goal here is to
make the error unwind path correct and make the code clean for potentially
deleting mempolicy_kobj infrastructure in the future, but it is
otherwise ok if the only path that calls kobject_del() for an object is
the error unwind path.

> 
> Below is the full content of the updated Patch 1.
> @@ -3463,8 +3463,8 @@ static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
>  
>  static struct iw_node_attr **node_attrs;
>  
> -static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
> -                                 struct kobject *parent)
> +static void sysfs_wi_node_delete(struct iw_node_attr *node_attr,
> +                                struct kobject *parent)
>  {
>         if (!node_attr)
>                 return;
> @@ -3473,13 +3473,16 @@ static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
>         kfree(node_attr);
>  }
>  
> -static void sysfs_wi_release(struct kobject *wi_kobj)
> +static void sysfs_wi_node_delete_all(struct kobject *wi_kobj)
>  {
> -       int i;
> +       int nid;
>  
> -       for (i = 0; i < nr_node_ids; i++)
> -               sysfs_wi_node_release(node_attrs[i], wi_kobj);
> +       for (nid = 0; nid < nr_node_ids; nid++)
> +               sysfs_wi_node_delete(node_attrs[nid], wi_kobj);
> +}
>  
> +static void sysfs_wi_release(struct kobject *wi_kobj)
> +{
>         kfree(node_attrs);
>         kfree(wi_kobj);
>  }
> @@ -3547,13 +3550,14 @@ static int add_weighted_interleave_group(struct kobject *root_kobj)
>                 err = add_weight_node(nid, wi_kobj);
>                 if (err) {
>                         pr_err("failed to add sysfs [node%d]\n", nid);
> -                       goto err_del_kobj;
> +                       goto err_cleanup_kobj;
>                 }
>         }
>  
>         return 0;
>  
> -err_del_kobj:
> +err_cleanup_kobj:
> +       sysfs_wi_node_delete_all(wi_kobj);
>         kobject_del(wi_kobj);
>  err_put_kobj:
>         kobject_put(wi_kobj);
> 
> Thank you again for your helpful feedback.

Hey, thanks for the patience to get this all fixed up properly.
diff mbox series

Patch

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0da102aa1cfc..988575f29c53 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -3419,6 +3419,13 @@  struct iw_node_attr {
 	int nid;
 };
 
+struct sysfs_wi_group {
+	struct kobject wi_kobj;
+	struct iw_node_attr *nattrs[];
+};
+
+static struct sysfs_wi_group *wi_group;
+
 static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
 			 char *buf)
 {
@@ -3461,27 +3468,24 @@  static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
 	return count;
 }
 
-static struct iw_node_attr **node_attrs;
-
-static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
-				  struct kobject *parent)
+static void sysfs_wi_node_delete(int nid)
 {
-	if (!node_attr)
+	if (!wi_group->nattrs[nid])
 		return;
-	sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
-	kfree(node_attr->kobj_attr.attr.name);
-	kfree(node_attr);
+
+	sysfs_remove_file(&wi_group->wi_kobj,
+			  &wi_group->nattrs[nid]->kobj_attr.attr);
+	kfree(wi_group->nattrs[nid]->kobj_attr.attr.name);
+	kfree(wi_group->nattrs[nid]);
 }
 
 static void sysfs_wi_release(struct kobject *wi_kobj)
 {
-	int i;
-
-	for (i = 0; i < nr_node_ids; i++)
-		sysfs_wi_node_release(node_attrs[i], wi_kobj);
+	int nid;
 
-	kfree(node_attrs);
-	kfree(wi_kobj);
+	for (nid = 0; nid < nr_node_ids; nid++)
+		sysfs_wi_node_delete(nid);
+	kfree(wi_group);
 }
 
 static const struct kobj_type wi_ktype = {
@@ -3489,7 +3493,7 @@  static const struct kobj_type wi_ktype = {
 	.release = sysfs_wi_release,
 };
 
-static int add_weight_node(int nid, struct kobject *wi_kobj)
+static int sysfs_wi_node_add(int nid)
 {
 	struct iw_node_attr *node_attr;
 	char *name;
@@ -3511,40 +3515,33 @@  static int add_weight_node(int nid, struct kobject *wi_kobj)
 	node_attr->kobj_attr.store = node_store;
 	node_attr->nid = nid;
 
-	if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) {
+	if (sysfs_create_file(&wi_group->wi_kobj, &node_attr->kobj_attr.attr)) {
 		kfree(node_attr->kobj_attr.attr.name);
 		kfree(node_attr);
 		pr_err("failed to add attribute to weighted_interleave\n");
 		return -ENOMEM;
 	}
 
-	node_attrs[nid] = node_attr;
+	wi_group->nattrs[nid] = node_attr;
 	return 0;
 }
 
-static int add_weighted_interleave_group(struct kobject *root_kobj)
+static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
 {
-	struct kobject *wi_kobj;
 	int nid, err;
 
-	node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *),
-			     GFP_KERNEL);
-	if (!node_attrs)
+	wi_group = kzalloc(struct_size(wi_group, nattrs, nr_node_ids),
+			   GFP_KERNEL);
+	if (!wi_group)
 		return -ENOMEM;
 
-	wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
-	if (!wi_kobj) {
-		kfree(node_attrs);
-		return -ENOMEM;
-	}
-
-	err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj,
+	err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj,
 				   "weighted_interleave");
 	if (err)
 		goto err_put_kobj;
 
 	for_each_node_state(nid, N_POSSIBLE) {
-		err = add_weight_node(nid, wi_kobj);
+		err = sysfs_wi_node_add(nid);
 		if (err) {
 			pr_err("failed to add sysfs [node%d]\n", nid);
 			goto err_del_kobj;
@@ -3554,9 +3551,9 @@  static int add_weighted_interleave_group(struct kobject *root_kobj)
 	return 0;
 
 err_del_kobj:
-	kobject_del(wi_kobj);
+	kobject_del(&wi_group->wi_kobj);
 err_put_kobj:
-	kobject_put(wi_kobj);
+	kobject_put(&wi_group->wi_kobj);
 	return err;
 }