diff mbox series

[1/3] mm/mempolicy: implement the sysfs-based weighted_interleave interface

Message ID 20240112210834.8035-2-gregory.price@memverge.com (mailing list archive)
State New
Headers show
Series mm/mempolicy: weighted interleave mempolicy with sysfs extension | expand

Commit Message

Gregory Price Jan. 12, 2024, 9:08 p.m. UTC
From: Rakie Kim <rakie.kim@sk.com>

This patch provides a way to set interleave weight information under
sysfs at /sys/kernel/mm/mempolicy/weighted_interleave/nodeN

The sysfs structure is designed as follows.

  $ tree /sys/kernel/mm/mempolicy/
  /sys/kernel/mm/mempolicy/ [1]
  └── weighted_interleave [2]
      ├── node0 [3]
      └── node1

Each file above can be explained as follows.

[1] mm/mempolicy: configuration interface for mempolicy subsystem

[2] weighted_interleave/: config interface for weighted interleave policy

[3] weighted_interleave/nodeN: weight for nodeN

Internally, there is a secondary table `default_iw_table`, which holds
kernel-internal default interleave weights for each possible node.

If the value for a node is set to `0`, the default value will be used.

If sysfs is disabled in the config, interleave weights will default
to use `default_iw_table`.

Suggested-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Rakie Kim <rakie.kim@sk.com>
Signed-off-by: Honggyu Kim <honggyu.kim@sk.com>
Co-developed-by: Gregory Price <gregory.price@memverge.com>
Signed-off-by: Gregory Price <gregory.price@memverge.com>
Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com>
Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com>
---
 .../ABI/testing/sysfs-kernel-mm-mempolicy     |   4 +
 ...fs-kernel-mm-mempolicy-weighted-interleave |  26 ++
 mm/mempolicy.c                                | 251 ++++++++++++++++++
 3 files changed, 281 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy
 create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave

Comments

Huang, Ying Jan. 15, 2024, 3:18 a.m. UTC | #1
Gregory Price <gourry.memverge@gmail.com> writes:

> From: Rakie Kim <rakie.kim@sk.com>
>
> This patch provides a way to set interleave weight information under
> sysfs at /sys/kernel/mm/mempolicy/weighted_interleave/nodeN
>
> The sysfs structure is designed as follows.
>
>   $ tree /sys/kernel/mm/mempolicy/
>   /sys/kernel/mm/mempolicy/ [1]
>   └── weighted_interleave [2]
>       ├── node0 [3]
>       └── node1
>
> Each file above can be explained as follows.
>
> [1] mm/mempolicy: configuration interface for mempolicy subsystem
>
> [2] weighted_interleave/: config interface for weighted interleave policy
>
> [3] weighted_interleave/nodeN: weight for nodeN
>
> Internally, there is a secondary table `default_iw_table`, which holds
> kernel-internal default interleave weights for each possible node.
>
> If the value for a node is set to `0`, the default value will be used.
>
> If sysfs is disabled in the config, interleave weights will default
> to use `default_iw_table`.
>
> Suggested-by: Huang Ying <ying.huang@intel.com>
> Signed-off-by: Rakie Kim <rakie.kim@sk.com>
> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com>
> Co-developed-by: Gregory Price <gregory.price@memverge.com>
> Signed-off-by: Gregory Price <gregory.price@memverge.com>
> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com>
> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com>
> ---
>  .../ABI/testing/sysfs-kernel-mm-mempolicy     |   4 +
>  ...fs-kernel-mm-mempolicy-weighted-interleave |  26 ++
>  mm/mempolicy.c                                | 251 ++++++++++++++++++
>  3 files changed, 281 insertions(+)
>  create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy
>  create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave
>
> diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy
> new file mode 100644
> index 000000000000..2dcf24f4384a
> --- /dev/null
> +++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy
> @@ -0,0 +1,4 @@
> +What:		/sys/kernel/mm/mempolicy/
> +Date:		December 2023
> +Contact:	Linux memory management mailing list <linux-mm@kvack.org>
> +Description:	Interface for Mempolicy
> diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave
> new file mode 100644
> index 000000000000..e6a38139bf0f
> --- /dev/null
> +++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave
> @@ -0,0 +1,26 @@
> +What:		/sys/kernel/mm/mempolicy/weighted_interleave/
> +Date:		December 2023
> +Contact:	Linux memory management mailing list <linux-mm@kvack.org>
> +Description:	Configuration Interface for the Weighted Interleave policy
> +
> +What:		/sys/kernel/mm/mempolicy/weighted_interleave/nodeN
> +Date:		December 2023
> +Contact:	Linux memory management mailing list <linux-mm@kvack.org>
> +Description:	Weight configuration interface for nodeN
> +
> +		The interleave weight for a memory node (N). These weights are
> +		utilized by processes which have set their mempolicy to
> +		MPOL_WEIGHTED_INTERLEAVE and have opted into global weights by
> +		omitting a task-local weight array.
> +
> +		These weights only affect new allocations, and changes at runtime
> +		will not cause migrations on already allocated pages.
> +
> +		The minimum weight for a node is always 1.
> +
> +		Minimum weight: 1
> +		Maximum weight: 255
> +
> +		Writing an empty string or `0` will reset the weight to the
> +		system default. The system default may be set by the kernel
> +		or drivers at boot or during hotplug events.
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 10a590ee1c89..5da4fd79fd18 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -131,6 +131,30 @@ static struct mempolicy default_policy = {
>  
>  static struct mempolicy preferred_node_policy[MAX_NUMNODES];
>  
> +struct iw_table {
> +	struct rcu_head rcu;
> +	u8 weights[MAX_NUMNODES];
> +};
> +/*
> + * default_iw_table is the kernel-internal default value interleave
> + * weight table. It is to be set by driver code capable of reading
> + * HMAT/CDAT information, and to provide mempolicy a sane set of
> + * default weight values for WEIGHTED_INTERLEAVE mode.
> + *
> + * By default, prior to HMAT/CDAT information being consumed, the
> + * default weight of all nodes is 1.  The default weight of any
> + * node can only be in the range 1-255. A 0-weight is not allowed.
> + */
> +static struct iw_table default_iw_table;
> +/*
> + * iw_table is the sysfs-set interleave weight table, a value of 0
> + * denotes that the default_iw_table value should be used.
> + *
> + * iw_table is RCU protected
> + */
> +static struct iw_table __rcu *iw_table;
> +static DEFINE_MUTEX(iw_table_mtx);

I greped "mtx" in kernel/*.c and mm/*.c and found nothing.  To following
the existing coding convention, better to name this as iw_table_mutex or
iw_table_lock?

And, I think this is used to protect both iw_table and default_iw_table?
If so, it deserves some comments.

> +
>  /**
>   * numa_nearest_node - Find nearest node by state
>   * @node: Node id to start the search
> @@ -3067,3 +3091,230 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
>  		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
>  			       nodemask_pr_args(&nodes));
>  }
> +
> +#ifdef CONFIG_SYSFS
> +struct iw_node_attr {
> +	struct kobj_attribute kobj_attr;
> +	int nid;
> +};
> +
> +static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
> +			 char *buf)
> +{
> +	struct iw_node_attr *node_attr;
> +	u8 weight;
> +	struct iw_table __rcu *table;
> +
> +	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
> +
> +	rcu_read_lock();
> +	table = rcu_dereference(iw_table);
> +	weight = table->weights[node_attr->nid];
> +	rcu_read_unlock();
> +
> +	return sysfs_emit(buf, "%d\n", weight);
> +}
> +
> +static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
> +			  const char *buf, size_t count)
> +{
> +	struct iw_node_attr *node_attr;
> +	struct iw_table __rcu *new;
> +	struct iw_table __rcu *old;
> +	u8 weight = 0;
> +
> +	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
> +	if (count == 0 || sysfs_streq(buf, ""))
> +		weight = 0;
> +	else if (kstrtou8(buf, 0, &weight))
> +		return -EINVAL;
> +
> +	new = kmalloc(sizeof(*new), GFP_KERNEL);
> +	if (!new)
> +		return -ENOMEM;
> +
> +	mutex_lock(&iw_table_mtx);
> +	old = rcu_dereference_protected(iw_table,
> +					lockdep_is_held(&iw_table_mtx));
> +	/* If value is 0, revert to default weight */
> +	weight = weight ? weight : default_iw_table.weights[node_attr->nid];

If we change the default weight in default_iw_table.weights[], how do we
identify whether the weight has been customized by users via sysfs?  So,
I suggest to use 0 in iw_table for not-customized weight.

And if so, we need to use RCU to access default_iw_table too.

> +	memcpy(&new->weights, &old->weights, sizeof(new->weights));
> +	new->weights[node_attr->nid] = weight;
> +	rcu_assign_pointer(iw_table, new);
> +	mutex_unlock(&iw_table_mtx);
> +	kfree_rcu(old, rcu);

synchronize_rcu() should be OK here.  It's fast enough in this cold
path.  This make it good to define iw_table as

u8 __rcu *iw_table;

Then, we only need to allocate nr_node_ids elements now.

> +	return count;
> +}
> +
> +static struct iw_node_attr *node_attrs[MAX_NUMNODES];
> +
> +static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
> +				  struct kobject *parent)
> +{
> +	if (!node_attr)
> +		return;
> +	sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
> +	kfree(node_attr->kobj_attr.attr.name);
> +	kfree(node_attr);
> +}
> +
> +static void sysfs_wi_release(struct kobject *wi_kobj)
> +{
> +	int i;
> +
> +	for (i = 0; i < MAX_NUMNODES; i++)
> +		sysfs_wi_node_release(node_attrs[i], wi_kobj);
> +	kobject_put(wi_kobj);
> +}
> +
> +static const struct kobj_type wi_ktype = {
> +	.sysfs_ops = &kobj_sysfs_ops,
> +	.release = sysfs_wi_release,
> +};
> +
> +static int add_weight_node(int nid, struct kobject *wi_kobj)
> +{
> +	struct iw_node_attr *node_attr;
> +	char *name;
> +
> +	node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL);
> +	if (!node_attr)
> +		return -ENOMEM;
> +
> +	name = kasprintf(GFP_KERNEL, "node%d", nid);
> +	if (!name) {
> +		kfree(node_attr);
> +		return -ENOMEM;
> +	}
> +
> +	sysfs_attr_init(&node_attr->kobj_attr.attr);
> +	node_attr->kobj_attr.attr.name = name;
> +	node_attr->kobj_attr.attr.mode = 0644;
> +	node_attr->kobj_attr.show = node_show;
> +	node_attr->kobj_attr.store = node_store;
> +	node_attr->nid = nid;
> +
> +	if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) {
> +		kfree(node_attr->kobj_attr.attr.name);
> +		kfree(node_attr);
> +		pr_err("failed to add attribute to weighted_interleave\n");
> +		return -ENOMEM;
> +	}
> +
> +	node_attrs[nid] = node_attr;
> +	return 0;
> +}
> +
> +static int add_weighted_interleave_group(struct kobject *root_kobj)
> +{
> +	struct kobject *wi_kobj;
> +	int nid, err;
> +
> +	wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
> +	if (!wi_kobj)
> +		return -ENOMEM;
> +
> +	err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj,
> +				   "weighted_interleave");
> +	if (err) {
> +		kfree(wi_kobj);
> +		return err;
> +	}
> +
> +	memset(node_attrs, 0, sizeof(node_attrs));
> +	for_each_node_state(nid, N_POSSIBLE) {
> +		err = add_weight_node(nid, wi_kobj);
> +		if (err) {
> +			pr_err("failed to add sysfs [node%d]\n", nid);
> +			break;
> +		}
> +	}
> +	if (err)
> +		kobject_put(wi_kobj);
> +	return 0;
> +}
> +
> +static void mempolicy_kobj_release(struct kobject *kobj)
> +{
> +	if (iw_table != &default_iw_table)
> +		kfree(iw_table);
> +	kfree(kobj);
> +}
> +
> +static const struct kobj_type mempolicy_ktype = {
> +	.release = mempolicy_kobj_release
> +};
> +
> +static struct kobject *mempolicy_kobj;
> +static int __init mempolicy_sysfs_init(void)
> +{
> +	int err;
> +	struct kobject *mempolicy_kobj;
> +	struct iw_table __rcu *table = NULL;
> +
> +	/*
> +	 * If sysfs setup fails, utilize the default weight table
> +	 * This at least allows mempolicy to continue functioning safely.
> +	 */
> +	memset(&default_iw_table.weights, 1, MAX_NUMNODES);
> +	iw_table = &default_iw_table;
> +
> +	table = kzalloc(sizeof(struct iw_table), GFP_KERNEL);
> +	if (!table)
> +		return -ENOMEM;
> +
> +	memcpy(&table->weights, default_iw_table.weights,
> +	       sizeof(table->weights));
> +
> +	mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL);
> +	if (!mempolicy_kobj) {
> +		kfree(table);
> +		pr_err("failed to add mempolicy kobject to the system\n");
> +		return -ENOMEM;
> +	}
> +	err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj,
> +				   "mempolicy");
> +	if (err) {
> +		kfree(table);
> +		kfree(mempolicy_kobj);
> +		return err;
> +	}
> +
> +	err = add_weighted_interleave_group(mempolicy_kobj);
> +
> +	if (err) {
> +		kobject_put(mempolicy_kobj);
> +		return err;
> +	}
> +
> +	iw_table = table;
> +	return err;
> +}
> +
> +static void __exit mempolicy_exit(void)
> +{
> +	if (mempolicy_kobj)
> +		kobject_put(mempolicy_kobj);
> +}
> +
> +#else
> +static int __init mempolicy_sysfs_init(void)
> +{
> +	/*
> +	 * if sysfs is not enabled MPOL_WEIGHTED_INTERLEAVE defaults to
> +	 * MPOL_INTERLEAVE behavior, but is still defined separately to
> +	 * allow task-local weighted interleave and system-defaults to
> +	 * operate as intended.
> +	 *
> +	 * In this scenario iw_table cannot (presently) change, so
> +	 * there's no need to set up RCU / cleanup code.
> +	 */
> +	memset(&default_iw_table.weights, 1, sizeof(default_iw_table));

This depends on sizeof(default_iw_table.weights[0]) == 1, I think it's
better to use explicit loop here to make the code more robust a little.

> +	iw_table = default_iw_table;

iw_table = &default_iw_table;

?

> +	return 0;
> +}
> +
> +static void __exit mempolicy_exit(void) { }
> +#endif /* CONFIG_SYSFS */
> +late_initcall(mempolicy_sysfs_init);
> +module_exit(mempolicy_exit);

--
Best Regards,
Huang, Ying
Gregory Price Jan. 17, 2024, 5:24 a.m. UTC | #2
On Mon, Jan 15, 2024 at 11:18:00AM +0800, Huang, Ying wrote:
> Gregory Price <gourry.memverge@gmail.com> writes:
> 
> > +static struct iw_table default_iw_table;
> > +/*
> > + * iw_table is the sysfs-set interleave weight table, a value of 0
> > + * denotes that the default_iw_table value should be used.
> > + *
> > + * iw_table is RCU protected
> > + */
> > +static struct iw_table __rcu *iw_table;
> > +static DEFINE_MUTEX(iw_table_mtx);
> 
> I greped "mtx" in kernel/*.c and mm/*.c and found nothing.  To following
> the existing coding convention, better to name this as iw_table_mutex or
> iw_table_lock?
> 

ack.

> And, I think this is used to protect both iw_table and default_iw_table?
> If so, it deserves some comments.
> 

Right now default_iw_table cannot be updated, and so it is neither
protected nor requires protection.

I planned to add the protection comment in the next patch series, which
would implement the kernel-side interface for updating the default
weights during boot/hotplug.

We haven't had the discussion on how/when this should happen yet,
though, and there's some research to be done.  (i.e. when should DRAM
weights be set? should the entire table be reweighted on hotplug? etc)

> > +static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
> > +			  const char *buf, size_t count)
> > +{
> > +	struct iw_node_attr *node_attr;
> > +	struct iw_table __rcu *new;
> > +	struct iw_table __rcu *old;
> > +	u8 weight = 0;
> > +
> > +	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
> > +	if (count == 0 || sysfs_streq(buf, ""))
> > +		weight = 0;
> > +	else if (kstrtou8(buf, 0, &weight))
> > +		return -EINVAL;
> > +
> > +	new = kmalloc(sizeof(*new), GFP_KERNEL);
> > +	if (!new)
> > +		return -ENOMEM;
> > +
> > +	mutex_lock(&iw_table_mtx);
> > +	old = rcu_dereference_protected(iw_table,
> > +					lockdep_is_held(&iw_table_mtx));
> > +	/* If value is 0, revert to default weight */
> > +	weight = weight ? weight : default_iw_table.weights[node_attr->nid];
> 
> If we change the default weight in default_iw_table.weights[], how do we
> identify whether the weight has been customized by users via sysfs?  So,
> I suggest to use 0 in iw_table for not-customized weight.
> 
> And if so, we need to use RCU to access default_iw_table too.
>

Dumb simplification on my part, I'll walk this back and add the 

if (!weight) weight = default_iw_table[node]

logic back into the allocator paths accordinly.

> > +	memcpy(&new->weights, &old->weights, sizeof(new->weights));
> > +	new->weights[node_attr->nid] = weight;
> > +	rcu_assign_pointer(iw_table, new);
> > +	mutex_unlock(&iw_table_mtx);
> > +	kfree_rcu(old, rcu);
> 
> synchronize_rcu() should be OK here.  It's fast enough in this cold
> path.  This make it good to define iw_table as
> 
I'll take a look.

> u8 __rcu *iw_table;
> 
> Then, we only need to allocate nr_node_ids elements now.
> 

We need nr_possible_nodes to handle hotplug correctly.

I decided to simplify this down to MAX_NUMNODES *juuuuuust in case*
"true node hotplug" ever becomes a reality.  If that happens, then
only allocating space for possible nodes creates a much bigger
headache on hotplug.

For the sake of that simplification, it seemed better to just eat the
1KB.  If you really want me to do that, I will, but the MAX_NUMNODES
choice was an explicitly defensive choice.

> > +static int __init mempolicy_sysfs_init(void)
> > +{
> > +	/*
> > +	 * if sysfs is not enabled MPOL_WEIGHTED_INTERLEAVE defaults to
> > +	 * MPOL_INTERLEAVE behavior, but is still defined separately to
> > +	 * allow task-local weighted interleave and system-defaults to
> > +	 * operate as intended.
> > +	 *
> > +	 * In this scenario iw_table cannot (presently) change, so
> > +	 * there's no need to set up RCU / cleanup code.
> > +	 */
> > +	memset(&default_iw_table.weights, 1, sizeof(default_iw_table));
> 
> This depends on sizeof(default_iw_table.weights[0]) == 1, I think it's
> better to use explicit loop here to make the code more robust a little.
> 

oh hm, you're right.  rookie mistake on my part.

Thanks,
Gregory
Huang, Ying Jan. 17, 2024, 6:58 a.m. UTC | #3
Gregory Price <gregory.price@memverge.com> writes:

> On Mon, Jan 15, 2024 at 11:18:00AM +0800, Huang, Ying wrote:
>> Gregory Price <gourry.memverge@gmail.com> writes:
>> 
>> > +static struct iw_table default_iw_table;
>> > +/*
>> > + * iw_table is the sysfs-set interleave weight table, a value of 0
>> > + * denotes that the default_iw_table value should be used.
>> > + *
>> > + * iw_table is RCU protected
>> > + */
>> > +static struct iw_table __rcu *iw_table;
>> > +static DEFINE_MUTEX(iw_table_mtx);
>> 
>> I greped "mtx" in kernel/*.c and mm/*.c and found nothing.  To following
>> the existing coding convention, better to name this as iw_table_mutex or
>> iw_table_lock?
>> 
>
> ack.
>
>> And, I think this is used to protect both iw_table and default_iw_table?
>> If so, it deserves some comments.
>> 
>
> Right now default_iw_table cannot be updated, and so it is neither
> protected nor requires protection.
>
> I planned to add the protection comment in the next patch series, which
> would implement the kernel-side interface for updating the default
> weights during boot/hotplug.
>
> We haven't had the discussion on how/when this should happen yet,
> though, and there's some research to be done.  (i.e. when should DRAM
> weights be set? should the entire table be reweighted on hotplug? etc)

Before that, I'm OK to remove default_iw_table and use hard coded "1" as
default weight for now.

>> > +static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
>> > +			  const char *buf, size_t count)
>> > +{
>> > +	struct iw_node_attr *node_attr;
>> > +	struct iw_table __rcu *new;
>> > +	struct iw_table __rcu *old;
>> > +	u8 weight = 0;
>> > +
>> > +	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
>> > +	if (count == 0 || sysfs_streq(buf, ""))
>> > +		weight = 0;
>> > +	else if (kstrtou8(buf, 0, &weight))
>> > +		return -EINVAL;
>> > +
>> > +	new = kmalloc(sizeof(*new), GFP_KERNEL);
>> > +	if (!new)
>> > +		return -ENOMEM;
>> > +
>> > +	mutex_lock(&iw_table_mtx);
>> > +	old = rcu_dereference_protected(iw_table,
>> > +					lockdep_is_held(&iw_table_mtx));
>> > +	/* If value is 0, revert to default weight */
>> > +	weight = weight ? weight : default_iw_table.weights[node_attr->nid];
>> 
>> If we change the default weight in default_iw_table.weights[], how do we
>> identify whether the weight has been customized by users via sysfs?  So,
>> I suggest to use 0 in iw_table for not-customized weight.
>> 
>> And if so, we need to use RCU to access default_iw_table too.
>>
>
> Dumb simplification on my part, I'll walk this back and add the 
>
> if (!weight) weight = default_iw_table[node]
>
> logic back into the allocator paths accordinly.
>
>> > +	memcpy(&new->weights, &old->weights, sizeof(new->weights));
>> > +	new->weights[node_attr->nid] = weight;
>> > +	rcu_assign_pointer(iw_table, new);
>> > +	mutex_unlock(&iw_table_mtx);
>> > +	kfree_rcu(old, rcu);
>> 
>> synchronize_rcu() should be OK here.  It's fast enough in this cold
>> path.  This make it good to define iw_table as
>> 
> I'll take a look.
>
>> u8 __rcu *iw_table;
>> 
>> Then, we only need to allocate nr_node_ids elements now.
>> 
>
> We need nr_possible_nodes to handle hotplug correctly.

nr_node_ids >= num_possible_nodes().  It's larger than any possible node
ID.

> I decided to simplify this down to MAX_NUMNODES *juuuuuust in case*
> "true node hotplug" ever becomes a reality.  If that happens, then
> only allocating space for possible nodes creates a much bigger
> headache on hotplug.
>
> For the sake of that simplification, it seemed better to just eat the
> 1KB.  If you really want me to do that, I will, but the MAX_NUMNODES
> choice was an explicitly defensive choice.

When "true node hotplug" becomes reality, we can make nr_node_ids ==
MAX_NUMNODES.  So, it's safe to use it.  Please take a look at
setup_nr_node_ids().

>> > +static int __init mempolicy_sysfs_init(void)
>> > +{
>> > +	/*
>> > +	 * if sysfs is not enabled MPOL_WEIGHTED_INTERLEAVE defaults to
>> > +	 * MPOL_INTERLEAVE behavior, but is still defined separately to
>> > +	 * allow task-local weighted interleave and system-defaults to
>> > +	 * operate as intended.
>> > +	 *
>> > +	 * In this scenario iw_table cannot (presently) change, so
>> > +	 * there's no need to set up RCU / cleanup code.
>> > +	 */
>> > +	memset(&default_iw_table.weights, 1, sizeof(default_iw_table));
>> 
>> This depends on sizeof(default_iw_table.weights[0]) == 1, I think it's
>> better to use explicit loop here to make the code more robust a little.
>> 
>
> oh hm, you're right.  rookie mistake on my part.
>

--
Best Regards,
Huang, Ying
Gregory Price Jan. 17, 2024, 5:46 p.m. UTC | #4
On Wed, Jan 17, 2024 at 02:58:08PM +0800, Huang, Ying wrote:
> Gregory Price <gregory.price@memverge.com> writes:
> 
> > We haven't had the discussion on how/when this should happen yet,
> > though, and there's some research to be done.  (i.e. when should DRAM
> > weights be set? should the entire table be reweighted on hotplug? etc)
> 
> Before that, I'm OK to remove default_iw_table and use hard coded "1" as
> default weight for now.
> 

Can't quite do that. default_iw_table is a static structure because we
need a reliable default structure not subject to module initialization
failure.  Otherwise we can end up in a situation where iw_table is NULL
during some allocation path if the sysfs structure fails to setup fully.

There's no good reason to fail allocations just because sysfs failed to
initialization for some reason.  I'll leave default_iw_table with a size
of MAX_NUMNODES for now (nr_node_ids is set up at runtime per your
reference to `setup_nr_node_ids` below, so we can't use it for this).

> >
> >> u8 __rcu *iw_table;
> >> 
> >> Then, we only need to allocate nr_node_ids elements now.
> >> 
> >
> > We need nr_possible_nodes to handle hotplug correctly.
> 
> nr_node_ids >= num_possible_nodes().  It's larger than any possible node
> ID.
>

nr_node_ids gets setup at runtime, while the default_iw_table needs
to be a static structure (see above).  I can make default_iw_table
MAX_NUMNODES and subsequent allocations of iw_table be nr_node_ids,
but that makes iw_table a different size at any given time.

This *will* break if "true hotplug" ever shows up and possible_nodes !=
MAX_NUMNODES. But I can write it up if it's a sticking point for you.

Ultimately we're squabbling over, at most, about ~3kb of memory, just
keep that in mind. (I guess if you spawn 3000 threads and each tries a
concurrent write to sysfs/node1, you'd eat 3MB view briefly, but that
is a truly degenerate case and I can think of more denegerate things).

> 
> When "true node hotplug" becomes reality, we can make nr_node_ids ==
> MAX_NUMNODES.  So, it's safe to use it.  Please take a look at
> setup_nr_node_ids().
> 

~Gregory
Huang, Ying Jan. 18, 2024, 4:37 a.m. UTC | #5
Gregory Price <gregory.price@memverge.com> writes:

> On Wed, Jan 17, 2024 at 02:58:08PM +0800, Huang, Ying wrote:
>> Gregory Price <gregory.price@memverge.com> writes:
>> 
>> > We haven't had the discussion on how/when this should happen yet,
>> > though, and there's some research to be done.  (i.e. when should DRAM
>> > weights be set? should the entire table be reweighted on hotplug? etc)
>> 
>> Before that, I'm OK to remove default_iw_table and use hard coded "1" as
>> default weight for now.
>> 
>
> Can't quite do that. default_iw_table is a static structure because we
> need a reliable default structure not subject to module initialization
> failure.  Otherwise we can end up in a situation where iw_table is NULL
> during some allocation path if the sysfs structure fails to setup fully.

As the first simplest implementation, we can avoid default_iw_table[].
Becuse it's constant.

> There's no good reason to fail allocations just because sysfs failed to
> initialization for some reason.  I'll leave default_iw_table with a size
> of MAX_NUMNODES for now (nr_node_ids is set up at runtime per your
> reference to `setup_nr_node_ids` below, so we can't use it for this).

We allocate memory during module initialization all over the places in
kernel.  I don't think it will cause any issue in practice.  Just some
additional checking for "default_iw_table == NULL".

And, we cannot make it just static, because we need to use RCU to keep
it consistent.  Otherwise, it may be changed during reading.

>> >
>> >> u8 __rcu *iw_table;
>> >> 
>> >> Then, we only need to allocate nr_node_ids elements now.
>> >> 
>> >
>> > We need nr_possible_nodes to handle hotplug correctly.
>> 
>> nr_node_ids >= num_possible_nodes().  It's larger than any possible node
>> ID.
>>
>
> nr_node_ids gets setup at runtime, while the default_iw_table needs
> to be a static structure (see above).  I can make default_iw_table
> MAX_NUMNODES and subsequent allocations of iw_table be nr_node_ids,
> but that makes iw_table a different size at any given time.
>
> This *will* break if "true hotplug" ever shows up and possible_nodes !=
> MAX_NUMNODES. But I can write it up if it's a sticking point for you.

I don't think it is an issue for "true hotplug".  Because we can set
nr_node_ids = MAX_NUMNODES even if there is something called "true
hotplug".

> Ultimately we're squabbling over, at most, about ~3kb of memory, just
> keep that in mind. (I guess if you spawn 3000 threads and each tries a
> concurrent write to sysfs/node1, you'd eat 3MB view briefly, but that
> is a truly degenerate case and I can think of more denegerate things).

Not just for memory wastage, it's about proper API too.

>> 
>> When "true node hotplug" becomes reality, we can make nr_node_ids ==
>> MAX_NUMNODES.  So, it's safe to use it.  Please take a look at
>> setup_nr_node_ids().
>> 

--
Best Regards,
Huang, Ying
diff mbox series

Patch

diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy
new file mode 100644
index 000000000000..2dcf24f4384a
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy
@@ -0,0 +1,4 @@ 
+What:		/sys/kernel/mm/mempolicy/
+Date:		December 2023
+Contact:	Linux memory management mailing list <linux-mm@kvack.org>
+Description:	Interface for Mempolicy
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave
new file mode 100644
index 000000000000..e6a38139bf0f
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave
@@ -0,0 +1,26 @@ 
+What:		/sys/kernel/mm/mempolicy/weighted_interleave/
+Date:		December 2023
+Contact:	Linux memory management mailing list <linux-mm@kvack.org>
+Description:	Configuration Interface for the Weighted Interleave policy
+
+What:		/sys/kernel/mm/mempolicy/weighted_interleave/nodeN
+Date:		December 2023
+Contact:	Linux memory management mailing list <linux-mm@kvack.org>
+Description:	Weight configuration interface for nodeN
+
+		The interleave weight for a memory node (N). These weights are
+		utilized by processes which have set their mempolicy to
+		MPOL_WEIGHTED_INTERLEAVE and have opted into global weights by
+		omitting a task-local weight array.
+
+		These weights only affect new allocations, and changes at runtime
+		will not cause migrations on already allocated pages.
+
+		The minimum weight for a node is always 1.
+
+		Minimum weight: 1
+		Maximum weight: 255
+
+		Writing an empty string or `0` will reset the weight to the
+		system default. The system default may be set by the kernel
+		or drivers at boot or during hotplug events.
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 10a590ee1c89..5da4fd79fd18 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -131,6 +131,30 @@  static struct mempolicy default_policy = {
 
 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 
+struct iw_table {
+	struct rcu_head rcu;
+	u8 weights[MAX_NUMNODES];
+};
+/*
+ * default_iw_table is the kernel-internal default value interleave
+ * weight table. It is to be set by driver code capable of reading
+ * HMAT/CDAT information, and to provide mempolicy a sane set of
+ * default weight values for WEIGHTED_INTERLEAVE mode.
+ *
+ * By default, prior to HMAT/CDAT information being consumed, the
+ * default weight of all nodes is 1.  The default weight of any
+ * node can only be in the range 1-255. A 0-weight is not allowed.
+ */
+static struct iw_table default_iw_table;
+/*
+ * iw_table is the sysfs-set interleave weight table, a value of 0
+ * denotes that the default_iw_table value should be used.
+ *
+ * iw_table is RCU protected
+ */
+static struct iw_table __rcu *iw_table;
+static DEFINE_MUTEX(iw_table_mtx);
+
 /**
  * numa_nearest_node - Find nearest node by state
  * @node: Node id to start the search
@@ -3067,3 +3091,230 @@  void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
 			       nodemask_pr_args(&nodes));
 }
+
+#ifdef CONFIG_SYSFS
+struct iw_node_attr {
+	struct kobj_attribute kobj_attr;
+	int nid;
+};
+
+static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
+			 char *buf)
+{
+	struct iw_node_attr *node_attr;
+	u8 weight;
+	struct iw_table __rcu *table;
+
+	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
+
+	rcu_read_lock();
+	table = rcu_dereference(iw_table);
+	weight = table->weights[node_attr->nid];
+	rcu_read_unlock();
+
+	return sysfs_emit(buf, "%d\n", weight);
+}
+
+static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
+			  const char *buf, size_t count)
+{
+	struct iw_node_attr *node_attr;
+	struct iw_table __rcu *new;
+	struct iw_table __rcu *old;
+	u8 weight = 0;
+
+	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
+	if (count == 0 || sysfs_streq(buf, ""))
+		weight = 0;
+	else if (kstrtou8(buf, 0, &weight))
+		return -EINVAL;
+
+	new = kmalloc(sizeof(*new), GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
+	mutex_lock(&iw_table_mtx);
+	old = rcu_dereference_protected(iw_table,
+					lockdep_is_held(&iw_table_mtx));
+	/* If value is 0, revert to default weight */
+	weight = weight ? weight : default_iw_table.weights[node_attr->nid];
+	memcpy(&new->weights, &old->weights, sizeof(new->weights));
+	new->weights[node_attr->nid] = weight;
+	rcu_assign_pointer(iw_table, new);
+	mutex_unlock(&iw_table_mtx);
+	kfree_rcu(old, rcu);
+	return count;
+}
+
+static struct iw_node_attr *node_attrs[MAX_NUMNODES];
+
+static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
+				  struct kobject *parent)
+{
+	if (!node_attr)
+		return;
+	sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
+	kfree(node_attr->kobj_attr.attr.name);
+	kfree(node_attr);
+}
+
+static void sysfs_wi_release(struct kobject *wi_kobj)
+{
+	int i;
+
+	for (i = 0; i < MAX_NUMNODES; i++)
+		sysfs_wi_node_release(node_attrs[i], wi_kobj);
+	kobject_put(wi_kobj);
+}
+
+static const struct kobj_type wi_ktype = {
+	.sysfs_ops = &kobj_sysfs_ops,
+	.release = sysfs_wi_release,
+};
+
+static int add_weight_node(int nid, struct kobject *wi_kobj)
+{
+	struct iw_node_attr *node_attr;
+	char *name;
+
+	node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL);
+	if (!node_attr)
+		return -ENOMEM;
+
+	name = kasprintf(GFP_KERNEL, "node%d", nid);
+	if (!name) {
+		kfree(node_attr);
+		return -ENOMEM;
+	}
+
+	sysfs_attr_init(&node_attr->kobj_attr.attr);
+	node_attr->kobj_attr.attr.name = name;
+	node_attr->kobj_attr.attr.mode = 0644;
+	node_attr->kobj_attr.show = node_show;
+	node_attr->kobj_attr.store = node_store;
+	node_attr->nid = nid;
+
+	if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) {
+		kfree(node_attr->kobj_attr.attr.name);
+		kfree(node_attr);
+		pr_err("failed to add attribute to weighted_interleave\n");
+		return -ENOMEM;
+	}
+
+	node_attrs[nid] = node_attr;
+	return 0;
+}
+
+static int add_weighted_interleave_group(struct kobject *root_kobj)
+{
+	struct kobject *wi_kobj;
+	int nid, err;
+
+	wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
+	if (!wi_kobj)
+		return -ENOMEM;
+
+	err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj,
+				   "weighted_interleave");
+	if (err) {
+		kfree(wi_kobj);
+		return err;
+	}
+
+	memset(node_attrs, 0, sizeof(node_attrs));
+	for_each_node_state(nid, N_POSSIBLE) {
+		err = add_weight_node(nid, wi_kobj);
+		if (err) {
+			pr_err("failed to add sysfs [node%d]\n", nid);
+			break;
+		}
+	}
+	if (err)
+		kobject_put(wi_kobj);
+	return 0;
+}
+
+static void mempolicy_kobj_release(struct kobject *kobj)
+{
+	if (iw_table != &default_iw_table)
+		kfree(iw_table);
+	kfree(kobj);
+}
+
+static const struct kobj_type mempolicy_ktype = {
+	.release = mempolicy_kobj_release
+};
+
+static struct kobject *mempolicy_kobj;
+static int __init mempolicy_sysfs_init(void)
+{
+	int err;
+	struct kobject *mempolicy_kobj;
+	struct iw_table __rcu *table = NULL;
+
+	/*
+	 * If sysfs setup fails, utilize the default weight table
+	 * This at least allows mempolicy to continue functioning safely.
+	 */
+	memset(&default_iw_table.weights, 1, MAX_NUMNODES);
+	iw_table = &default_iw_table;
+
+	table = kzalloc(sizeof(struct iw_table), GFP_KERNEL);
+	if (!table)
+		return -ENOMEM;
+
+	memcpy(&table->weights, default_iw_table.weights,
+	       sizeof(table->weights));
+
+	mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL);
+	if (!mempolicy_kobj) {
+		kfree(table);
+		pr_err("failed to add mempolicy kobject to the system\n");
+		return -ENOMEM;
+	}
+	err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj,
+				   "mempolicy");
+	if (err) {
+		kfree(table);
+		kfree(mempolicy_kobj);
+		return err;
+	}
+
+	err = add_weighted_interleave_group(mempolicy_kobj);
+
+	if (err) {
+		kobject_put(mempolicy_kobj);
+		return err;
+	}
+
+	iw_table = table;
+	return err;
+}
+
+static void __exit mempolicy_exit(void)
+{
+	if (mempolicy_kobj)
+		kobject_put(mempolicy_kobj);
+}
+
+#else
+static int __init mempolicy_sysfs_init(void)
+{
+	/*
+	 * if sysfs is not enabled MPOL_WEIGHTED_INTERLEAVE defaults to
+	 * MPOL_INTERLEAVE behavior, but is still defined separately to
+	 * allow task-local weighted interleave and system-defaults to
+	 * operate as intended.
+	 *
+	 * In this scenario iw_table cannot (presently) change, so
+	 * there's no need to set up RCU / cleanup code.
+	 */
+	memset(&default_iw_table.weights, 1, sizeof(default_iw_table));
+	iw_table = default_iw_table;
+	return 0;
+}
+
+static void __exit mempolicy_exit(void) { }
+#endif /* CONFIG_SYSFS */
+late_initcall(mempolicy_sysfs_init);
+module_exit(mempolicy_exit);