
[RFC,v4,2/7] mm/demotion: Expose per node memory tier to sysfs

Message ID 20220527122528.129445-3-aneesh.kumar@linux.ibm.com (mailing list archive)
State: New
Series: mm/demotion: Memory tiers and demotion

Commit Message

Aneesh Kumar K.V May 27, 2022, 12:25 p.m. UTC
From: Jagdish Gediya <jvgediya@linux.ibm.com>

Add support to read/write the memory tier index for a NUMA node.

/sys/devices/system/node/nodeN/memtier

where N = node id

When read, it lists the memory tier that the node belongs to.

When written, the kernel moves the node into the specified
memory tier; the tier assignment of all other nodes is not
affected.

If the memory tier does not exist, writing to the above file
creates the tier and assigns the NUMA node to that tier.
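
For example (an illustrative session; node and tier numbers are
hypothetical):

  # cat /sys/devices/system/node/node1/memtier
  1
  # echo 2 > /sys/devices/system/node/node1/memtier
  # cat /sys/devices/system/node/node1/memtier
  2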

A mutex, memory_tier_lock, is introduced to protect memory tier
related changes, as they can happen from sysfs as well as on
hotplug events.

Signed-off-by: Jagdish Gediya <jvgediya@linux.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 drivers/base/node.c     |  35 ++++++++++++++
 include/linux/migrate.h |   4 +-
 mm/migrate.c            | 103 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 141 insertions(+), 1 deletion(-)

Comments

Jonathan Cameron May 27, 2022, 2:15 p.m. UTC | #1
On Fri, 27 May 2022 17:55:23 +0530
"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> wrote:

> From: Jagdish Gediya <jvgediya@linux.ibm.com>
> 
> Add support to read/write the memory tierindex for a NUMA node.
> 
> /sys/devices/system/node/nodeN/memtier
> 
> where N = node id
> 
> When read, It list the memory tier that the node belongs to.
> 
> When written, the kernel moves the node into the specified
> memory tier, the tier assignment of all other nodes are not
> affected.
> 
> If the memory tier does not exist, writing to the above file
> create the tier and assign the NUMA node to that tier.
creates

There was some discussion in v2 of Wei Xu's RFC that what matters
for creation is the rank, not the tier number. 

My suggestion is to move to an explicit creation file such as
memtier/create_tier_from_rank, to which writing the rank results
in a new tier with the next device ID and the requested rank.
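
Under that scheme the flow might look like this (hypothetical session;
a concrete implementation is sketched later in the thread):

  # echo 100 > /sys/devices/system/memtier/create_tier_from_rank
  # cat /sys/devices/system/memtier/memtier2/rank
  100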

> 
> mutex memory_tier_lock is introduced to protect memory tier
> related chanegs as it can happen from sysfs as well on hot
> plug events.
> 
> Signed-off-by: Jagdish Gediya <jvgediya@linux.ibm.com>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  drivers/base/node.c     |  35 ++++++++++++++
>  include/linux/migrate.h |   4 +-
>  mm/migrate.c            | 103 ++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 141 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/base/node.c b/drivers/base/node.c
> index ec8bb24a5a22..cf4a58446d8c 100644
> --- a/drivers/base/node.c
> +++ b/drivers/base/node.c
> @@ -20,6 +20,7 @@
>  #include <linux/pm_runtime.h>
>  #include <linux/swap.h>
>  #include <linux/slab.h>
> +#include <linux/migrate.h>
>  
>  static struct bus_type node_subsys = {
>  	.name = "node",
> @@ -560,11 +561,45 @@ static ssize_t node_read_distance(struct device *dev,
>  }
>  static DEVICE_ATTR(distance, 0444, node_read_distance, NULL);
>  
> +#ifdef CONFIG_TIERED_MEMORY
> +static ssize_t memtier_show(struct device *dev,
> +			    struct device_attribute *attr,
> +			    char *buf)
> +{
> +	int node = dev->id;
> +
> +	return sysfs_emit(buf, "%d\n", node_get_memory_tier(node));
> +}
> +
> +static ssize_t memtier_store(struct device *dev,
> +			     struct device_attribute *attr,
> +			     const char *buf, size_t count)
> +{
> +	unsigned long tier;
> +	int node = dev->id;
> +
> +	int ret = kstrtoul(buf, 10, &tier);
> +	if (ret)
> +		return ret;
> +
> +	ret = node_reset_memory_tier(node, tier);

I don't follow why reset here rather than set.

> +	if (ret)
> +		return ret;
> +
> +	return count;
> +}
> +
> +static DEVICE_ATTR_RW(memtier);
> +#endif
> +
>  static struct attribute *node_dev_attrs[] = {
>  	&dev_attr_meminfo.attr,
>  	&dev_attr_numastat.attr,
>  	&dev_attr_distance.attr,
>  	&dev_attr_vmstat.attr,
> +#ifdef CONFIG_TIERED_MEMORY
> +	&dev_attr_memtier.attr,
> +#endif
>  	NULL
>  };
>  
> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> index 0ec653623565..d37d1d5dee82 100644
> --- a/include/linux/migrate.h
> +++ b/include/linux/migrate.h
> @@ -177,13 +177,15 @@ enum memory_tier_type {
>  };
>  
>  int next_demotion_node(int node);
> -

Tidy that up to reduce noise in the next version.

>  extern void migrate_on_reclaim_init(void);
>  #ifdef CONFIG_HOTPLUG_CPU
>  extern void set_migration_target_nodes(void);
>  #else
>  static inline void set_migration_target_nodes(void) {}
>  #endif
> +int node_get_memory_tier(int node);
> +int node_set_memory_tier(int node, int tier);
> +int node_reset_memory_tier(int node, int tier);
>  #else
>  #define numa_demotion_enabled	false
>  static inline int next_demotion_node(int node)
> diff --git a/mm/migrate.c b/mm/migrate.c
> index f28ee93fb017..304559ba3372 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -2132,6 +2132,7 @@ static struct bus_type memory_tier_subsys = {
>  	.dev_name = "memtier",
>  };
>  
> +DEFINE_MUTEX(memory_tier_lock);
>  static struct memory_tier *memory_tiers[MAX_MEMORY_TIERS];
>  
>  static ssize_t nodelist_show(struct device *dev,
> @@ -2225,6 +2226,108 @@ static const struct attribute_group *memory_tier_attr_groups[] = {
>  	NULL,
>  };
>  
> +static int __node_get_memory_tier(int node)
> +{
> +	int tier;
> +
> +	for (tier = 0; tier < MAX_MEMORY_TIERS; tier++) {
> +		if (memory_tiers[tier] && node_isset(node, memory_tiers[tier]->nodelist))
> +			return tier;
> +	}
> +
> +	return -1;
> +}
> +
> +int node_get_memory_tier(int node)
> +{
> +	int tier;
> +
> +	/*
> +	 * Make sure memory tier is not unregistered
> +	 * while it is being read.
> +	 */
> +	mutex_lock(&memory_tier_lock);
> +
> +	tier = __node_get_memory_tier(node);
> +
> +	mutex_unlock(&memory_tier_lock);
> +
> +	return tier;
> +}
> +
> +int __node_set_memory_tier(int node, int tier)
> +{
> +	int ret = 0;
> +	/*
> +	 * As register_memory_tier() for new tier can fail,
> +	 * try it before modifying existing tier. register
> +	 * tier makes tier visible in sysfs.
> +	 */
> +	if (!memory_tiers[tier]) {
> +		ret = register_memory_tier(tier);
> +		if (ret) {
> +			goto out;

No brackets needed around a single statement (goto out;).

It's also pointless here - just return ret directly.



> +		}
> +	}
> +
> +	node_set(node, memory_tiers[tier]->nodelist);
> +
> +out:
> +	return ret;
> +}
> +
> +int node_reset_memory_tier(int node, int tier)
> +{
> +	int current_tier, ret = 0;
> +
> +	mutex_lock(&memory_tier_lock);
> +
> +	current_tier = __node_get_memory_tier(node);
> +	if (current_tier == tier)
> +		goto out;
> +
> +	if (current_tier != -1 )
> +		node_clear(node, memory_tiers[current_tier]->nodelist);
> +
> +	ret = __node_set_memory_tier(node, tier);
> +
> +	if (!ret) {
> +		if (nodes_empty(memory_tiers[current_tier]->nodelist))
> +			unregister_memory_tier(current_tier);
flip logic so the error path is out of line.
	if (ret) {
		/* reset..
		ret = ... 
		goto out;
	}	
and have the 'good path' here with less indent.


	if (nodes_empty(memory...

will result in more 'idiomatic' code that is easier for reviewers to
read.  It's Friday afternoon. Don't make me think :)

> +	} else {

> +		/* reset it back to older tier */
> +		ret = __node_set_memory_tier(node, current_tier);
> +	}
> +out:
> +	mutex_unlock(&memory_tier_lock);
> +
> +	return ret;
> +}
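
A restructuring along those lines, with the error path out of line,
might look like this sketch (same helpers as the patch, untested; it
additionally skips the rollback and the nodes_empty() check when
current_tier is -1, which the version above would otherwise pass
straight to memory_tiers[] as an index):

int node_reset_memory_tier(int node, int tier)
{
	int current_tier, ret = 0;

	mutex_lock(&memory_tier_lock);

	current_tier = __node_get_memory_tier(node);
	if (current_tier == tier)
		goto out;

	if (current_tier != -1)
		node_clear(node, memory_tiers[current_tier]->nodelist);

	ret = __node_set_memory_tier(node, tier);
	if (ret) {
		/* error path out of line: put the node back in its old tier */
		if (current_tier != -1)
			__node_set_memory_tier(node, current_tier);
		goto out;
	}

	/* good path: drop the old tier if it is now empty */
	if (current_tier != -1 &&
	    nodes_empty(memory_tiers[current_tier]->nodelist))
		unregister_memory_tier(current_tier);
out:
	mutex_unlock(&memory_tier_lock);
	return ret;
}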
> +
> +int node_set_memory_tier(int node, int tier)
> +{
Currently seems to be unused code. Fine if it is used in 
a later patch, but call it out in the patch description.
 
> +	int current_tier, ret = 0;
> +
> +	if (tier >= MAX_MEMORY_TIERS)
> +		return -EINVAL;
> +
> +	mutex_lock(&memory_tier_lock);
> +	current_tier = __node_get_memory_tier(node);
> +	/*
> +	 * if node is already part of the tier proceed with the
> +	 * current tier value, because we might want to establish
> +	 * new migration paths now. The node might be added to a tier
> +	 * before it was made part of N_MEMORY, hence estabilish_migration_targets
> +	 * will have skipped this node.
> +	 */
> +	if (current_tier != -1)
> +		tier = current_tier;
> +	ret = __node_set_memory_tier(node, tier);
> +	mutex_unlock(&memory_tier_lock);
> +
> +	return ret;
> +}
> +
>  /*
>   * node_demotion[] example:
>   *
Aneesh Kumar K.V June 3, 2022, 8:40 a.m. UTC | #2
On 5/27/22 7:45 PM, Jonathan Cameron wrote:
> On Fri, 27 May 2022 17:55:23 +0530
> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> wrote:
> 
>> From: Jagdish Gediya <jvgediya@linux.ibm.com>
>>
>> Add support to read/write the memory tierindex for a NUMA node.
>>
>> /sys/devices/system/node/nodeN/memtier
>>
>> where N = node id
>>
>> When read, It list the memory tier that the node belongs to.
>>
>> When written, the kernel moves the node into the specified
>> memory tier, the tier assignment of all other nodes are not
>> affected.
>>
>> If the memory tier does not exist, writing to the above file
>> create the tier and assign the NUMA node to that tier.
> creates
> 
> There was some discussion in v2 of Wei Xu's RFC that what matter
> for creation is the rank, not the tier number.
> 
> My suggestion is move to an explicit creation file such as
> memtier/create_tier_from_rank
> to which writing the rank gives results in a new tier
> with the next device ID and requested rank.

I think the below workflow is much simpler.

:/sys/devices/system# cat memtier/memtier1/nodelist
1-3
:/sys/devices/system# cat node/node1/memtier
1
:/sys/devices/system# ls memtier/memtier*
nodelist  power  rank  subsystem  uevent
/sys/devices/system# ls memtier/
default_rank  max_tier  memtier1  power  uevent
:/sys/devices/system# echo 2 > node/node1/memtier
:/sys/devices/system#

:/sys/devices/system# ls memtier/
default_rank  max_tier  memtier1  memtier2  power  uevent
:/sys/devices/system# cat memtier/memtier1/nodelist
2-3
:/sys/devices/system# cat memtier/memtier2/nodelist
1
:/sys/devices/system#

i.e., to create a tier we just write the tier id/tier index to the
node/nodeN/memtier file. That will create a new memory tier if needed 
and add the node to that specific memory tier. Since for now we have 
a 1:1 mapping between tier index and rank value, we can derive the 
rank value from the memory tier index.

For dynamic memory tier support, we can assign a rank value such that 
new memory tiers are always created last in the 
demotion order.
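
As a sketch of that 1:1 derivation (illustrative only; the helper name
and the 300/200/100 rank values are the ones that show up later in the
thread):

/* Illustrative fixed mapping from tier index to rank. */
static inline int get_rank_from_tier(unsigned int tier)
{
	switch (tier) {
	case 0:
		return 300;	/* e.g. MEMORY_RANK_HBM_GPU */
	case 1:
		return 200;	/* e.g. MEMORY_RANK_DRAM */
	case 2:
		return 100;	/* e.g. MEMORY_RANK_PMEM */
	}
	return -1;
}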

-aneesh
Jonathan Cameron June 6, 2022, 2:59 p.m. UTC | #3
On Fri, 3 Jun 2022 14:10:47 +0530
Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> wrote:

> On 5/27/22 7:45 PM, Jonathan Cameron wrote:
> > On Fri, 27 May 2022 17:55:23 +0530
> > "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> wrote:
> >   
> >> From: Jagdish Gediya <jvgediya@linux.ibm.com>
> >>
> >> Add support to read/write the memory tierindex for a NUMA node.
> >>
> >> /sys/devices/system/node/nodeN/memtier
> >>
> >> where N = node id
> >>
> >> When read, It list the memory tier that the node belongs to.
> >>
> >> When written, the kernel moves the node into the specified
> >> memory tier, the tier assignment of all other nodes are not
> >> affected.
> >>
> >> If the memory tier does not exist, writing to the above file
> >> create the tier and assign the NUMA node to that tier.  
> > creates
> > 
> > There was some discussion in v2 of Wei Xu's RFC that what matter
> > for creation is the rank, not the tier number.
> > 
> > My suggestion is move to an explicit creation file such as
> > memtier/create_tier_from_rank
> > to which writing the rank gives results in a new tier
> > with the next device ID and requested rank.  
> 
> I think the below workflow is much simpler.
> 
> :/sys/devices/system# cat memtier/memtier1/nodelist
> 1-3
> :/sys/devices/system# cat node/node1/memtier
> 1
> :/sys/devices/system# ls memtier/memtier*
> nodelist  power  rank  subsystem  uevent
> /sys/devices/system# ls memtier/
> default_rank  max_tier  memtier1  power  uevent
> :/sys/devices/system# echo 2 > node/node1/memtier
> :/sys/devices/system#
> 
> :/sys/devices/system# ls memtier/
> default_rank  max_tier  memtier1  memtier2  power  uevent
> :/sys/devices/system# cat memtier/memtier1/nodelist
> 2-3
> :/sys/devices/system# cat memtier/memtier2/nodelist
> 1
> :/sys/devices/system#
> 
> ie, to create a tier we just write the tier id/tier index to 
> node/nodeN/memtier file. That will create a new memory tier if needed 
> and add the node to that specific memory tier. Since for now we are 
> having 1:1 mapping between tier index to rank value, we can derive the 
> rank value from the memory tier index.
> 
> For dynamic memory tier support, we can assign a rank value such that 
> new memory tiers are always created such that it comes last in the 
> demotion order.

I'm not keen on having to pass through an intermediate state where
the rank may well be wrong, but I guess it's not that harmful even
if it feels wrong ;)

Races are potentially a bit of a pain though depending on what we
expect the usage model to be.

There are patterns (CXL regions for example) of guaranteeing the
'right' device is created by doing something like 

cat create_tier > temp.txt 
#(temp gets 2 for example on first call then
# next read of this file gets 3 etc)

cat temp.txt > create_tier
# will fail if there hasn't been a read of the same value

Assuming all software keeps to the model, then there are no
race conditions over creation.  Otherwise we have two new
devices turn up very close to each other and userspace scripting
tries to create two new tiers - if it races they may end up in
the same tier when that wasn't the intent.  Then code to set
the rank also races and we get two potentially very different
memories in a tier with a randomly selected rank.

Fun and games...  And a fine illustration why sysfs based 'device'
creation is tricky to get right (and lots of cases in the kernel
don't).
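
A minimal sketch of such a reserve-then-commit attribute pair (entirely
hypothetical, not from this series; the id space is capped at
BITS_PER_LONG for brevity):

static DEFINE_MUTEX(create_lock);
static unsigned long reserved_ids;	/* ids handed out but not yet used */
static unsigned int next_id = 2;

static ssize_t create_tier_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	unsigned int id;

	mutex_lock(&create_lock);
	if (next_id >= BITS_PER_LONG) {
		mutex_unlock(&create_lock);
		return -ENOSPC;
	}
	id = next_id++;
	__set_bit(id, &reserved_ids);	/* reserve this id for the reader */
	mutex_unlock(&create_lock);

	return sysfs_emit(buf, "%u\n", id);
}

static ssize_t create_tier_store(struct device *dev,
				 struct device_attribute *attr,
				 const char *buf, size_t count)
{
	unsigned int id;
	int ret = kstrtouint(buf, 10, &id);

	if (ret)
		return ret;

	mutex_lock(&create_lock);
	/* Only accept an id that a prior read handed out and nobody used. */
	if (id >= BITS_PER_LONG || !__test_and_clear_bit(id, &reserved_ids)) {
		mutex_unlock(&create_lock);
		return -EBUSY;
	}
	/* ... register the new tier with this id here ... */
	mutex_unlock(&create_lock);

	return count;
}
static DEVICE_ATTR_RW(create_tier);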

Jonathan


> 
> -aneesh
> 
> 
> 
>
Aneesh Kumar K.V June 6, 2022, 4:01 p.m. UTC | #4
On 6/6/22 8:29 PM, Jonathan Cameron wrote:
> On Fri, 3 Jun 2022 14:10:47 +0530
> Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> wrote:
> 
>> On 5/27/22 7:45 PM, Jonathan Cameron wrote:
>>> On Fri, 27 May 2022 17:55:23 +0530
>>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> wrote:
>>>    
>>>> From: Jagdish Gediya <jvgediya@linux.ibm.com>
>>>>
>>>> Add support to read/write the memory tierindex for a NUMA node.
>>>>
>>>> /sys/devices/system/node/nodeN/memtier
>>>>
>>>> where N = node id
>>>>
>>>> When read, It list the memory tier that the node belongs to.
>>>>
>>>> When written, the kernel moves the node into the specified
>>>> memory tier, the tier assignment of all other nodes are not
>>>> affected.
>>>>
>>>> If the memory tier does not exist, writing to the above file
>>>> create the tier and assign the NUMA node to that tier.
>>> creates
>>>
>>> There was some discussion in v2 of Wei Xu's RFC that what matter
>>> for creation is the rank, not the tier number.
>>>
>>> My suggestion is move to an explicit creation file such as
>>> memtier/create_tier_from_rank
>>> to which writing the rank gives results in a new tier
>>> with the next device ID and requested rank.
>>
>> I think the below workflow is much simpler.
>>
>> :/sys/devices/system# cat memtier/memtier1/nodelist
>> 1-3
>> :/sys/devices/system# cat node/node1/memtier
>> 1
>> :/sys/devices/system# ls memtier/memtier*
>> nodelist  power  rank  subsystem  uevent
>> /sys/devices/system# ls memtier/
>> default_rank  max_tier  memtier1  power  uevent
>> :/sys/devices/system# echo 2 > node/node1/memtier
>> :/sys/devices/system#
>>
>> :/sys/devices/system# ls memtier/
>> default_rank  max_tier  memtier1  memtier2  power  uevent
>> :/sys/devices/system# cat memtier/memtier1/nodelist
>> 2-3
>> :/sys/devices/system# cat memtier/memtier2/nodelist
>> 1
>> :/sys/devices/system#
>>
>> ie, to create a tier we just write the tier id/tier index to
>> node/nodeN/memtier file. That will create a new memory tier if needed
>> and add the node to that specific memory tier. Since for now we are
>> having 1:1 mapping between tier index to rank value, we can derive the
>> rank value from the memory tier index.
>>
>> For dynamic memory tier support, we can assign a rank value such that
>> new memory tiers are always created such that it comes last in the
>> demotion order.
> 
> I'm not keen on having to pass through an intermediate state where
> the rank may well be wrong, but I guess it's not that harmful even
> if it feels wrong ;)
> 

Any new memory tier added can be of the lowest rank (rank 0) and hence 
will appear as the highest memory tier in the demotion order. The user 
can then assign the right rank value to the memory tier? Also, the 
actual demotion target paths are built during memory block onlining, 
which in most cases would happen after we properly verify that the 
device got assigned to the right memory tier with the correct rank value?

> Races are potentially a bit of a pain though depending on what we
> expect the usage model to be.
> 
> There are patterns (CXL regions for example) of guaranteeing the
> 'right' device is created by doing something like
> 
> cat create_tier > temp.txt
> #(temp gets 2 for example on first call then
> # next read of this file gets 3 etc)
> 
> cat temp.txt > create_tier
> # will fail if there hasn't been a read of the same value
> 
> Assuming all software keeps to the model, then there are no
> race conditions over creation.  Otherwise we have two new
> devices turn up very close to each other and userspace scripting
> tries to create two new tiers - if it races they may end up in
> the same tier when that wasn't the intent.  Then code to set
> the rank also races and we get two potentially very different
> memories in a tier with a randomly selected rank.
> 
> Fun and games...  And a fine illustration why sysfs based 'device'
> creation is tricky to get right (and lots of cases in the kernel
> don't).
> 

I would expect userspace to be careful and verify the memory tier and 
rank value before we online the memory blocks backed by the device. 
Even if we race, the result would be two devices not intended to be 
part of the same memory tier appearing in the same tier. But at that 
point we won't have built the demotion targets yet, so userspace could 
verify this and move the nodes out of the memory tier. Once it is 
verified, memory blocks can be onlined.
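
A hypothetical sequence of that verify-then-online flow (assuming a
writable rank attribute, which the text above implies, and the usual
memory block state files):

  # cat /sys/devices/system/node/node2/memtier
  3
  # cat /sys/devices/system/memtier/memtier3/rank
  0
  # echo 150 > /sys/devices/system/memtier/memtier3/rank
  # echo online > /sys/devices/system/memory/memory42/state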

Having said that, can you outline the usage of 
memtier/create_tier_from_rank?

-aneesh
Jonathan Cameron June 6, 2022, 4:16 p.m. UTC | #5
On Mon, 6 Jun 2022 21:31:16 +0530
Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> wrote:

> On 6/6/22 8:29 PM, Jonathan Cameron wrote:
> > On Fri, 3 Jun 2022 14:10:47 +0530
> > Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> wrote:
> >   
> >> On 5/27/22 7:45 PM, Jonathan Cameron wrote:  
> >>> On Fri, 27 May 2022 17:55:23 +0530
> >>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> wrote:
> >>>      
> >>>> From: Jagdish Gediya <jvgediya@linux.ibm.com>
> >>>>
> >>>> Add support to read/write the memory tierindex for a NUMA node.
> >>>>
> >>>> /sys/devices/system/node/nodeN/memtier
> >>>>
> >>>> where N = node id
> >>>>
> >>>> When read, It list the memory tier that the node belongs to.
> >>>>
> >>>> When written, the kernel moves the node into the specified
> >>>> memory tier, the tier assignment of all other nodes are not
> >>>> affected.
> >>>>
> >>>> If the memory tier does not exist, writing to the above file
> >>>> create the tier and assign the NUMA node to that tier.  
> >>> creates
> >>>
> >>> There was some discussion in v2 of Wei Xu's RFC that what matter
> >>> for creation is the rank, not the tier number.
> >>>
> >>> My suggestion is move to an explicit creation file such as
> >>> memtier/create_tier_from_rank
> >>> to which writing the rank gives results in a new tier
> >>> with the next device ID and requested rank.  
> >>
> >> I think the below workflow is much simpler.
> >>
> >> :/sys/devices/system# cat memtier/memtier1/nodelist
> >> 1-3
> >> :/sys/devices/system# cat node/node1/memtier
> >> 1
> >> :/sys/devices/system# ls memtier/memtier*
> >> nodelist  power  rank  subsystem  uevent
> >> /sys/devices/system# ls memtier/
> >> default_rank  max_tier  memtier1  power  uevent
> >> :/sys/devices/system# echo 2 > node/node1/memtier
> >> :/sys/devices/system#
> >>
> >> :/sys/devices/system# ls memtier/
> >> default_rank  max_tier  memtier1  memtier2  power  uevent
> >> :/sys/devices/system# cat memtier/memtier1/nodelist
> >> 2-3
> >> :/sys/devices/system# cat memtier/memtier2/nodelist
> >> 1
> >> :/sys/devices/system#
> >>
> >> ie, to create a tier we just write the tier id/tier index to
> >> node/nodeN/memtier file. That will create a new memory tier if needed
> >> and add the node to that specific memory tier. Since for now we are
> >> having 1:1 mapping between tier index to rank value, we can derive the
> >> rank value from the memory tier index.
> >>
> >> For dynamic memory tier support, we can assign a rank value such that
> >> new memory tiers are always created such that it comes last in the
> >> demotion order.  
> > 
> > I'm not keen on having to pass through an intermediate state where
> > the rank may well be wrong, but I guess it's not that harmful even
> > if it feels wrong ;)
> >   
> 
> Any new memory tier added can be of lowest rank (rank - 0) and hence 
> will appear as the highest memory tier in demotion order. 

Depends on driver interaction - if new memory is CXL attached or
GPU attached, chances are the driver has an input on which tier
it is put in by default.

> User can then
> assign the right rank value to the memory tier? Also the actual demotion 
> target paths are built during memory block online which in most case 
> would happen after we properly verify that the device got assigned to 
> the right memory tier with correct rank value?

Agreed, though that may change the model of how memory is brought online
somewhat.

> 
> > Races are potentially a bit of a pain though depending on what we
> > expect the usage model to be.
> > 
> > There are patterns (CXL regions for example) of guaranteeing the
> > 'right' device is created by doing something like
> > 
> > cat create_tier > temp.txt
> > #(temp gets 2 for example on first call then
> > # next read of this file gets 3 etc)
> > 
> > cat temp.txt > create_tier
> > # will fail if there hasn't been a read of the same value
> > 
> > Assuming all software keeps to the model, then there are no
> > race conditions over creation.  Otherwise we have two new
> > devices turn up very close to each other and userspace scripting
> > tries to create two new tiers - if it races they may end up in
> > the same tier when that wasn't the intent.  Then code to set
> > the rank also races and we get two potentially very different
> > memories in a tier with a randomly selected rank.
> > 
> > Fun and games...  And a fine illustration why sysfs based 'device'
> > creation is tricky to get right (and lots of cases in the kernel
> > don't).
> >   
> 
> I would expect userspace to be careful and verify the memory tier and 
> rank value before we online the memory blocks backed by the device. Even 
> if we race, the result would be two device not intended to be part of 
> the same memory tier appearing at the same tier. But then we won't be 
> building demotion targets yet. So userspace could verify this, move the 
> nodes out of the memory tier. Once it is verified, memory blocks can be 
> onlined.

The race is there and not avoidable as far as I can see. Two processes A and B.

A checks for a spare tier number
B checks for a spare tier number
A tries to assign node 3 to new tier 2 (new tier created)
B tries to assign node 4 to new tier 2 (accidentally hits existing tier - as this
is the same method we'd use to put it in the existing tier we can't tell this
write was meant to create a new tier).
A writes rank 100 to tier 2
A checks rank for tier 2 and finds it is 100 as expected.
B writes rank 200 to tier 2 (it could check it is still the default, but even that is racy)
B checks the rank for tier 2 and finds it is 200 as expected.
A onlines memory.
B onlines memory.

Both think they got what they wanted, but A definitely didn't.

One work around is the read / write approach and create_tier.

A reads create_tier - gets 2.
B reads create_tier - gets 3.
A writes 2 to create_tier as that's what it read.
B writes 3 to create_tier as that's what it read.

continue with created tiers.  Obviously can exhaust tiers, but if this is
root only, could just create lots anyway so no worse off.
 
> 
> Having said that can you outline the usage of 
> memtier/create_tier_from_rank ?

There are corner cases to deal with...

A writes 100 to create_tier_from_rank.
A goes looking for matching tier - finds it: tier2
B writes 200 to create_tier_from_rank
B goes looking for matching tier - finds it: tier3

rest is fine as operating on different tiers.

Trickier is
A writes 100 to create_tier_from_rank  - succeeds.
B writes 100 to create_tier_from_rank  - could fail, or could just eat it?

Logically this is the same as a separate create_tier followed by a write
of the rank, combined into one operation, but then you need to search
for the right one.  As such, perhaps a create_tier that does the
read/write pair as above is the best solution.

Jonathan


> 
> -aneesh
Aneesh Kumar K.V June 6, 2022, 4:39 p.m. UTC | #6
On 6/6/22 9:46 PM, Jonathan Cameron wrote:
> On Mon, 6 Jun 2022 21:31:16 +0530
> Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> wrote:
> 
>> On 6/6/22 8:29 PM, Jonathan Cameron wrote:
>>> On Fri, 3 Jun 2022 14:10:47 +0530
>>> Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> wrote:
>>>    
>>>> On 5/27/22 7:45 PM, Jonathan Cameron wrote:
>>>>> On Fri, 27 May 2022 17:55:23 +0530
>>>>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> wrote:
>>>>>       
>>>>>> From: Jagdish Gediya <jvgediya@linux.ibm.com>
>>>>>>
>>>>>> Add support to read/write the memory tierindex for a NUMA node.
>>>>>>
>>>>>> /sys/devices/system/node/nodeN/memtier
>>>>>>
>>>>>> where N = node id
>>>>>>
>>>>>> When read, It list the memory tier that the node belongs to.
>>>>>>
>>>>>> When written, the kernel moves the node into the specified
>>>>>> memory tier, the tier assignment of all other nodes are not
>>>>>> affected.
>>>>>>
>>>>>> If the memory tier does not exist, writing to the above file
>>>>>> create the tier and assign the NUMA node to that tier.
>>>>> creates
>>>>>
>>>>> There was some discussion in v2 of Wei Xu's RFC that what matter
>>>>> for creation is the rank, not the tier number.
>>>>>
>>>>> My suggestion is move to an explicit creation file such as
>>>>> memtier/create_tier_from_rank
>>>>> to which writing the rank gives results in a new tier
>>>>> with the next device ID and requested rank.
>>>>
>>>> I think the below workflow is much simpler.
>>>>
>>>> :/sys/devices/system# cat memtier/memtier1/nodelist
>>>> 1-3
>>>> :/sys/devices/system# cat node/node1/memtier
>>>> 1
>>>> :/sys/devices/system# ls memtier/memtier*
>>>> nodelist  power  rank  subsystem  uevent
>>>> /sys/devices/system# ls memtier/
>>>> default_rank  max_tier  memtier1  power  uevent
>>>> :/sys/devices/system# echo 2 > node/node1/memtier
>>>> :/sys/devices/system#
>>>>
>>>> :/sys/devices/system# ls memtier/
>>>> default_rank  max_tier  memtier1  memtier2  power  uevent
>>>> :/sys/devices/system# cat memtier/memtier1/nodelist
>>>> 2-3
>>>> :/sys/devices/system# cat memtier/memtier2/nodelist
>>>> 1
>>>> :/sys/devices/system#
>>>>
>>>> ie, to create a tier we just write the tier id/tier index to
>>>> node/nodeN/memtier file. That will create a new memory tier if needed
>>>> and add the node to that specific memory tier. Since for now we are
>>>> having 1:1 mapping between tier index to rank value, we can derive the
>>>> rank value from the memory tier index.
>>>>
>>>> For dynamic memory tier support, we can assign a rank value such that
>>>> new memory tiers are always created such that it comes last in the
>>>> demotion order.
>>>
>>> I'm not keen on having to pass through an intermediate state where
>>> the rank may well be wrong, but I guess it's not that harmful even
>>> if it feels wrong ;)
>>>    
>>
>> Any new memory tier added can be of lowest rank (rank - 0) and hence
>> will appear as the highest memory tier in demotion order.
> 
> Depends on driver interaction - if new memory is CXL attached or
> GPU attached, chances are the driver has an input on which tier
> it is put in by default.
> 
>> User can then
>> assign the right rank value to the memory tier? Also the actual demotion
>> target paths are built during memory block online which in most case
>> would happen after we properly verify that the device got assigned to
>> the right memory tier with correct rank value?
> 
> Agreed, though that may change the model of how memory is brought online
> somewhat.
> 
>>
>>> Races are potentially a bit of a pain though depending on what we
>>> expect the usage model to be.
>>>
>>> There are patterns (CXL regions for example) of guaranteeing the
>>> 'right' device is created by doing something like
>>>
>>> cat create_tier > temp.txt
>>> #(temp gets 2 for example on first call then
>>> # next read of this file gets 3 etc)
>>>
>>> cat temp.txt > create_tier
>>> # will fail if there hasn't been a read of the same value
>>>
>>> Assuming all software keeps to the model, then there are no
>>> race conditions over creation.  Otherwise we have two new
>>> devices turn up very close to each other and userspace scripting
>>> tries to create two new tiers - if it races they may end up in
>>> the same tier when that wasn't the intent.  Then code to set
>>> the rank also races and we get two potentially very different
>>> memories in a tier with a randomly selected rank.
>>>
>>> Fun and games...  And a fine illustration why sysfs based 'device'
>>> creation is tricky to get right (and lots of cases in the kernel
>>> don't).
>>>    
>>
>> I would expect userspace to be careful and verify the memory tier and
>> rank value before we online the memory blocks backed by the device. Even
>> if we race, the result would be two device not intended to be part of
>> the same memory tier appearing at the same tier. But then we won't be
>> building demotion targets yet. So userspace could verify this, move the
>> nodes out of the memory tier. Once it is verified, memory blocks can be
>> onlined.
> 
> The race is there and not avoidable as far as I can see. Two processes A and B.
> 
> A checks for a spare tier number
> B checks for a spare tier number
> A tries to assign node 3 to new tier 2 (new tier created)
> B tries to assign node 4 to new tier 2 (accidentally hits existing tier - as this
> is the same method we'd use to put it in the existing tier we can't tell this
> write was meant to create a new tier).
> A writes rank 100 to tier 2
> A checks rank for tier 2 and finds it is 100 as expected.
> B write rank 200 to tier 2 (it could check if still default but even that is racy)
> B checks rank for tier 2 rank and finds it is 200 as expected.
> A onlines memory.
> B onlines memory.
> 
> Both think they got what they wanted, but A definitely didn't.
> 
> One work around is the read / write approach and create_tier.
> 
> A reads create_tier - gets 2.
> B reads create_tier - gets 3.
> A writes 2 to create_tier as that's what it read.
> B writes 3 to create_tier as that's what it read.
> 
> continue with created tiers.  Obviously can exhaust tiers, but if this is
> root only, could just create lots anyway so no worse off.
>   
>>
>> Having said that can you outline the usage of
>> memtier/create_tier_from_rank ?
> 
> There are corner cases to deal with...
> 
> A writes 100 to create_tier_from_rank.
> A goes looking for matching tier - finds it: tier2
> B writes 200 to create_tier_from_rank
> B goes looking for matching tier - finds it: tier3
> 
> rest is fine as operating on different tiers.
> 
> Trickier is
> A writes 100 to create_tier_from_rank  - succeed.
> B writes 100 to create_tier_from_rank  - Could fail, or could just eat it?
> 
> Logically this is same as separate create_tier and then a write
> of rank, but in one operation, but then you need to search
> for the right one.  As such, perhaps a create_tier
> that does the read/write pair as above is the best solution.
> 

This is all good when we allow dynamic rank values. But currently we 
are restricting ourselves to three rank values, as below:

rank   memtier
300    memtier0
200    memtier1
100    memtier2

Now with the above, how do we define a write to create_tier_from_rank? 
What should the behavior be if the user writes a value other than the 
rank values defined above? Also, enforcing the above three rank values 
as the supported set implies teaching userspace about them. I am trying 
to see how to fit create_tier_from_rank in without requiring the above.

Can we look at implementing create_tier_from_rank when we start 
supporting dynamic tiers/rank values? I.e., we still allow 
node/nodeN/memtier, but with dynamic tiers a race-free way to get a 
new memory tier would be echo rank > memtier/create_tier_from_rank. 
We could also say memtier0/1/2 are kernel-defined memory tiers, and 
writing to memtier/create_tier_from_rank will create new memory tiers 
above memtier2 with the rank value specified?

-aneesh
Aneesh Kumar K.V June 6, 2022, 5:46 p.m. UTC | #7
Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> writes:

> On 6/6/22 9:46 PM, Jonathan Cameron wrote:
>> On Mon, 6 Jun 2022 21:31:16 +0530
>> Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> wrote:
>> 
>>> On 6/6/22 8:29 PM, Jonathan Cameron wrote:
>>>> On Fri, 3 Jun 2022 14:10:47 +0530
>>>> Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> wrote:
>>>>    
>>>>> On 5/27/22 7:45 PM, Jonathan Cameron wrote:
>>>>>> On Fri, 27 May 2022 17:55:23 +0530
>>>>>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> wrote:
>>>>>>       
>>>>>>> From: Jagdish Gediya <jvgediya@linux.ibm.com>
>>>>>>>
>>>>>>> Add support to read/write the memory tierindex for a NUMA node.
>>>>>>>
>>>>>>> /sys/devices/system/node/nodeN/memtier
>>>>>>>
>>>>>>> where N = node id
>>>>>>>
>>>>>>> When read, It list the memory tier that the node belongs to.
>>>>>>>
>>>>>>> When written, the kernel moves the node into the specified
>>>>>>> memory tier, the tier assignment of all other nodes are not
>>>>>>> affected.
>>>>>>>
>>>>>>> If the memory tier does not exist, writing to the above file
>>>>>>> create the tier and assign the NUMA node to that tier.
>>>>>> creates
>>>>>>
>>>>>> There was some discussion in v2 of Wei Xu's RFC that what matter
>>>>>> for creation is the rank, not the tier number.
>>>>>>
>>>>>> My suggestion is move to an explicit creation file such as
>>>>>> memtier/create_tier_from_rank
>>>>>> to which writing the rank gives results in a new tier
>>>>>> with the next device ID and requested rank.
>>>>>
>>>>> I think the below workflow is much simpler.
>>>>>
>>>>> :/sys/devices/system# cat memtier/memtier1/nodelist
>>>>> 1-3
>>>>> :/sys/devices/system# cat node/node1/memtier
>>>>> 1
>>>>> :/sys/devices/system# ls memtier/memtier*
>>>>> nodelist  power  rank  subsystem  uevent
>>>>> /sys/devices/system# ls memtier/
>>>>> default_rank  max_tier  memtier1  power  uevent
>>>>> :/sys/devices/system# echo 2 > node/node1/memtier
>>>>> :/sys/devices/system#
>>>>>
>>>>> :/sys/devices/system# ls memtier/
>>>>> default_rank  max_tier  memtier1  memtier2  power  uevent
>>>>> :/sys/devices/system# cat memtier/memtier1/nodelist
>>>>> 2-3
>>>>> :/sys/devices/system# cat memtier/memtier2/nodelist
>>>>> 1
>>>>> :/sys/devices/system#
>>>>>
>>>>> ie, to create a tier we just write the tier id/tier index to
>>>>> node/nodeN/memtier file. That will create a new memory tier if needed
>>>>> and add the node to that specific memory tier. Since for now we are
>>>>> having 1:1 mapping between tier index to rank value, we can derive the
>>>>> rank value from the memory tier index.
>>>>>
>>>>> For dynamic memory tier support, we can assign a rank value such that
>>>>> new memory tiers are always created such that it comes last in the
>>>>> demotion order.
>>>>
>>>> I'm not keen on having to pass through an intermediate state where
>>>> the rank may well be wrong, but I guess it's not that harmful even
>>>> if it feels wrong ;)
>>>>    
>>>
>>> Any new memory tier added can be of lowest rank (rank - 0) and hence
>>> will appear as the highest memory tier in demotion order.
>> 
>> Depends on driver interaction - if new memory is CXL attached or
>> GPU attached, chances are the driver has an input on which tier
>> it is put in by default.
>> 
>>> User can then
>>> assign the right rank value to the memory tier? Also the actual demotion
>>> target paths are built during memory block online which in most case
>>> would happen after we properly verify that the device got assigned to
>>> the right memory tier with correct rank value?
>> 
>> Agreed, though that may change the model of how memory is brought online
>> somewhat.
>> 
>>>
>>>> Races are potentially a bit of a pain though depending on what we
>>>> expect the usage model to be.
>>>>
>>>> There are patterns (CXL regions for example) of guaranteeing the
>>>> 'right' device is created by doing something like
>>>>
>>>> cat create_tier > temp.txt
>>>> #(temp gets 2 for example on first call then
>>>> # next read of this file gets 3 etc)
>>>>
>>>> cat temp.txt > create_tier
>>>> # will fail if there hasn't been a read of the same value
>>>>
>>>> Assuming all software keeps to the model, then there are no
>>>> race conditions over creation.  Otherwise we have two new
>>>> devices turn up very close to each other and userspace scripting
>>>> tries to create two new tiers - if it races they may end up in
>>>> the same tier when that wasn't the intent.  Then code to set
>>>> the rank also races and we get two potentially very different
>>>> memories in a tier with a randomly selected rank.
>>>>
>>>> Fun and games...  And a fine illustration why sysfs based 'device'
>>>> creation is tricky to get right (and lots of cases in the kernel
>>>> don't).
>>>>    
>>>
>>> I would expect userspace to be careful and verify the memory tier and
>>> rank value before we online the memory blocks backed by the device. Even
>>> if we race, the result would be two device not intended to be part of
>>> the same memory tier appearing at the same tier. But then we won't be
>>> building demotion targets yet. So userspace could verify this, move the
>>> nodes out of the memory tier. Once it is verified, memory blocks can be
>>> onlined.
>> 
>> The race is there and not avoidable as far as I can see. Two processes A and B.
>> 
>> A checks for a spare tier number
>> B checks for a spare tier number
>> A tries to assign node 3 to new tier 2 (new tier created)
>> B tries to assign node 4 to new tier 2 (accidentally hits existing tier - as this
>> is the same method we'd use to put it in the existing tier we can't tell this
>> write was meant to create a new tier).
>> A writes rank 100 to tier 2
>> A checks rank for tier 2 and finds it is 100 as expected.
>> B write rank 200 to tier 2 (it could check if still default but even that is racy)
>> B checks rank for tier 2 rank and finds it is 200 as expected.
>> A onlines memory.
>> B onlines memory.
>> 
>> Both think they got what they wanted, but A definitely didn't.
>> 
>> One work around is the read / write approach and create_tier.
>> 
>> A reads create_tier - gets 2.
>> B reads create_tier - gets 3.
>> A writes 2 to create_tier as that's what it read.
>> B writes 3 to create_tier as that's what it read.
>> 
>> continue with created tiers.  Obviously can exhaust tiers, but if this is
>> root only, could just create lots anyway so no worse off.
>>   
>>>
>>> Having said that can you outline the usage of
>>> memtier/create_tier_from_rank ?
>> 
>> There are corner cases to deal with...
>> 
>> A writes 100 to create_tier_from_rank.
>> A goes looking for matching tier - finds it: tier2
>> B writes 200 to create_tier_from_rank
>> B goes looking for matching tier - finds it: tier3
>> 
>> rest is fine as operating on different tiers.
>> 
>> Trickier is
>> A writes 100 to create_tier_from_rank  - succeed.
>> B writes 100 to create_tier_from_rank  - Could fail, or could just eat it?
>> 
>> Logically this is same as separate create_tier and then a write
>> of rank, but in one operation, but then you need to search
>> for the right one.  As such, perhaps a create_tier
>> that does the read/write pair as above is the best solution.
>> 
>
> This all is good when we allow dynamic rank values. But currently we are 
> restricting ourselves to three rank value as below:
>
> rank   memtier
> 300    memtier0
> 200    memtier1
> 100    memtier2
>
> Now with the above, how do we define a write to create_tier_from_rank. 
> What should be the behavior if user write value other than above defined 
> rank values? Also enforcing the above three rank values as supported 
> implies teaching userspace about them. I am trying to see how to fit
> create_tier_from_rank without requiring the above.
>
> Can we look at implementing create_tier_from_rank when we start 
> supporting dynamic tiers/rank values? ie,
>
> we still allow node/nodeN/memtier. But with dynamic tiers a race free
> way to get a new memory tier would be echo rank > 
> memtier/create_tier_from_rank. We could also say, memtier0/1/2 are 
> kernel defined memory tiers. Writing to memtier/create_tier_from_rank 
> will create new memory tiers above memtier2 with the rank value specified?
>

To keep it compatible we could do this, i.e., we just allow creation of
one additional memory tier (memtier3) via the above interface.


:/sys/devices/system/memtier# ls -al
total 0
drwxr-xr-x  4 root root    0 Jun  6 17:39 .
drwxr-xr-x 10 root root    0 Jun  6 17:39 ..
--w-------  1 root root 4096 Jun  6 17:40 create_tier_from_rank
-r--r--r--  1 root root 4096 Jun  6 17:40 default_tier
-r--r--r--  1 root root 4096 Jun  6 17:40 max_tier
drwxr-xr-x  3 root root    0 Jun  6 17:39 memtier1
drwxr-xr-x  2 root root    0 Jun  6 17:40 power
-rw-r--r--  1 root root 4096 Jun  6 17:39 uevent
:/sys/devices/system/memtier# echo 20 > create_tier_from_rank 
:/sys/devices/system/memtier# ls
create_tier_from_rank  default_tier  max_tier  memtier1  memtier3  power  uevent
:/sys/devices/system/memtier# cat memtier3/rank 
20
:/sys/devices/system/memtier# echo 20 > create_tier_from_rank 
bash: echo: write error: No space left on device
:/sys/devices/system/memtier# 

Is this good? 

diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 0468af60d427..a4150120ba24 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -13,7 +13,7 @@
 #define MEMORY_RANK_PMEM	100
 
 #define DEFAULT_MEMORY_TIER	MEMORY_TIER_DRAM
-#define MAX_MEMORY_TIERS  3
+#define MAX_MEMORY_TIERS  4
 
 extern bool numa_demotion_enabled;
 extern nodemask_t promotion_mask;
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index c6eb223a219f..7fdee0c4c4ea 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -169,7 +169,8 @@ static void insert_memory_tier(struct memory_tier *memtier)
 	list_add_tail(&memtier->list, &memory_tiers);
 }
 
-static struct memory_tier *register_memory_tier(unsigned int tier)
+static struct memory_tier *register_memory_tier(unsigned int tier,
+						unsigned int rank)
 {
 	int error;
 	struct memory_tier *memtier;
@@ -182,7 +183,7 @@ static struct memory_tier *register_memory_tier(unsigned int tier)
 		return NULL;
 
 	memtier->dev.id = tier;
-	memtier->rank = get_rank_from_tier(tier);
+	memtier->rank = rank;
 	memtier->dev.bus = &memory_tier_subsys;
 	memtier->dev.release = memory_tier_device_release;
 	memtier->dev.groups = memory_tier_dev_groups;
@@ -218,9 +219,53 @@ default_tier_show(struct device *dev, struct device_attribute *attr, char *buf)
 }
 static DEVICE_ATTR_RO(default_tier);
 
+
+static struct memory_tier *__get_memory_tier_from_id(int id);
+static ssize_t create_tier_from_rank_store(struct device *dev,
+					   struct device_attribute *attr,
+					   const char *buf, size_t count)
+{
+	int ret;
+	unsigned int rank;
+	struct memory_tier *memtier;
+
+	ret = kstrtouint(buf, 10, &rank);
+	if (ret)
+		return ret;
+
+	if (rank == MEMORY_RANK_HBM_GPU ||
+	    rank == MEMORY_RANK_DRAM ||
+	    rank == MEMORY_RANK_PMEM)
+		return -EINVAL;
+
+	mutex_lock(&memory_tier_lock);
+	/*
+	 * For now we only support creation of one additional tier via
+	 * this interface.
+	 */
+	memtier = __get_memory_tier_from_id(3);
+	if (!memtier) {
+		memtier = register_memory_tier(3, rank);
+		if (!memtier) {
+			ret = -EINVAL;
+			goto out;
+		}
+	} else {
+		ret = -ENOSPC;
+		goto out;
+	}
+
+	ret = count;
+out:
+	mutex_unlock(&memory_tier_lock);
+	return ret;
+}
+static DEVICE_ATTR_WO(create_tier_from_rank);
+
+
 static struct attribute *memory_tier_attrs[] = {
 	&dev_attr_max_tier.attr,
 	&dev_attr_default_tier.attr,
+	&dev_attr_create_tier_from_rank.attr,
 	NULL
 };
 
@@ -302,7 +347,7 @@ static int __node_set_memory_tier(int node, int tier)
 
 	memtier = __get_memory_tier_from_id(tier);
 	if (!memtier) {
-		memtier = register_memory_tier(tier);
+		memtier = register_memory_tier(tier, get_rank_from_tier(tier));
 		if (!memtier) {
 			ret = -EINVAL;
 			goto out;
@@ -651,7 +696,8 @@ static int __init memory_tier_init(void)
 	 * Register only default memory tier to hide all empty
 	 * memory tier from sysfs.
 	 */
-	memtier = register_memory_tier(DEFAULT_MEMORY_TIER);
+	memtier = register_memory_tier(DEFAULT_MEMORY_TIER,
+				       get_rank_from_tier(DEFAULT_MEMORY_TIER));
 	if (!memtier)
 		panic("%s() failed to register memory tier: %d\n", __func__, ret);
Jonathan Cameron June 7, 2022, 2:32 p.m. UTC | #8
On Mon, 06 Jun 2022 23:16:15 +0530
"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> wrote:

> Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> writes:
> 
> > On 6/6/22 9:46 PM, Jonathan Cameron wrote:  
> >> On Mon, 6 Jun 2022 21:31:16 +0530
> >> Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> wrote:
> >>   
> >>> On 6/6/22 8:29 PM, Jonathan Cameron wrote:  
> >>>> On Fri, 3 Jun 2022 14:10:47 +0530
> >>>> Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> wrote:
> >>>>      
> >>>>> On 5/27/22 7:45 PM, Jonathan Cameron wrote:  
> >>>>>> On Fri, 27 May 2022 17:55:23 +0530
> >>>>>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> wrote:
> >>>>>>         
> >>>>>>> From: Jagdish Gediya <jvgediya@linux.ibm.com>
> >>>>>>>
> >>>>>>> Add support to read/write the memory tierindex for a NUMA node.
> >>>>>>>
> >>>>>>> /sys/devices/system/node/nodeN/memtier
> >>>>>>>
> >>>>>>> where N = node id
> >>>>>>>
> >>>>>>> When read, It list the memory tier that the node belongs to.
> >>>>>>>
> >>>>>>> When written, the kernel moves the node into the specified
> >>>>>>> memory tier, the tier assignment of all other nodes are not
> >>>>>>> affected.
> >>>>>>>
> >>>>>>> If the memory tier does not exist, writing to the above file
> >>>>>>> create the tier and assign the NUMA node to that tier.  
> >>>>>> creates
> >>>>>>
> >>>>>> There was some discussion in v2 of Wei Xu's RFC that what matter
> >>>>>> for creation is the rank, not the tier number.
> >>>>>>
> >>>>>> My suggestion is move to an explicit creation file such as
> >>>>>> memtier/create_tier_from_rank
> >>>>>> to which writing the rank gives results in a new tier
> >>>>>> with the next device ID and requested rank.  
> >>>>>
> >>>>> I think the below workflow is much simpler.
> >>>>>
> >>>>> :/sys/devices/system# cat memtier/memtier1/nodelist
> >>>>> 1-3
> >>>>> :/sys/devices/system# cat node/node1/memtier
> >>>>> 1
> >>>>> :/sys/devices/system# ls memtier/memtier*
> >>>>> nodelist  power  rank  subsystem  uevent
> >>>>> /sys/devices/system# ls memtier/
> >>>>> default_rank  max_tier  memtier1  power  uevent
> >>>>> :/sys/devices/system# echo 2 > node/node1/memtier
> >>>>> :/sys/devices/system#
> >>>>>
> >>>>> :/sys/devices/system# ls memtier/
> >>>>> default_rank  max_tier  memtier1  memtier2  power  uevent
> >>>>> :/sys/devices/system# cat memtier/memtier1/nodelist
> >>>>> 2-3
> >>>>> :/sys/devices/system# cat memtier/memtier2/nodelist
> >>>>> 1
> >>>>> :/sys/devices/system#
> >>>>>
> >>>>> ie, to create a tier we just write the tier id/tier index to
> >>>>> node/nodeN/memtier file. That will create a new memory tier if needed
> >>>>> and add the node to that specific memory tier. Since for now we are
> >>>>> having 1:1 mapping between tier index to rank value, we can derive the
> >>>>> rank value from the memory tier index.
> >>>>>
> >>>>> For dynamic memory tier support, we can assign a rank value such that
> >>>>> new memory tiers are always created such that it comes last in the
> >>>>> demotion order.  
> >>>>
> >>>> I'm not keen on having to pass through an intermediate state where
> >>>> the rank may well be wrong, but I guess it's not that harmful even
> >>>> if it feels wrong ;)
> >>>>      
> >>>
> >>> Any new memory tier added can be of lowest rank (rank - 0) and hence
> >>> will appear as the highest memory tier in demotion order.  
> >> 
> >> Depends on driver interaction - if new memory is CXL attached or
> >> GPU attached, chances are the driver has an input on which tier
> >> it is put in by default.
> >>   
> >>> User can then
> >>> assign the right rank value to the memory tier? Also the actual demotion
> >>> target paths are built during memory block online which in most case
> >>> would happen after we properly verify that the device got assigned to
> >>> the right memory tier with correct rank value?  
> >> 
> >> Agreed, though that may change the model of how memory is brought online
> >> somewhat.
> >>   
> >>>  
> >>>> Races are potentially a bit of a pain though depending on what we
> >>>> expect the usage model to be.
> >>>>
> >>>> There are patterns (CXL regions for example) of guaranteeing the
> >>>> 'right' device is created by doing something like
> >>>>
> >>>> cat create_tier > temp.txt
> >>>> #(temp gets 2 for example on first call then
> >>>> # next read of this file gets 3 etc)
> >>>>
> >>>> cat temp.txt > create_tier
> >>>> # will fail if there hasn't been a read of the same value
> >>>>
> >>>> Assuming all software keeps to the model, then there are no
> >>>> race conditions over creation.  Otherwise we have two new
> >>>> devices turn up very close to each other and userspace scripting
> >>>> tries to create two new tiers - if it races they may end up in
> >>>> the same tier when that wasn't the intent.  Then code to set
> >>>> the rank also races and we get two potentially very different
> >>>> memories in a tier with a randomly selected rank.
> >>>>
> >>>> Fun and games...  And a fine illustration why sysfs based 'device'
> >>>> creation is tricky to get right (and lots of cases in the kernel
> >>>> don't).
> >>>>      
> >>>
> >>> I would expect userspace to be careful and verify the memory tier and
> >>> rank value before we online the memory blocks backed by the device. Even
> >>> if we race, the result would be two device not intended to be part of
> >>> the same memory tier appearing at the same tier. But then we won't be
> >>> building demotion targets yet. So userspace could verify this, move the
> >>> nodes out of the memory tier. Once it is verified, memory blocks can be
> >>> onlined.  
> >> 
> >> The race is there and not avoidable as far as I can see. Two processes A and B.
> >> 
> >> A checks for a spare tier number
> >> B checks for a spare tier number
> >> A tries to assign node 3 to new tier 2 (new tier created)
> >> B tries to assign node 4 to new tier 2 (accidentally hits existing tier - as this
> >> is the same method we'd use to put it in the existing tier we can't tell this
> >> write was meant to create a new tier).
> >> A writes rank 100 to tier 2
> >> A checks rank for tier 2 and finds it is 100 as expected.
> >> B write rank 200 to tier 2 (it could check if still default but even that is racy)
> >> B checks rank for tier 2 rank and finds it is 200 as expected.
> >> A onlines memory.
> >> B onlines memory.
> >> 
> >> Both think they got what they wanted, but A definitely didn't.
> >> 
> >> One work around is the read / write approach and create_tier.
> >> 
> >> A reads create_tier - gets 2.
> >> B reads create_tier - gets 3.
> >> A writes 2 to create_tier as that's what it read.
> >> B writes 3 to create_tier as that's what it read.
> >> 
> >> continue with created tiers.  Obviously can exhaust tiers, but if this is
> >> root only, could just create lots anyway so no worse off.
> >>     
> >>>
> >>> Having said that can you outline the usage of
> >>> memtier/create_tier_from_rank ?  
> >> 
> >> There are corner cases to deal with...
> >> 
> >> A writes 100 to create_tier_from_rank.
> >> A goes looking for matching tier - finds it: tier2
> >> B writes 200 to create_tier_from_rank
> >> B goes looking for matching tier - finds it: tier3
> >> 
> >> rest is fine as operating on different tiers.
> >> 
> >> Trickier is
> >> A writes 100 to create_tier_from_rank  - succeed.
> >> B writes 100 to create_tier_from_rank  - Could fail, or could just eat it?
> >> 
> >> Logically this is same as separate create_tier and then a write
> >> of rank, but in one operation, but then you need to search
> >> for the right one.  As such, perhaps a create_tier
> >> that does the read/write pair as above is the best solution.
> >>   
> >
> > This all is good when we allow dynamic rank values. But currently we are 
> > restricting ourselves to three rank value as below:
> >
> > rank   memtier
> > 300    memtier0
> > 200    memtier1
> > 100    memtier2
> >
> > Now with the above, how do we define a write to create_tier_from_rank. 
> > What should be the behavior if user write value other than above defined 
> > rank values? Also enforcing the above three rank values as supported 
> > implies teaching userspace about them. I am trying to see how to fit
> > create_tier_from_rank without requiring the above.
> >
> > Can we look at implementing create_tier_from_rank when we start 
> > supporting dynamic tiers/rank values? ie,
> >
> > we still allow node/nodeN/memtier. But with dynamic tiers a race free
> > way to get a new memory tier would be echo rank > 
> > memtier/create_tier_from_rank. We could also say, memtier0/1/2 are 
> > kernel defined memory tiers. Writing to memtier/create_tier_from_rank 
> > will create new memory tiers above memtier2 with the rank value specified?
> >  
> 
> To keep it compatible we could do this. ie, we just allow creation of
> one additional memory tier (memtier3) via the above interface.

Two options: either have no dynamic tier creation for now (which I'm
fine with), or define an interface that will work long term - which
means allowing lots of dynamic tiers from day one (where "lots" is at
least 2, to prove the interface works if the limit is scaled up in
future).

A halfway house where we potentially need to change the interface later
is not a good stopgap.

That means we should either deal with the race conditions today, or
declare that we don't care about them (which might be a valid statement, but
userspace needs to be aware of that - ultimately it means userspace must
ensure mediation via some userspace software component).

Jonathan

> 
> 
> :/sys/devices/system/memtier# ls -al
> total 0
> drwxr-xr-x  4 root root    0 Jun  6 17:39 .
> drwxr-xr-x 10 root root    0 Jun  6 17:39 ..
> --w-------  1 root root 4096 Jun  6 17:40 create_tier_from_rank
> -r--r--r--  1 root root 4096 Jun  6 17:40 default_tier
> -r--r--r--  1 root root 4096 Jun  6 17:40 max_tier
> drwxr-xr-x  3 root root    0 Jun  6 17:39 memtier1
> drwxr-xr-x  2 root root    0 Jun  6 17:40 power
> -rw-r--r--  1 root root 4096 Jun  6 17:39 uevent
> :/sys/devices/system/memtier# echo 20 > create_tier_from_rank 
> :/sys/devices/system/memtier# ls
> create_tier_from_rank  default_tier  max_tier  memtier1  memtier3  power  uevent
> :/sys/devices/system/memtier# cat memtier3/rank 
> 20
> :/sys/devices/system/memtier# echo 20 > create_tier_from_rank 
> bash: echo: write error: No space left on device
> :/sys/devices/system/memtier# 
> 
> is this good? 
> 
> diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
> index 0468af60d427..a4150120ba24 100644
> --- a/include/linux/memory-tiers.h
> +++ b/include/linux/memory-tiers.h
> @@ -13,7 +13,7 @@
>  #define MEMORY_RANK_PMEM	100
>  
>  #define DEFAULT_MEMORY_TIER	MEMORY_TIER_DRAM
> -#define MAX_MEMORY_TIERS  3
> +#define MAX_MEMORY_TIERS  4
>  
>  extern bool numa_demotion_enabled;
>  extern nodemask_t promotion_mask;
> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
> index c6eb223a219f..7fdee0c4c4ea 100644
> --- a/mm/memory-tiers.c
> +++ b/mm/memory-tiers.c
> @@ -169,7 +169,8 @@ static void insert_memory_tier(struct memory_tier *memtier)
>  	list_add_tail(&memtier->list, &memory_tiers);
>  }
>  
> -static struct memory_tier *register_memory_tier(unsigned int tier)
> +static struct memory_tier *register_memory_tier(unsigned int tier,
> +						unsigned int rank)
>  {
>  	int error;
>  	struct memory_tier *memtier;
> @@ -182,7 +183,7 @@ static struct memory_tier *register_memory_tier(unsigned int tier)
>  		return NULL;
>  
>  	memtier->dev.id = tier;
> -	memtier->rank = get_rank_from_tier(tier);
> +	memtier->rank = rank;
>  	memtier->dev.bus = &memory_tier_subsys;
>  	memtier->dev.release = memory_tier_device_release;
>  	memtier->dev.groups = memory_tier_dev_groups;
> @@ -218,9 +219,53 @@ default_tier_show(struct device *dev, struct device_attribute *attr, char *buf)
>  }
>  static DEVICE_ATTR_RO(default_tier);
>  
> +
> +static struct memory_tier *__get_memory_tier_from_id(int id);
> +static ssize_t create_tier_from_rank_store(struct device *dev,
> +					   struct device_attribute *attr,
> +					   const char *buf, size_t count)
> +{
> +	int ret;
> +	unsigned int rank;
> +	struct memory_tier *memtier;
> +
> +	ret = kstrtouint(buf, 10, &rank);
> +	if (ret)
> +		return ret;
> +
> +	if (rank == MEMORY_RANK_HBM_GPU ||
> +	    rank == MEMORY_RANK_DRAM ||
> +	    rank == MEMORY_RANK_PMEM)
> +		return -EINVAL;
> +
> +	mutex_lock(&memory_tier_lock);
> +	/*
> +	 * For now we only support creation of one additional tier via
> +	 * this interface.
> +	 */
> +	memtier = __get_memory_tier_from_id(3);
> +	if (!memtier) {
> +		memtier = register_memory_tier(3, rank);
> +		if (!memtier) {
> +			ret = -EINVAL;
> +			goto out;
> +		}
> +	} else {
> +		ret = -ENOSPC;
> +		goto out;
> +	}
> +
> +	ret = count;
> +out:
> +	mutex_unlock(&memory_tier_lock);
> +	return ret;
> +}
> +static DEVICE_ATTR_WO(create_tier_from_rank);
> +
> +
>  static struct attribute *memory_tier_attrs[] = {
>  	&dev_attr_max_tier.attr,
>  	&dev_attr_default_tier.attr,
> +	&dev_attr_create_tier_from_rank.attr,
>  	NULL
>  };
>  
> @@ -302,7 +347,7 @@ static int __node_set_memory_tier(int node, int tier)
>  
>  	memtier = __get_memory_tier_from_id(tier);
>  	if (!memtier) {
> -		memtier = register_memory_tier(tier);
> +		memtier = register_memory_tier(tier, get_rank_from_tier(tier));
>  		if (!memtier) {
>  			ret = -EINVAL;
>  			goto out;
> @@ -651,7 +696,8 @@ static int __init memory_tier_init(void)
>  	 * Register only default memory tier to hide all empty
>  	 * memory tier from sysfs.
>  	 */
> -	memtier = register_memory_tier(DEFAULT_MEMORY_TIER);
> +	memtier = register_memory_tier(DEFAULT_MEMORY_TIER,
> +				       get_rank_from_tier(DEFAULT_MEMORY_TIER));
>  	if (!memtier)
>  		panic("%s() failed to register memory tier: %d\n", __func__, ret);
>  
>
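For illustration (a hypothetical transcript, not from the thread): with
the reserved-rank check in the snippet above, writing one of the three
kernel-defined rank values (300, 200, or 100) would be rejected with
EINVAL:

:/sys/devices/system/memtier# echo 200 > create_tier_from_rank
bash: echo: write error: Invalid argument
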
Huang, Ying June 8, 2022, 7:18 a.m. UTC | #9
On Fri, 2022-05-27 at 17:55 +0530, Aneesh Kumar K.V wrote:
> From: Jagdish Gediya <jvgediya@linux.ibm.com>
> 
> Add support to read/write the memory tier index for a NUMA node.
> 
> /sys/devices/system/node/nodeN/memtier
> 
> where N = node id
> 
> When read, it lists the memory tier that the node belongs to.
> 
> When written, the kernel moves the node into the specified
> memory tier; the tier assignment of all other nodes is not
> affected.
> 
> If the memory tier does not exist, writing to the above file
> creates the tier and assigns the NUMA node to that tier.
> 
> mutex memory_tier_lock is introduced to protect memory tier
> related changes, as they can happen from sysfs as well as on
> hotplug events.
> 
> Signed-off-by: Jagdish Gediya <jvgediya@linux.ibm.com>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  drivers/base/node.c     |  35 ++++++++++++++
>  include/linux/migrate.h |   4 +-
>  mm/migrate.c            | 103 ++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 141 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/base/node.c b/drivers/base/node.c
> index ec8bb24a5a22..cf4a58446d8c 100644
> --- a/drivers/base/node.c
> +++ b/drivers/base/node.c
> @@ -20,6 +20,7 @@
>  #include <linux/pm_runtime.h>
>  #include <linux/swap.h>
>  #include <linux/slab.h>
> +#include <linux/migrate.h>
>  
>  static struct bus_type node_subsys = {
>  	.name = "node",
> @@ -560,11 +561,45 @@ static ssize_t node_read_distance(struct device *dev,
>  }
>  static DEVICE_ATTR(distance, 0444, node_read_distance, NULL);
>  
> +#ifdef CONFIG_TIERED_MEMORY
> +static ssize_t memtier_show(struct device *dev,
> +			    struct device_attribute *attr,
> +			    char *buf)
> +{
> +	int node = dev->id;
> +
> +	return sysfs_emit(buf, "%d\n", node_get_memory_tier(node));
> +}
> +
> +static ssize_t memtier_store(struct device *dev,
> +			     struct device_attribute *attr,
> +			     const char *buf, size_t count)
> +{
> +	unsigned long tier;
> +	int node = dev->id;
> +
> +	int ret = kstrtoul(buf, 10, &tier);
> +	if (ret)
> +		return ret;
> +
> +	ret = node_reset_memory_tier(node, tier);
> +	if (ret)
> +		return ret;
> +
> +	return count;
> +}
> +
> +static DEVICE_ATTR_RW(memtier);
> +#endif
> +
>  static struct attribute *node_dev_attrs[] = {
>  	&dev_attr_meminfo.attr,
>  	&dev_attr_numastat.attr,
>  	&dev_attr_distance.attr,
>  	&dev_attr_vmstat.attr,
> +#ifdef CONFIG_TIERED_MEMORY
> +	&dev_attr_memtier.attr,
> +#endif
>  	NULL
>  };
>  
> 
> 
> 
> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> index 0ec653623565..d37d1d5dee82 100644
> --- a/include/linux/migrate.h
> +++ b/include/linux/migrate.h
> @@ -177,13 +177,15 @@ enum memory_tier_type {
>  };
>  
>  int next_demotion_node(int node);
> -
>  extern void migrate_on_reclaim_init(void);
>  #ifdef CONFIG_HOTPLUG_CPU
>  extern void set_migration_target_nodes(void);
>  #else
>  static inline void set_migration_target_nodes(void) {}
>  #endif
> +int node_get_memory_tier(int node);
> +int node_set_memory_tier(int node, int tier);
> +int node_reset_memory_tier(int node, int tier);
>  #else
>  #define numa_demotion_enabled	false
>  static inline int next_demotion_node(int node)
> diff --git a/mm/migrate.c b/mm/migrate.c
> index f28ee93fb017..304559ba3372 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -2132,6 +2132,7 @@ static struct bus_type memory_tier_subsys = {
>  	.dev_name = "memtier",
>  };
>  
> +DEFINE_MUTEX(memory_tier_lock);
>  static struct memory_tier *memory_tiers[MAX_MEMORY_TIERS];
>  
>  static ssize_t nodelist_show(struct device *dev,
> @@ -2225,6 +2226,108 @@ static const struct attribute_group *memory_tier_attr_groups[] = {
>  	NULL,
>  };
>  
> +static int __node_get_memory_tier(int node)
> +{
> +	int tier;
> +
> +	for (tier = 0; tier < MAX_MEMORY_TIERS; tier++) {
> +		if (memory_tiers[tier] && node_isset(node, memory_tiers[tier]->nodelist))
> +			return tier;
> +	}
> +
> +	return -1;
> +}
> +
> +int node_get_memory_tier(int node)
> +{
> +	int tier;
> +
> +	/*
> +	 * Make sure memory tier is not unregistered
> +	 * while it is being read.
> +	 */
> +	mutex_lock(&memory_tier_lock);
> +
> +	tier = __node_get_memory_tier(node);
> +
> +	mutex_unlock(&memory_tier_lock);
> +
> +	return tier;
> +}
> +
> +int __node_set_memory_tier(int node, int tier)
> +{
> +	int ret = 0;
> +	/*
> +	 * As register_memory_tier() for new tier can fail,
> +	 * try it before modifying existing tier. register
> +	 * tier makes tier visible in sysfs.
> +	 */
> +	if (!memory_tiers[tier]) {
> +		ret = register_memory_tier(tier);
> +		if (ret) {
> +			goto out;
> +		}
> +	}
> +
> +	node_set(node, memory_tiers[tier]->nodelist);
> +
> +out:
> +	return ret;
> +}
> +
> +int node_reset_memory_tier(int node, int tier)

I think "reset" isn't a good name here.  Maybe something like "change"
or "move"?

Best Regards,
Huang, Ying

> +{
> +	int current_tier, ret = 0;
> +
> +	if (tier < 0 || tier >= MAX_MEMORY_TIERS)
> +		return -EINVAL;
> +
> +	mutex_lock(&memory_tier_lock);
> +
> +	current_tier = __node_get_memory_tier(node);
> +	if (current_tier == tier)
> +		goto out;
> +
> +	if (current_tier != -1)
> +		node_clear(node, memory_tiers[current_tier]->nodelist);
> +
> +	ret = __node_set_memory_tier(node, tier);
> +
> +	if (!ret) {
> +		if (current_tier != -1 && nodes_empty(memory_tiers[current_tier]->nodelist))
> +			unregister_memory_tier(current_tier);
> +	} else if (current_tier != -1) {
> +		/* reset it back to the old tier */
> +		ret = __node_set_memory_tier(node, current_tier);
> +	}
> +out:
> +	mutex_unlock(&memory_tier_lock);
> +
> +	return ret;
> +}
> +
> +int node_set_memory_tier(int node, int tier)
> +{
> +	int current_tier, ret = 0;
> +
> +	if (tier >= MAX_MEMORY_TIERS)
> +		return -EINVAL;
> +
> +	mutex_lock(&memory_tier_lock);
> +	current_tier = __node_get_memory_tier(node);
> +	/*
> +	 * If the node is already part of a tier, proceed with the
> +	 * current tier value, because we might want to establish
> +	 * new migration paths now. The node might have been added to a tier
> +	 * before it was made part of N_MEMORY, hence establish_migration_targets
> +	 * will have skipped this node.
> +	 */
> +	if (current_tier != -1)
> +		tier = current_tier;
> +	ret = __node_set_memory_tier(node, tier);
> +	mutex_unlock(&memory_tier_lock);
> +
> +	return ret;
> +}
> +
>  /*
>   * node_demotion[] example:
>   *
Aneesh Kumar K.V June 8, 2022, 8:25 a.m. UTC | #10
On 6/8/22 12:48 PM, Ying Huang wrote:
> On Fri, 2022-05-27 at 17:55 +0530, Aneesh Kumar K.V wrote:
>> From: Jagdish Gediya <jvgediya@linux.ibm.com>
>>
>> Add support to read/write the memory tier index for a NUMA node.
>>
>> /sys/devices/system/node/nodeN/memtier
>>
>> where N = node id
>>
>> When read, it lists the memory tier that the node belongs to.
>>
>> When written, the kernel moves the node into the specified
>> memory tier; the tier assignment of all other nodes is not
>> affected.
>>
>> If the memory tier does not exist, writing to the above file
>> creates the tier and assigns the NUMA node to that tier.
>>
>> mutex memory_tier_lock is introduced to protect memory tier
>> related changes, as they can happen from sysfs as well as on
>> hotplug events.
>>
>> Signed-off-by: Jagdish Gediya <jvgediya@linux.ibm.com>
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>>   drivers/base/node.c     |  35 ++++++++++++++
>>   include/linux/migrate.h |   4 +-
>>   mm/migrate.c            | 103 ++++++++++++++++++++++++++++++++++++++++
>>   3 files changed, 141 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/base/node.c b/drivers/base/node.c
>> index ec8bb24a5a22..cf4a58446d8c 100644
>> --- a/drivers/base/node.c
>> +++ b/drivers/base/node.c
>> @@ -20,6 +20,7 @@
>>   #include <linux/pm_runtime.h>
>>   #include <linux/swap.h>
>>   #include <linux/slab.h>
>> +#include <linux/migrate.h>
>>   
>>   static struct bus_type node_subsys = {
>>   	.name = "node",
>> @@ -560,11 +561,45 @@ static ssize_t node_read_distance(struct device *dev,
>>   }
>>   static DEVICE_ATTR(distance, 0444, node_read_distance, NULL);
>>   
>> +#ifdef CONFIG_TIERED_MEMORY
>> +static ssize_t memtier_show(struct device *dev,
>> +			    struct device_attribute *attr,
>> +			    char *buf)
>> +{
>> +	int node = dev->id;
>> +
>> +	return sysfs_emit(buf, "%d\n", node_get_memory_tier(node));
>> +}
>> +
>> +static ssize_t memtier_store(struct device *dev,
>> +			     struct device_attribute *attr,
>> +			     const char *buf, size_t count)
>> +{
>> +	unsigned long tier;
>> +	int node = dev->id;
>> +
>> +	int ret = kstrtoul(buf, 10, &tier);
>> +	if (ret)
>> +		return ret;
>> +
>> +	ret = node_reset_memory_tier(node, tier);
>> +	if (ret)
>> +		return ret;
>> +
>> +	return count;
>> +}
>> +
>> +static DEVICE_ATTR_RW(memtier);
>> +#endif
>> +
>>   static struct attribute *node_dev_attrs[] = {
>>   	&dev_attr_meminfo.attr,
>>   	&dev_attr_numastat.attr,
>>   	&dev_attr_distance.attr,
>>   	&dev_attr_vmstat.attr,
>> +#ifdef CONFIG_TIERED_MEMORY
>> +	&dev_attr_memtier.attr,
>> +#endif
>>   	NULL
>>   };
>>   
>>
>>
>>
>> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
>> index 0ec653623565..d37d1d5dee82 100644
>> --- a/include/linux/migrate.h
>> +++ b/include/linux/migrate.h
>> @@ -177,13 +177,15 @@ enum memory_tier_type {
>>   };
>>   
>>   int next_demotion_node(int node);
>> -
>>   extern void migrate_on_reclaim_init(void);
>>   #ifdef CONFIG_HOTPLUG_CPU
>>   extern void set_migration_target_nodes(void);
>>   #else
>>   static inline void set_migration_target_nodes(void) {}
>>   #endif
>> +int node_get_memory_tier(int node);
>> +int node_set_memory_tier(int node, int tier);
>> +int node_reset_memory_tier(int node, int tier);
>>   #else
>>   #define numa_demotion_enabled	false
>>   static inline int next_demotion_node(int node)
>> diff --git a/mm/migrate.c b/mm/migrate.c
>> index f28ee93fb017..304559ba3372 100644
>> --- a/mm/migrate.c
>> +++ b/mm/migrate.c
>> @@ -2132,6 +2132,7 @@ static struct bus_type memory_tier_subsys = {
>>   	.dev_name = "memtier",
>>   };
>>   
>> +DEFINE_MUTEX(memory_tier_lock);
>>   static struct memory_tier *memory_tiers[MAX_MEMORY_TIERS];
>>   
>>   static ssize_t nodelist_show(struct device *dev,
>> @@ -2225,6 +2226,108 @@ static const struct attribute_group *memory_tier_attr_groups[] = {
>>   	NULL,
>>   };
>>   
>> +static int __node_get_memory_tier(int node)
>> +{
>> +	int tier;
>> +
>> +	for (tier = 0; tier < MAX_MEMORY_TIERS; tier++) {
>> +		if (memory_tiers[tier] && node_isset(node, memory_tiers[tier]->nodelist))
>> +			return tier;
>> +	}
>> +
>> +	return -1;
>> +}
>> +
>> +int node_get_memory_tier(int node)
>> +{
>> +	int tier;
>> +
>> +	/*
>> +	 * Make sure memory tier is not unregistered
>> +	 * while it is being read.
>> +	 */
>> +	mutex_lock(&memory_tier_lock);
>> +
>> +	tier = __node_get_memory_tier(node);
>> +
>> +	mutex_unlock(&memory_tier_lock);
>> +
>> +	return tier;
>> +}
>> +
>> +int __node_set_memory_tier(int node, int tier)
>> +{
>> +	int ret = 0;
>> +	/*
>> +	 * As register_memory_tier() for new tier can fail,
>> +	 * try it before modifying existing tier. register
>> +	 * tier makes tier visible in sysfs.
>> +	 */
>> +	if (!memory_tiers[tier]) {
>> +		ret = register_memory_tier(tier);
>> +		if (ret) {
>> +			goto out;
>> +		}
>> +	}
>> +
>> +	node_set(node, memory_tiers[tier]->nodelist);
>> +
>> +out:
>> +	return ret;
>> +}
>> +
>> +int node_reset_memory_tier(int node, int tier)
> 
> I think "reset" isn't a good name here.  Maybe something like "change"
> or "move"?
> 

how about node_update_memory_tier()?

-aneesh
Huang, Ying June 8, 2022, 8:29 a.m. UTC | #11
On Wed, 2022-06-08 at 13:55 +0530, Aneesh Kumar K V wrote:
> On 6/8/22 12:48 PM, Ying Huang wrote:
> > On Fri, 2022-05-27 at 17:55 +0530, Aneesh Kumar K.V wrote:
> > > From: Jagdish Gediya <jvgediya@linux.ibm.com>
> > > 
> > > Add support to read/write the memory tier index for a NUMA node.
> > > 
> > > /sys/devices/system/node/nodeN/memtier
> > > 
> > > where N = node id
> > > 
> > > When read, it lists the memory tier that the node belongs to.
> > > 
> > > When written, the kernel moves the node into the specified
> > > memory tier; the tier assignment of all other nodes is not
> > > affected.
> > > 
> > > If the memory tier does not exist, writing to the above file
> > > creates the tier and assigns the NUMA node to that tier.
> > > 
> > > mutex memory_tier_lock is introduced to protect memory tier
> > > related changes, as they can happen from sysfs as well as on
> > > hotplug events.
> > > 
> > > Signed-off-by: Jagdish Gediya <jvgediya@linux.ibm.com>
> > > Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> > > ---
> > >   drivers/base/node.c     |  35 ++++++++++++++
> > >   include/linux/migrate.h |   4 +-
> > >   mm/migrate.c            | 103 ++++++++++++++++++++++++++++++++++++++++
> > >   3 files changed, 141 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/drivers/base/node.c b/drivers/base/node.c
> > > index ec8bb24a5a22..cf4a58446d8c 100644
> > > --- a/drivers/base/node.c
> > > +++ b/drivers/base/node.c
> > > @@ -20,6 +20,7 @@
> > >   #include <linux/pm_runtime.h>
> > >   #include <linux/swap.h>
> > >   #include <linux/slab.h>
> > > +#include <linux/migrate.h>
> > >   
> > >   static struct bus_type node_subsys = {
> > >   	.name = "node",
> > > @@ -560,11 +561,45 @@ static ssize_t node_read_distance(struct device *dev,
> > >   }
> > >   static DEVICE_ATTR(distance, 0444, node_read_distance, NULL);
> > >   
> > > +#ifdef CONFIG_TIERED_MEMORY
> > > +static ssize_t memtier_show(struct device *dev,
> > > +			    struct device_attribute *attr,
> > > +			    char *buf)
> > > +{
> > > +	int node = dev->id;
> > > +
> > > +	return sysfs_emit(buf, "%d\n", node_get_memory_tier(node));
> > > +}
> > > +
> > > +static ssize_t memtier_store(struct device *dev,
> > > +			     struct device_attribute *attr,
> > > +			     const char *buf, size_t count)
> > > +{
> > > +	unsigned long tier;
> > > +	int node = dev->id;
> > > +
> > > +	int ret = kstrtoul(buf, 10, &tier);
> > > +	if (ret)
> > > +		return ret;
> > > +
> > > +	ret = node_reset_memory_tier(node, tier);
> > > +	if (ret)
> > > +		return ret;
> > > +
> > > +	return count;
> > > +}
> > > +
> > > +static DEVICE_ATTR_RW(memtier);
> > > +#endif
> > > +
> > >   static struct attribute *node_dev_attrs[] = {
> > >   	&dev_attr_meminfo.attr,
> > >   	&dev_attr_numastat.attr,
> > >   	&dev_attr_distance.attr,
> > >   	&dev_attr_vmstat.attr,
> > > +#ifdef CONFIG_TIERED_MEMORY
> > > +	&dev_attr_memtier.attr,
> > > +#endif
> > >   	NULL
> > >   };
> > >   
> > > 
> > > 
> > > 
> > > 
> > > diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> > > index 0ec653623565..d37d1d5dee82 100644
> > > --- a/include/linux/migrate.h
> > > +++ b/include/linux/migrate.h
> > > @@ -177,13 +177,15 @@ enum memory_tier_type {
> > >   };
> > >   
> > >   int next_demotion_node(int node);
> > > -
> > >   extern void migrate_on_reclaim_init(void);
> > >   #ifdef CONFIG_HOTPLUG_CPU
> > >   extern void set_migration_target_nodes(void);
> > >   #else
> > >   static inline void set_migration_target_nodes(void) {}
> > >   #endif
> > > +int node_get_memory_tier(int node);
> > > +int node_set_memory_tier(int node, int tier);
> > > +int node_reset_memory_tier(int node, int tier);
> > >   #else
> > >   #define numa_demotion_enabled	false
> > >   static inline int next_demotion_node(int node)
> > > diff --git a/mm/migrate.c b/mm/migrate.c
> > > index f28ee93fb017..304559ba3372 100644
> > > --- a/mm/migrate.c
> > > +++ b/mm/migrate.c
> > > @@ -2132,6 +2132,7 @@ static struct bus_type memory_tier_subsys = {
> > >   	.dev_name = "memtier",
> > >   };
> > >   
> > > +DEFINE_MUTEX(memory_tier_lock);
> > >   static struct memory_tier *memory_tiers[MAX_MEMORY_TIERS];
> > >   
> > > 
> > > 
> > > 
> > > 
> > >   static ssize_t nodelist_show(struct device *dev,
> > > @@ -2225,6 +2226,108 @@ static const struct attribute_group *memory_tier_attr_groups[] = {
> > >   	NULL,
> > >   };
> > >   
> > > +static int __node_get_memory_tier(int node)
> > > +{
> > > +	int tier;
> > > +
> > > +	for (tier = 0; tier < MAX_MEMORY_TIERS; tier++) {
> > > +		if (memory_tiers[tier] && node_isset(node, memory_tiers[tier]->nodelist))
> > > +			return tier;
> > > +	}
> > > +
> > > +	return -1;
> > > +}
> > > +
> > > +int node_get_memory_tier(int node)
> > > +{
> > > +	int tier;
> > > +
> > > +	/*
> > > +	 * Make sure memory tier is not unregistered
> > > +	 * while it is being read.
> > > +	 */
> > > +	mutex_lock(&memory_tier_lock);
> > > +
> > > +	tier = __node_get_memory_tier(node);
> > > +
> > > +	mutex_unlock(&memory_tier_lock);
> > > +
> > > +	return tier;
> > > +}
> > > +
> > > +int __node_set_memory_tier(int node, int tier)
> > > +{
> > > +	int ret = 0;
> > > +	/*
> > > +	 * As register_memory_tier() for new tier can fail,
> > > +	 * try it before modifying existing tier. register
> > > +	 * tier makes tier visible in sysfs.
> > > +	 */
> > > +	if (!memory_tiers[tier]) {
> > > +		ret = register_memory_tier(tier);
> > > +		if (ret) {
> > > +			goto out;
> > > +		}
> > > +	}
> > > +
> > > +	node_set(node, memory_tiers[tier]->nodelist);
> > > +
> > > +out:
> > > +	return ret;
> > > +}
> > > +
> > > +int node_reset_memory_tier(int node, int tier)
> > 
> > I think "reset" isn't a good name here.  Maybe something like "change"
> > or "move"?
> > 
> 
> how about node_update_memory_tier()?

That sounds OK for me.

Best Regards,
Huang, Ying
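
A brief usage sketch of the per-node interface the patch below adds (a
hypothetical transcript, not from the thread; it assumes node1 starts
in the default DRAM tier, memtier1):

:/sys/devices/system/node/node1# cat memtier
1
:/sys/devices/system/node/node1# echo 2 > memtier
:/sys/devices/system/node/node1# cat memtier
2

Per the commit message, the write moves only node1; if memtier2 did not
already exist, it is created and node1 is assigned to it.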

Patch

diff --git a/drivers/base/node.c b/drivers/base/node.c
index ec8bb24a5a22..cf4a58446d8c 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -20,6 +20,7 @@ 
 #include <linux/pm_runtime.h>
 #include <linux/swap.h>
 #include <linux/slab.h>
+#include <linux/migrate.h>
 
 static struct bus_type node_subsys = {
 	.name = "node",
@@ -560,11 +561,45 @@  static ssize_t node_read_distance(struct device *dev,
 }
 static DEVICE_ATTR(distance, 0444, node_read_distance, NULL);
 
+#ifdef CONFIG_TIERED_MEMORY
+static ssize_t memtier_show(struct device *dev,
+			    struct device_attribute *attr,
+			    char *buf)
+{
+	int node = dev->id;
+
+	return sysfs_emit(buf, "%d\n", node_get_memory_tier(node));
+}
+
+static ssize_t memtier_store(struct device *dev,
+			     struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	unsigned long tier;
+	int node = dev->id;
+
+	int ret = kstrtoul(buf, 10, &tier);
+	if (ret)
+		return ret;
+
+	ret = node_reset_memory_tier(node, tier);
+	if (ret)
+		return ret;
+
+	return count;
+}
+
+static DEVICE_ATTR_RW(memtier);
+#endif
+
 static struct attribute *node_dev_attrs[] = {
 	&dev_attr_meminfo.attr,
 	&dev_attr_numastat.attr,
 	&dev_attr_distance.attr,
 	&dev_attr_vmstat.attr,
+#ifdef CONFIG_TIERED_MEMORY
+	&dev_attr_memtier.attr,
+#endif
 	NULL
 };
 
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 0ec653623565..d37d1d5dee82 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -177,13 +177,15 @@  enum memory_tier_type {
 };
 
 int next_demotion_node(int node);
-
 extern void migrate_on_reclaim_init(void);
 #ifdef CONFIG_HOTPLUG_CPU
 extern void set_migration_target_nodes(void);
 #else
 static inline void set_migration_target_nodes(void) {}
 #endif
+int node_get_memory_tier(int node);
+int node_set_memory_tier(int node, int tier);
+int node_reset_memory_tier(int node, int tier);
 #else
 #define numa_demotion_enabled	false
 static inline int next_demotion_node(int node)
diff --git a/mm/migrate.c b/mm/migrate.c
index f28ee93fb017..304559ba3372 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2132,6 +2132,7 @@  static struct bus_type memory_tier_subsys = {
 	.dev_name = "memtier",
 };
 
+DEFINE_MUTEX(memory_tier_lock);
 static struct memory_tier *memory_tiers[MAX_MEMORY_TIERS];
 
 static ssize_t nodelist_show(struct device *dev,
@@ -2225,6 +2226,108 @@  static const struct attribute_group *memory_tier_attr_groups[] = {
 	NULL,
 };
 
+static int __node_get_memory_tier(int node)
+{
+	int tier;
+
+	for (tier = 0; tier < MAX_MEMORY_TIERS; tier++) {
+		if (memory_tiers[tier] && node_isset(node, memory_tiers[tier]->nodelist))
+			return tier;
+	}
+
+	return -1;
+}
+
+int node_get_memory_tier(int node)
+{
+	int tier;
+
+	/*
+	 * Make sure memory tier is not unregistered
+	 * while it is being read.
+	 */
+	mutex_lock(&memory_tier_lock);
+
+	tier = __node_get_memory_tier(node);
+
+	mutex_unlock(&memory_tier_lock);
+
+	return tier;
+}
+
+int __node_set_memory_tier(int node, int tier)
+{
+	int ret = 0;
+	/*
+	 * As register_memory_tier() for new tier can fail,
+	 * try it before modifying existing tier. register
+	 * tier makes tier visible in sysfs.
+	 */
+	if (!memory_tiers[tier]) {
+		ret = register_memory_tier(tier);
+		if (ret) {
+			goto out;
+		}
+	}
+
+	node_set(node, memory_tiers[tier]->nodelist);
+
+out:
+	return ret;
+}
+
+int node_reset_memory_tier(int node, int tier)
+{
+	int current_tier, ret = 0;
+
+	if (tier < 0 || tier >= MAX_MEMORY_TIERS)
+		return -EINVAL;
+
+	mutex_lock(&memory_tier_lock);
+
+	current_tier = __node_get_memory_tier(node);
+	if (current_tier == tier)
+		goto out;
+
+	if (current_tier != -1)
+		node_clear(node, memory_tiers[current_tier]->nodelist);
+
+	ret = __node_set_memory_tier(node, tier);
+
+	if (!ret) {
+		if (current_tier != -1 && nodes_empty(memory_tiers[current_tier]->nodelist))
+			unregister_memory_tier(current_tier);
+	} else if (current_tier != -1) {
+		/* reset it back to the old tier */
+		ret = __node_set_memory_tier(node, current_tier);
+	}
+out:
+	mutex_unlock(&memory_tier_lock);
+
+	return ret;
+}
+
+int node_set_memory_tier(int node, int tier)
+{
+	int current_tier, ret = 0;
+
+	if (tier >= MAX_MEMORY_TIERS)
+		return -EINVAL;
+
+	mutex_lock(&memory_tier_lock);
+	current_tier = __node_get_memory_tier(node);
+	/*
+	 * If the node is already part of a tier, proceed with the
+	 * current tier value, because we might want to establish
+	 * new migration paths now. The node might have been added to a tier
+	 * before it was made part of N_MEMORY, hence establish_migration_targets
+	 * will have skipped this node.
+	 */
+	if (current_tier != -1)
+		tier = current_tier;
+	ret = __node_set_memory_tier(node, tier);
+	mutex_unlock(&memory_tier_lock);
+
+	return ret;
+}
+
 /*
  * node_demotion[] example:
  *