diff mbox series

[v2] mm/demotion: Expose memory tier details via sysfs

Message ID 20220829060745.287468-1-aneesh.kumar@linux.ibm.com (mailing list archive)
State New
Headers show
Series [v2] mm/demotion: Expose memory tier details via sysfs | expand

Commit Message

Aneesh Kumar K.V Aug. 29, 2022, 6:07 a.m. UTC
This patch adds /sys/devices/virtual/memory_tiering/ where all memory tier
related details can be found. All allocated memory tiers will be listed
there as /sys/devices/virtual/memory_tiering/memory_tierN/

The nodes which are part of a specific memory tier can be listed via
/sys/devices/virtual/memory_tiering/memory_tierN/nodes

The abstract distance range value of a specific memory tier can be listed via
/sys/devices/virtual/memory_tiering/memory_tierN/abstract_distance

A directory hierarchy looks like
:/sys/devices/virtual/memory_tiering$ tree memory_tier4/
memory_tier4/
├── abstract_distance
├── nodes
├── subsystem -> ../../../../bus/memory_tiering
└── uevent

All toptier nodes are listed via
/sys/devices/virtual/memory_tiering/toptier_nodes

:/sys/devices/virtual/memory_tiering$ cat toptier_nodes
0,2
:/sys/devices/virtual/memory_tiering$ cat memory_tier4/nodes
0,2
:/sys/devices/virtual/memory_tiering$ cat memory_tier4/abstract_distance
512 - 639

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 .../ABI/testing/sysfs-kernel-mm-memory-tiers  |  41 +++++
 mm/memory-tiers.c                             | 155 +++++++++++++++---
 2 files changed, 174 insertions(+), 22 deletions(-)
 create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers

Comments

Andrew Morton Aug. 29, 2022, 9 p.m. UTC | #1
On Mon, 29 Aug 2022 11:37:45 +0530 "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> wrote:

> This patch adds /sys/devices/virtual/memory_tiering/ where all memory tier
> related details can be found. All allocated memory tiers will be listed
> there as /sys/devices/virtual/memory_tiering/memory_tierN/
> 
> The nodes which are part of a specific memory tier can be listed via
> /sys/devices/virtual/memory_tiering/memory_tierN/nodes
> 
> The abstract distance range value of a specific memory tier can be listed via
> /sys/devices/virtual/memory_tiering/memory_tierN/abstract_distance
> 
> A directory hierarchy looks like
> :/sys/devices/virtual/memory_tiering$ tree memory_tier4/
> memory_tier4/
> ├── abstract_distance
> ├── nodes
> ├── subsystem -> ../../../../bus/memory_tiering
> └── uevent
> 
> All toptier nodes are listed via
> /sys/devices/virtual/memory_tiering/toptier_nodes
> 
> :/sys/devices/virtual/memory_tiering$ cat toptier_nodes
> 0,2
> :/sys/devices/virtual/memory_tiering$ cat memory_tier4/nodes
> 0,2
> :/sys/devices/virtual/memory_tiering$ cat memory_tier4/abstract_distance
> 512 - 639
> 
> ...
>
> --- a/mm/memory-tiers.c
> +++ b/mm/memory-tiers.c
> @@ -19,6 +19,7 @@ struct memory_tier {
>  	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
>  	 */
>  	int adistance_start;
> +	struct device dev;
>  	/* All the nodes that are part of all the lower memory tiers. */
>  	nodemask_t lower_tier_mask;
>  };
> @@ -36,6 +37,13 @@ static DEFINE_MUTEX(memory_tier_lock);
>  static LIST_HEAD(memory_tiers);
>  static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
>  static struct memory_dev_type *default_dram_type;
> +
> +#define to_memory_tier(device) container_of(device, struct memory_tier, dev)

Please, only use a macro when a C function cannot be used.

--- a/mm/memory-tiers.c~mm-demotion-expose-memory-tier-details-via-sysfs-fix
+++ a/mm/memory-tiers.c
@@ -38,7 +38,11 @@ static LIST_HEAD(memory_tiers);
 static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
 static struct memory_dev_type *default_dram_type;
 
-#define to_memory_tier(device) container_of(device, struct memory_tier, dev)
+static inline struct memory_tier *to_memory_tier(struct device *device)
+{
+	return container_of(device, struct memory_tier, dev);
+}
+
 static struct bus_type memory_tier_subsys = {
 	.name = "memory_tiering",
 	.dev_name = "memory_tier",

If only so it checks the type of `device'.
Wei Xu Aug. 30, 2022, 6:31 a.m. UTC | #2
On Sun, Aug 28, 2022 at 11:08 PM Aneesh Kumar K.V
<aneesh.kumar@linux.ibm.com> wrote:
>
> This patch adds /sys/devices/virtual/memory_tiering/ where all memory tier
> related details can be found. All allocated memory tiers will be listed
> there as /sys/devices/virtual/memory_tiering/memory_tierN/
>
> The nodes which are part of a specific memory tier can be listed via
> /sys/devices/virtual/memory_tiering/memory_tierN/nodes
>
> The abstract distance range value of a specific memory tier can be listed via
> /sys/devices/virtual/memory_tiering/memory_tierN/abstract_distance
>
> A directory hierarchy looks like
> :/sys/devices/virtual/memory_tiering$ tree memory_tier4/
> memory_tier4/
> ├── abstract_distance
> ├── nodes
> ├── subsystem -> ../../../../bus/memory_tiering
> └── uevent
>
> All toptier nodes are listed via
> /sys/devices/virtual/memory_tiering/toptier_nodes
>
> :/sys/devices/virtual/memory_tiering$ cat toptier_nodes
> 0,2
> :/sys/devices/virtual/memory_tiering$ cat memory_tier4/nodes
> 0,2
> :/sys/devices/virtual/memory_tiering$ cat memory_tier4/abstract_distance
> 512 - 639
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  .../ABI/testing/sysfs-kernel-mm-memory-tiers  |  41 +++++
>  mm/memory-tiers.c                             | 155 +++++++++++++++---
>  2 files changed, 174 insertions(+), 22 deletions(-)
>  create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers
>
> diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers b/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers
> new file mode 100644
> index 000000000000..6955f69a4423
> --- /dev/null
> +++ b/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers
> @@ -0,0 +1,41 @@
> +What:          /sys/devices/virtual/memory_tiering/
> +Date:          August 2022
> +Contact:       Linux memory management mailing list <linux-mm@kvack.org>
> +Description:   A collection of all the memory tiers allocated.
> +
> +               Individual memory tier details are contained in subdirectories
> +               named by the abstract distance of the memory tier.
> +
> +               /sys/devices/virtual/memory_tiering/memory_tierN/
> +
> +
> +What:          /sys/devices/virtual/memory_tiering/memory_tierN/
> +               /sys/devices/virtual/memory_tiering/memory_tierN/abstract_distance
> +               /sys/devices/virtual/memory_tiering/memory_tierN/nodes
> +Date:          August 2022
> +Contact:       Linux memory management mailing list <linux-mm@kvack.org>
> +Description:   Directory with details of a specific memory tier
> +
> +               This is the directory containing information about a particular
> +               memory tier, memtierN, where N is derived based on abstract distance.
> +
> +               A smaller value of N implies a higher (faster) memory tier in the
> +               hierarchy.

Given that abstract_distance is provided, it would be more flexible if
we don't commit to the interface where N in memtierN also indicates
the memory tier ordering.

> +
> +               abstract_distance: The abstract distance range this specific memory
> +               tier maps to.

I still think the name of "abstract distance" is kind of confusing
because it is not clear what is the other object that this distance
value is relative to.  Do we have to expose this value at this point
if N in memtierN can already indicate the memory tier ordering?

> +               nodes: NUMA nodes that are part of this memory tier.
> +
> +
> +What:          /sys/devices/virtual/memory_tiering/toptier_nodes
> +Date:          August 2022
> +Contact:       Linux memory management mailing list <linux-mm@kvack.org>
> +Description:   Toptier node mask
> +
> +               A toptier is defined as the memory tier from which memory promotion
> +               is not done by the kernel.
> +
> +               toptier_nodes: NUMA nodes that are part of all the memory tiers
> +               above a topier tier.

Nit: topier -> toptier

toptier_nodes should be the union of NUMA nodes that are part of each
toptier, not above a toptier, right?

> +
> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
> index c4bd6d052a33..d4648d4e4d54 100644
> --- a/mm/memory-tiers.c
> +++ b/mm/memory-tiers.c
> @@ -19,6 +19,7 @@ struct memory_tier {
>          * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
>          */
>         int adistance_start;
> +       struct device dev;
>         /* All the nodes that are part of all the lower memory tiers. */
>         nodemask_t lower_tier_mask;
>  };
> @@ -36,6 +37,13 @@ static DEFINE_MUTEX(memory_tier_lock);
>  static LIST_HEAD(memory_tiers);
>  static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
>  static struct memory_dev_type *default_dram_type;
> +
> +#define to_memory_tier(device) container_of(device, struct memory_tier, dev)
> +static struct bus_type memory_tier_subsys = {
> +       .name = "memory_tiering",
> +       .dev_name = "memory_tier",
> +};
> +
>  #ifdef CONFIG_MIGRATION
>  static int top_tier_adistance;
>  /*
> @@ -98,8 +106,73 @@ static int top_tier_adistance;
>  static struct demotion_nodes *node_demotion __read_mostly;
>  #endif /* CONFIG_MIGRATION */
>
> +static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
> +{
> +       nodemask_t nodes = NODE_MASK_NONE;
> +       struct memory_dev_type *memtype;
> +
> +       list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
> +               nodes_or(nodes, nodes, memtype->nodes);
> +
> +       return nodes;
> +}
> +
> +static void memory_tier_device_release(struct device *dev)
> +{
> +       struct memory_tier *tier = to_memory_tier(dev);
> +       /*
> +        * synchronize_rcu in clear_node_memory_tier makes sure
> +        * we don't have rcu access to this memory tier.
> +        */
> +       kfree(tier);
> +}
> +
> +static ssize_t nodes_show(struct device *dev,
> +                         struct device_attribute *attr, char *buf)
> +{
> +       int ret;
> +       nodemask_t nmask;
> +
> +       mutex_lock(&memory_tier_lock);
> +       nmask = get_memtier_nodemask(to_memory_tier(dev));
> +       ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
> +       mutex_unlock(&memory_tier_lock);
> +       return ret;
> +}
> +static DEVICE_ATTR_RO(nodes);
> +
> +static ssize_t abstract_distance_show(struct device *dev,
> +                                     struct device_attribute *attr, char *buf)
> +{
> +       int ret;
> +       struct memory_tier *memtier = to_memory_tier(dev);
> +
> +       mutex_lock(&memory_tier_lock);
> +       ret = sysfs_emit(buf, "%d - %d\n", memtier->adistance_start,
> +                        memtier->adistance_start + MEMTIER_CHUNK_SIZE - 1);
> +       mutex_unlock(&memory_tier_lock);
> +       return ret;
> +}
> +static DEVICE_ATTR_RO(abstract_distance);
> +
> +static struct attribute *memtier_dev_attrs[] = {
> +       &dev_attr_nodes.attr,
> +       &dev_attr_abstract_distance.attr,
> +       NULL
> +};
> +
> +static const struct attribute_group memtier_dev_group = {
> +       .attrs = memtier_dev_attrs,
> +};
> +
> +static const struct attribute_group *memtier_dev_groups[] = {
> +       &memtier_dev_group,
> +       NULL
> +};
> +
>  static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
>  {
> +       int ret;
>         bool found_slot = false;
>         struct memory_tier *memtier, *new_memtier;
>         int adistance = memtype->adistance;
> @@ -123,15 +196,14 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
>
>         list_for_each_entry(memtier, &memory_tiers, list) {
>                 if (adistance == memtier->adistance_start) {
> -                       list_add(&memtype->tier_sibiling, &memtier->memory_types);
> -                       return memtier;
> +                       goto link_memtype;
>                 } else if (adistance < memtier->adistance_start) {
>                         found_slot = true;
>                         break;
>                 }
>         }
>
> -       new_memtier = kmalloc(sizeof(struct memory_tier), GFP_KERNEL);
> +       new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL);
>         if (!new_memtier)
>                 return ERR_PTR(-ENOMEM);
>
> @@ -142,8 +214,23 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
>                 list_add_tail(&new_memtier->list, &memtier->list);
>         else
>                 list_add_tail(&new_memtier->list, &memory_tiers);
> -       list_add(&memtype->tier_sibiling, &new_memtier->memory_types);
> -       return new_memtier;
> +
> +       new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS;
> +       new_memtier->dev.bus = &memory_tier_subsys;
> +       new_memtier->dev.release = memory_tier_device_release;
> +       new_memtier->dev.groups = memtier_dev_groups;
> +
> +       ret = device_register(&new_memtier->dev);
> +       if (ret) {
> +               list_del(&memtier->list);
> +               put_device(&memtier->dev);
> +               return ERR_PTR(ret);
> +       }
> +       memtier = new_memtier;
> +
> +link_memtype:
> +       list_add(&memtype->tier_sibiling, &memtier->memory_types);
> +       return memtier;
>  }
>
>  static struct memory_tier *__node_get_memory_tier(int node)
> @@ -275,17 +362,6 @@ static void disable_all_demotion_targets(void)
>         synchronize_rcu();
>  }
>
> -static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
> -{
> -       nodemask_t nodes = NODE_MASK_NONE;
> -       struct memory_dev_type *memtype;
> -
> -       list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
> -               nodes_or(nodes, nodes, memtype->nodes);
> -
> -       return nodes;
> -}
> -
>  /*
>   * Find an automatic demotion target for all memory
>   * nodes. Failing here is OK.  It might just indicate
> @@ -432,11 +508,7 @@ static struct memory_tier *set_node_memory_tier(int node)
>  static void destroy_memory_tier(struct memory_tier *memtier)
>  {
>         list_del(&memtier->list);
> -       /*
> -        * synchronize_rcu in clear_node_memory_tier makes sure
> -        * we don't have rcu access to this memory tier.
> -        */
> -       kfree(memtier);
> +       device_unregister(&memtier->dev);
>  }
>
>  static bool clear_node_memory_tier(int node)
> @@ -563,11 +635,50 @@ static int __meminit memtier_hotplug_callback(struct notifier_block *self,
>         return notifier_from_errno(0);
>  }
>
> +static ssize_t toptier_nodes_show(struct device *dev,
> +                                    struct device_attribute *attr, char *buf)
> +{
> +       int ret;
> +       nodemask_t nmask, top_tier_mask = NODE_MASK_NONE;
> +       struct memory_tier *memtier = to_memory_tier(dev);
> +
> +       mutex_lock(&memory_tier_lock);
> +       list_for_each_entry(memtier, &memory_tiers, list) {
> +               if (memtier->adistance_start >= top_tier_adistance)

It is kind of confusing that a tier with top_tier_adistance is not
considered as a toptier.  Can we redefine top_tier_adistance to be the
inclusive upper bound of toptiers?

> +                       break;
> +               nmask = get_memtier_nodemask(memtier);
> +               nodes_or(top_tier_mask, top_tier_mask, nmask);
> +       }
> +
> +       ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&top_tier_mask));
> +       mutex_unlock(&memory_tier_lock);
> +       return ret;
> +}
> +static DEVICE_ATTR_RO(toptier_nodes);
> +
> +static struct attribute *memtier_subsys_attrs[] = {
> +       &dev_attr_toptier_nodes.attr,
> +       NULL
> +};
> +
> +static const struct attribute_group memtier_subsys_group = {
> +       .attrs = memtier_subsys_attrs,
> +};
> +
> +static const struct attribute_group *memtier_subsys_groups[] = {
> +       &memtier_subsys_group,
> +       NULL
> +};
> +
>  static int __init memory_tier_init(void)
>  {
> -       int node;
> +       int ret, node;
>         struct memory_tier *memtier;
>
> +       ret = subsys_virtual_register(&memory_tier_subsys, memtier_subsys_groups);
> +       if (ret)
> +               panic("%s() failed to register memory tier subsystem\n", __func__);
> +
>  #ifdef CONFIG_MIGRATION
>         node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
>                                 GFP_KERNEL);
> --
> 2.37.2
>
Aneesh Kumar K.V Aug. 30, 2022, 6:46 a.m. UTC | #3
On 8/30/22 12:01 PM, Wei Xu wrote:
> On Sun, Aug 28, 2022 at 11:08 PM Aneesh Kumar K.V
> <aneesh.kumar@linux.ibm.com> wrote:
>>
>> This patch adds /sys/devices/virtual/memory_tiering/ where all memory tier
>> related details can be found. All allocated memory tiers will be listed
>> there as /sys/devices/virtual/memory_tiering/memory_tierN/
>>
>> The nodes which are part of a specific memory tier can be listed via
>> /sys/devices/virtual/memory_tiering/memory_tierN/nodes
>>
>> The abstract distance range value of a specific memory tier can be listed via
>> /sys/devices/virtual/memory_tiering/memory_tierN/abstract_distance
>>
>> A directory hierarchy looks like
>> :/sys/devices/virtual/memory_tiering$ tree memory_tier4/
>> memory_tier4/
>> ├── abstract_distance
>> ├── nodes
>> ├── subsystem -> ../../../../bus/memory_tiering
>> └── uevent
>>
>> All toptier nodes are listed via
>> /sys/devices/virtual/memory_tiering/toptier_nodes
>>
>> :/sys/devices/virtual/memory_tiering$ cat toptier_nodes
>> 0,2
>> :/sys/devices/virtual/memory_tiering$ cat memory_tier4/nodes
>> 0,2
>> :/sys/devices/virtual/memory_tiering$ cat memory_tier4/abstract_distance
>> 512 - 639
>>
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>>  .../ABI/testing/sysfs-kernel-mm-memory-tiers  |  41 +++++
>>  mm/memory-tiers.c                             | 155 +++++++++++++++---
>>  2 files changed, 174 insertions(+), 22 deletions(-)
>>  create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers
>>
>> diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers b/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers
>> new file mode 100644
>> index 000000000000..6955f69a4423
>> --- /dev/null
>> +++ b/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers
>> @@ -0,0 +1,41 @@
>> +What:          /sys/devices/virtual/memory_tiering/
>> +Date:          August 2022
>> +Contact:       Linux memory management mailing list <linux-mm@kvack.org>
>> +Description:   A collection of all the memory tiers allocated.
>> +
>> +               Individual memory tier details are contained in subdirectories
>> +               named by the abstract distance of the memory tier.
>> +
>> +               /sys/devices/virtual/memory_tiering/memory_tierN/
>> +
>> +
>> +What:          /sys/devices/virtual/memory_tiering/memory_tierN/
>> +               /sys/devices/virtual/memory_tiering/memory_tierN/abstract_distance
>> +               /sys/devices/virtual/memory_tiering/memory_tierN/nodes
>> +Date:          August 2022
>> +Contact:       Linux memory management mailing list <linux-mm@kvack.org>
>> +Description:   Directory with details of a specific memory tier
>> +
>> +               This is the directory containing information about a particular
>> +               memory tier, memtierN, where N is derived based on abstract distance.
>> +
>> +               A smaller value of N implies a higher (faster) memory tier in the
>> +               hierarchy.
> 
> Given that abstract_distance is provided, it would be more flexible if
> we don't commit to the interface where N in memtierN also indicates
> the memory tier ordering.


IIUC this is one of the request that Johannes had ie, to be able to understand the
memory tier hierarchy based on memtier name. 

>> +
>> +               abstract_distance: The abstract distance range this specific memory
>> +               tier maps to.
> 
> I still think the name of "abstract distance" is kind of confusing
> because it is not clear what is the other object that this distance
> value is relative to.  Do we have to expose this value at this point
> if N in memtierN can already indicate the memory tier ordering?
> 

I do agree that abstract distance is confusing. But IIUC we agreed that it is much better
than other names suggested and is closer to already understood "numa distance" term.

https://lore.kernel.org/linux-mm/YuLF%2FGG8x5lQvg%2Ff@cmpxchg.org/


>> +               nodes: NUMA nodes that are part of this memory tier.
>> +
>> +
>> +What:          /sys/devices/virtual/memory_tiering/toptier_nodes
>> +Date:          August 2022
>> +Contact:       Linux memory management mailing list <linux-mm@kvack.org>
>> +Description:   Toptier node mask
>> +
>> +               A toptier is defined as the memory tier from which memory promotion
>> +               is not done by the kernel.
>> +
>> +               toptier_nodes: NUMA nodes that are part of all the memory tiers
>> +               above a topier tier.
> 
> Nit: topier -> toptier
> 
> toptier_nodes should be the union of NUMA nodes that are part of each
> toptier, not above a toptier, right?
> 

I will update the wording. Yes. it is the union of NUMA nodes that are 
part of each toptier. 

>> +
>> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
>> index c4bd6d052a33..d4648d4e4d54 100644
>> --- a/mm/memory-tiers.c
>> +++ b/mm/memory-tiers.c
>> @@ -19,6 +19,7 @@ struct memory_tier {
>>          * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
>>          */
>>         int adistance_start;
>> +       struct device dev;
>>         /* All the nodes that are part of all the lower memory tiers. */
>>         nodemask_t lower_tier_mask;
>>  };
>> @@ -36,6 +37,13 @@ static DEFINE_MUTEX(memory_tier_lock);
>>  static LIST_HEAD(memory_tiers);
>>  static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
>>  static struct memory_dev_type *default_dram_type;
>> +
>> +#define to_memory_tier(device) container_of(device, struct memory_tier, dev)
>> +static struct bus_type memory_tier_subsys = {
>> +       .name = "memory_tiering",
>> +       .dev_name = "memory_tier",
>> +};
>> +
>>  #ifdef CONFIG_MIGRATION
>>  static int top_tier_adistance;
>>  /*
>> @@ -98,8 +106,73 @@ static int top_tier_adistance;
>>  static struct demotion_nodes *node_demotion __read_mostly;
>>  #endif /* CONFIG_MIGRATION */
>>
>> +static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
>> +{
>> +       nodemask_t nodes = NODE_MASK_NONE;
>> +       struct memory_dev_type *memtype;
>> +
>> +       list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
>> +               nodes_or(nodes, nodes, memtype->nodes);
>> +
>> +       return nodes;
>> +}
>> +
>> +static void memory_tier_device_release(struct device *dev)
>> +{
>> +       struct memory_tier *tier = to_memory_tier(dev);
>> +       /*
>> +        * synchronize_rcu in clear_node_memory_tier makes sure
>> +        * we don't have rcu access to this memory tier.
>> +        */
>> +       kfree(tier);
>> +}
>> +
>> +static ssize_t nodes_show(struct device *dev,
>> +                         struct device_attribute *attr, char *buf)
>> +{
>> +       int ret;
>> +       nodemask_t nmask;
>> +
>> +       mutex_lock(&memory_tier_lock);
>> +       nmask = get_memtier_nodemask(to_memory_tier(dev));
>> +       ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
>> +       mutex_unlock(&memory_tier_lock);
>> +       return ret;
>> +}
>> +static DEVICE_ATTR_RO(nodes);
>> +
>> +static ssize_t abstract_distance_show(struct device *dev,
>> +                                     struct device_attribute *attr, char *buf)
>> +{
>> +       int ret;
>> +       struct memory_tier *memtier = to_memory_tier(dev);
>> +
>> +       mutex_lock(&memory_tier_lock);
>> +       ret = sysfs_emit(buf, "%d - %d\n", memtier->adistance_start,
>> +                        memtier->adistance_start + MEMTIER_CHUNK_SIZE - 1);
>> +       mutex_unlock(&memory_tier_lock);
>> +       return ret;
>> +}
>> +static DEVICE_ATTR_RO(abstract_distance);
>> +
>> +static struct attribute *memtier_dev_attrs[] = {
>> +       &dev_attr_nodes.attr,
>> +       &dev_attr_abstract_distance.attr,
>> +       NULL
>> +};
>> +
>> +static const struct attribute_group memtier_dev_group = {
>> +       .attrs = memtier_dev_attrs,
>> +};
>> +
>> +static const struct attribute_group *memtier_dev_groups[] = {
>> +       &memtier_dev_group,
>> +       NULL
>> +};
>> +
>>  static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
>>  {
>> +       int ret;
>>         bool found_slot = false;
>>         struct memory_tier *memtier, *new_memtier;
>>         int adistance = memtype->adistance;
>> @@ -123,15 +196,14 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
>>
>>         list_for_each_entry(memtier, &memory_tiers, list) {
>>                 if (adistance == memtier->adistance_start) {
>> -                       list_add(&memtype->tier_sibiling, &memtier->memory_types);
>> -                       return memtier;
>> +                       goto link_memtype;
>>                 } else if (adistance < memtier->adistance_start) {
>>                         found_slot = true;
>>                         break;
>>                 }
>>         }
>>
>> -       new_memtier = kmalloc(sizeof(struct memory_tier), GFP_KERNEL);
>> +       new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL);
>>         if (!new_memtier)
>>                 return ERR_PTR(-ENOMEM);
>>
>> @@ -142,8 +214,23 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
>>                 list_add_tail(&new_memtier->list, &memtier->list);
>>         else
>>                 list_add_tail(&new_memtier->list, &memory_tiers);
>> -       list_add(&memtype->tier_sibiling, &new_memtier->memory_types);
>> -       return new_memtier;
>> +
>> +       new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS;
>> +       new_memtier->dev.bus = &memory_tier_subsys;
>> +       new_memtier->dev.release = memory_tier_device_release;
>> +       new_memtier->dev.groups = memtier_dev_groups;
>> +
>> +       ret = device_register(&new_memtier->dev);
>> +       if (ret) {
>> +               list_del(&memtier->list);
>> +               put_device(&memtier->dev);
>> +               return ERR_PTR(ret);
>> +       }
>> +       memtier = new_memtier;
>> +
>> +link_memtype:
>> +       list_add(&memtype->tier_sibiling, &memtier->memory_types);
>> +       return memtier;
>>  }
>>
>>  static struct memory_tier *__node_get_memory_tier(int node)
>> @@ -275,17 +362,6 @@ static void disable_all_demotion_targets(void)
>>         synchronize_rcu();
>>  }
>>
>> -static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
>> -{
>> -       nodemask_t nodes = NODE_MASK_NONE;
>> -       struct memory_dev_type *memtype;
>> -
>> -       list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
>> -               nodes_or(nodes, nodes, memtype->nodes);
>> -
>> -       return nodes;
>> -}
>> -
>>  /*
>>   * Find an automatic demotion target for all memory
>>   * nodes. Failing here is OK.  It might just indicate
>> @@ -432,11 +508,7 @@ static struct memory_tier *set_node_memory_tier(int node)
>>  static void destroy_memory_tier(struct memory_tier *memtier)
>>  {
>>         list_del(&memtier->list);
>> -       /*
>> -        * synchronize_rcu in clear_node_memory_tier makes sure
>> -        * we don't have rcu access to this memory tier.
>> -        */
>> -       kfree(memtier);
>> +       device_unregister(&memtier->dev);
>>  }
>>
>>  static bool clear_node_memory_tier(int node)
>> @@ -563,11 +635,50 @@ static int __meminit memtier_hotplug_callback(struct notifier_block *self,
>>         return notifier_from_errno(0);
>>  }
>>
>> +static ssize_t toptier_nodes_show(struct device *dev,
>> +                                    struct device_attribute *attr, char *buf)
>> +{
>> +       int ret;
>> +       nodemask_t nmask, top_tier_mask = NODE_MASK_NONE;
>> +       struct memory_tier *memtier = to_memory_tier(dev);
>> +
>> +       mutex_lock(&memory_tier_lock);
>> +       list_for_each_entry(memtier, &memory_tiers, list) {
>> +               if (memtier->adistance_start >= top_tier_adistance)
> 
> It is kind of confusing that a tier with top_tier_adistance is not
> considered as a toptier.  Can we redefine top_tier_adistance to be the
> inclusive upper bound of toptiers?
> 

Agreed. I will fix that up by doing 

top_tier_adistance = memtier->adistance_start + MEMTIER_CHUNK_SIZE - 1;
			

>> +                       break;
>> +               nmask = get_memtier_nodemask(memtier);
>> +               nodes_or(top_tier_mask, top_tier_mask, nmask);
>> +       }
>> +
>> +       ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&top_tier_mask));
>> +       mutex_unlock(&memory_tier_lock);
>> +       return ret;
>> +}
>> +static DEVICE_ATTR_RO(toptier_nodes);
>> +
>> +static struct attribute *memtier_subsys_attrs[] = {
>> +       &dev_attr_toptier_nodes.attr,
>> +       NULL
>> +};
>> +
>> +static const struct attribute_group memtier_subsys_group = {
>> +       .attrs = memtier_subsys_attrs,
>> +};
>> +
>> +static const struct attribute_group *memtier_subsys_groups[] = {
>> +       &memtier_subsys_group,
>> +       NULL
>> +};
>> +
>>  static int __init memory_tier_init(void)
>>  {
>> -       int node;
>> +       int ret, node;
>>         struct memory_tier *memtier;
>>
>> +       ret = subsys_virtual_register(&memory_tier_subsys, memtier_subsys_groups);
>> +       if (ret)
>> +               panic("%s() failed to register memory tier subsystem\n", __func__);
>> +
>>  #ifdef CONFIG_MIGRATION
>>         node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
>>                                 GFP_KERNEL);
>> --
>> 2.37.2
>>
Wei Xu Aug. 30, 2022, 7:17 a.m. UTC | #4
On Mon, Aug 29, 2022 at 11:46 PM Aneesh Kumar K V
<aneesh.kumar@linux.ibm.com> wrote:
>
> On 8/30/22 12:01 PM, Wei Xu wrote:
> > On Sun, Aug 28, 2022 at 11:08 PM Aneesh Kumar K.V
> > <aneesh.kumar@linux.ibm.com> wrote:
> >>
> >> This patch adds /sys/devices/virtual/memory_tiering/ where all memory tier
> >> related details can be found. All allocated memory tiers will be listed
> >> there as /sys/devices/virtual/memory_tiering/memory_tierN/
> >>
> >> The nodes which are part of a specific memory tier can be listed via
> >> /sys/devices/virtual/memory_tiering/memory_tierN/nodes
> >>
> >> The abstract distance range value of a specific memory tier can be listed via
> >> /sys/devices/virtual/memory_tiering/memory_tierN/abstract_distance
> >>
> >> A directory hierarchy looks like
> >> :/sys/devices/virtual/memory_tiering$ tree memory_tier4/
> >> memory_tier4/
> >> ├── abstract_distance
> >> ├── nodes
> >> ├── subsystem -> ../../../../bus/memory_tiering
> >> └── uevent
> >>
> >> All toptier nodes are listed via
> >> /sys/devices/virtual/memory_tiering/toptier_nodes
> >>
> >> :/sys/devices/virtual/memory_tiering$ cat toptier_nodes
> >> 0,2
> >> :/sys/devices/virtual/memory_tiering$ cat memory_tier4/nodes
> >> 0,2
> >> :/sys/devices/virtual/memory_tiering$ cat memory_tier4/abstract_distance
> >> 512 - 639
> >>
> >> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> >> ---
> >>  .../ABI/testing/sysfs-kernel-mm-memory-tiers  |  41 +++++
> >>  mm/memory-tiers.c                             | 155 +++++++++++++++---
> >>  2 files changed, 174 insertions(+), 22 deletions(-)
> >>  create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers
> >>
> >> diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers b/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers
> >> new file mode 100644
> >> index 000000000000..6955f69a4423
> >> --- /dev/null
> >> +++ b/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers
> >> @@ -0,0 +1,41 @@
> >> +What:          /sys/devices/virtual/memory_tiering/
> >> +Date:          August 2022
> >> +Contact:       Linux memory management mailing list <linux-mm@kvack.org>
> >> +Description:   A collection of all the memory tiers allocated.
> >> +
> >> +               Individual memory tier details are contained in subdirectories
> >> +               named by the abstract distance of the memory tier.
> >> +
> >> +               /sys/devices/virtual/memory_tiering/memory_tierN/
> >> +
> >> +
> >> +What:          /sys/devices/virtual/memory_tiering/memory_tierN/
> >> +               /sys/devices/virtual/memory_tiering/memory_tierN/abstract_distance
> >> +               /sys/devices/virtual/memory_tiering/memory_tierN/nodes
> >> +Date:          August 2022
> >> +Contact:       Linux memory management mailing list <linux-mm@kvack.org>
> >> +Description:   Directory with details of a specific memory tier
> >> +
> >> +               This is the directory containing information about a particular
> >> +               memory tier, memtierN, where N is derived based on abstract distance.
> >> +
> >> +               A smaller value of N implies a higher (faster) memory tier in the
> >> +               hierarchy.
> >
> > Given that abstract_distance is provided, it would be more flexible if
> > we don't commit to the interface where N in memtierN also indicates
> > the memory tier ordering.
>
>
> IIUC this is one of the request that Johannes had ie, to be able to understand the
> memory tier hierarchy based on memtier name.
>
> >> +
> >> +               abstract_distance: The abstract distance range this specific memory
> >> +               tier maps to.
> >
> > I still think the name of "abstract distance" is kind of confusing
> > because it is not clear what is the other object that this distance
> > value is relative to.  Do we have to expose this value at this point
> > if N in memtierN can already indicate the memory tier ordering?
> >
>
> I do agree that abstract distance is confusing. But IIUC we agreed that it is much better
> than other names suggested and is closer to already understood "numa distance" term.
>
> https://lore.kernel.org/linux-mm/YuLF%2FGG8x5lQvg%2Ff@cmpxchg.org/
>

"NUMA distance" measures the distance between two NUMA nodes.

I bring it up again because this name will become a user visible
kernel interface, which we will need to live with for a long time.
Even if we decide to keep the name, it would be better if we can
define between which two (abstract) points the abstract distance
reports.

Another option is to remove this interface for now until it becomes
necessary to report abstract distances to userspace.

> >> +               nodes: NUMA nodes that are part of this memory tier.
> >> +
> >> +
> >> +What:          /sys/devices/virtual/memory_tiering/toptier_nodes
> >> +Date:          August 2022
> >> +Contact:       Linux memory management mailing list <linux-mm@kvack.org>
> >> +Description:   Toptier node mask
> >> +
> >> +               A toptier is defined as the memory tier from which memory promotion
> >> +               is not done by the kernel.
> >> +
> >> +               toptier_nodes: NUMA nodes that are part of all the memory tiers
> >> +               above a topier tier.
> >
> > Nit: topier -> toptier
> >
> > toptier_nodes should be the union of NUMA nodes that are part of each
> > toptier, not above a toptier, right?
> >
>
> I will update the wording. Yes. it is the union of NUMA nodes that are
> part of each toptier.
>
> >> +
> >> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
> >> index c4bd6d052a33..d4648d4e4d54 100644
> >> --- a/mm/memory-tiers.c
> >> +++ b/mm/memory-tiers.c
> >> @@ -19,6 +19,7 @@ struct memory_tier {
> >>          * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
> >>          */
> >>         int adistance_start;
> >> +       struct device dev;
> >>         /* All the nodes that are part of all the lower memory tiers. */
> >>         nodemask_t lower_tier_mask;
> >>  };
> >> @@ -36,6 +37,13 @@ static DEFINE_MUTEX(memory_tier_lock);
> >>  static LIST_HEAD(memory_tiers);
> >>  static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
> >>  static struct memory_dev_type *default_dram_type;
> >> +
> >> +#define to_memory_tier(device) container_of(device, struct memory_tier, dev)
> >> +static struct bus_type memory_tier_subsys = {
> >> +       .name = "memory_tiering",
> >> +       .dev_name = "memory_tier",
> >> +};
> >> +
> >>  #ifdef CONFIG_MIGRATION
> >>  static int top_tier_adistance;
> >>  /*
> >> @@ -98,8 +106,73 @@ static int top_tier_adistance;
> >>  static struct demotion_nodes *node_demotion __read_mostly;
> >>  #endif /* CONFIG_MIGRATION */
> >>
> >> +static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
> >> +{
> >> +       nodemask_t nodes = NODE_MASK_NONE;
> >> +       struct memory_dev_type *memtype;
> >> +
> >> +       list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
> >> +               nodes_or(nodes, nodes, memtype->nodes);
> >> +
> >> +       return nodes;
> >> +}
> >> +
> >> +static void memory_tier_device_release(struct device *dev)
> >> +{
> >> +       struct memory_tier *tier = to_memory_tier(dev);
> >> +       /*
> >> +        * synchronize_rcu in clear_node_memory_tier makes sure
> >> +        * we don't have rcu access to this memory tier.
> >> +        */
> >> +       kfree(tier);
> >> +}
> >> +
> >> +static ssize_t nodes_show(struct device *dev,
> >> +                         struct device_attribute *attr, char *buf)
> >> +{
> >> +       int ret;
> >> +       nodemask_t nmask;
> >> +
> >> +       mutex_lock(&memory_tier_lock);
> >> +       nmask = get_memtier_nodemask(to_memory_tier(dev));
> >> +       ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
> >> +       mutex_unlock(&memory_tier_lock);
> >> +       return ret;
> >> +}
> >> +static DEVICE_ATTR_RO(nodes);
> >> +
> >> +static ssize_t abstract_distance_show(struct device *dev,
> >> +                                     struct device_attribute *attr, char *buf)
> >> +{
> >> +       int ret;
> >> +       struct memory_tier *memtier = to_memory_tier(dev);
> >> +
> >> +       mutex_lock(&memory_tier_lock);
> >> +       ret = sysfs_emit(buf, "%d - %d\n", memtier->adistance_start,
> >> +                        memtier->adistance_start + MEMTIER_CHUNK_SIZE - 1);
> >> +       mutex_unlock(&memory_tier_lock);
> >> +       return ret;
> >> +}
> >> +static DEVICE_ATTR_RO(abstract_distance);
> >> +
> >> +static struct attribute *memtier_dev_attrs[] = {
> >> +       &dev_attr_nodes.attr,
> >> +       &dev_attr_abstract_distance.attr,
> >> +       NULL
> >> +};
> >> +
> >> +static const struct attribute_group memtier_dev_group = {
> >> +       .attrs = memtier_dev_attrs,
> >> +};
> >> +
> >> +static const struct attribute_group *memtier_dev_groups[] = {
> >> +       &memtier_dev_group,
> >> +       NULL
> >> +};
> >> +
> >>  static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
> >>  {
> >> +       int ret;
> >>         bool found_slot = false;
> >>         struct memory_tier *memtier, *new_memtier;
> >>         int adistance = memtype->adistance;
> >> @@ -123,15 +196,14 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
> >>
> >>         list_for_each_entry(memtier, &memory_tiers, list) {
> >>                 if (adistance == memtier->adistance_start) {
> >> -                       list_add(&memtype->tier_sibiling, &memtier->memory_types);
> >> -                       return memtier;
> >> +                       goto link_memtype;
> >>                 } else if (adistance < memtier->adistance_start) {
> >>                         found_slot = true;
> >>                         break;
> >>                 }
> >>         }
> >>
> >> -       new_memtier = kmalloc(sizeof(struct memory_tier), GFP_KERNEL);
> >> +       new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL);
> >>         if (!new_memtier)
> >>                 return ERR_PTR(-ENOMEM);
> >>
> >> @@ -142,8 +214,23 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
> >>                 list_add_tail(&new_memtier->list, &memtier->list);
> >>         else
> >>                 list_add_tail(&new_memtier->list, &memory_tiers);
> >> -       list_add(&memtype->tier_sibiling, &new_memtier->memory_types);
> >> -       return new_memtier;
> >> +
> >> +       new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS;
> >> +       new_memtier->dev.bus = &memory_tier_subsys;
> >> +       new_memtier->dev.release = memory_tier_device_release;
> >> +       new_memtier->dev.groups = memtier_dev_groups;
> >> +
> >> +       ret = device_register(&new_memtier->dev);
> >> +       if (ret) {
> >> +               list_del(&memtier->list);
> >> +               put_device(&memtier->dev);
> >> +               return ERR_PTR(ret);
> >> +       }
> >> +       memtier = new_memtier;
> >> +
> >> +link_memtype:
> >> +       list_add(&memtype->tier_sibiling, &memtier->memory_types);
> >> +       return memtier;
> >>  }
> >>
> >>  static struct memory_tier *__node_get_memory_tier(int node)
> >> @@ -275,17 +362,6 @@ static void disable_all_demotion_targets(void)
> >>         synchronize_rcu();
> >>  }
> >>
> >> -static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
> >> -{
> >> -       nodemask_t nodes = NODE_MASK_NONE;
> >> -       struct memory_dev_type *memtype;
> >> -
> >> -       list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
> >> -               nodes_or(nodes, nodes, memtype->nodes);
> >> -
> >> -       return nodes;
> >> -}
> >> -
> >>  /*
> >>   * Find an automatic demotion target for all memory
> >>   * nodes. Failing here is OK.  It might just indicate
> >> @@ -432,11 +508,7 @@ static struct memory_tier *set_node_memory_tier(int node)
> >>  static void destroy_memory_tier(struct memory_tier *memtier)
> >>  {
> >>         list_del(&memtier->list);
> >> -       /*
> >> -        * synchronize_rcu in clear_node_memory_tier makes sure
> >> -        * we don't have rcu access to this memory tier.
> >> -        */
> >> -       kfree(memtier);
> >> +       device_unregister(&memtier->dev);
> >>  }
> >>
> >>  static bool clear_node_memory_tier(int node)
> >> @@ -563,11 +635,50 @@ static int __meminit memtier_hotplug_callback(struct notifier_block *self,
> >>         return notifier_from_errno(0);
> >>  }
> >>
> >> +static ssize_t toptier_nodes_show(struct device *dev,
> >> +                                    struct device_attribute *attr, char *buf)
> >> +{
> >> +       int ret;
> >> +       nodemask_t nmask, top_tier_mask = NODE_MASK_NONE;
> >> +       struct memory_tier *memtier = to_memory_tier(dev);
> >> +
> >> +       mutex_lock(&memory_tier_lock);
> >> +       list_for_each_entry(memtier, &memory_tiers, list) {
> >> +               if (memtier->adistance_start >= top_tier_adistance)
> >
> > It is kind of confusing that a tier with top_tier_adistance is not
> > considered as a toptier.  Can we redefine top_tier_adistance to be the
> > inclusive upper bound of toptiers?
> >
>
> Agreed. I will fix that up by doing
>
> top_tier_adistance = memtier->adistance_start + MEMTIER_CHUNK_SIZE - 1;
>
>
> >> +                       break;
> >> +               nmask = get_memtier_nodemask(memtier);
> >> +               nodes_or(top_tier_mask, top_tier_mask, nmask);
> >> +       }
> >> +
> >> +       ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&top_tier_mask));
> >> +       mutex_unlock(&memory_tier_lock);
> >> +       return ret;
> >> +}
> >> +static DEVICE_ATTR_RO(toptier_nodes);
> >> +
> >> +static struct attribute *memtier_subsys_attrs[] = {
> >> +       &dev_attr_toptier_nodes.attr,
> >> +       NULL
> >> +};
> >> +
> >> +static const struct attribute_group memtier_subsys_group = {
> >> +       .attrs = memtier_subsys_attrs,
> >> +};
> >> +
> >> +static const struct attribute_group *memtier_subsys_groups[] = {
> >> +       &memtier_subsys_group,
> >> +       NULL
> >> +};
> >> +
> >>  static int __init memory_tier_init(void)
> >>  {
> >> -       int node;
> >> +       int ret, node;
> >>         struct memory_tier *memtier;
> >>
> >> +       ret = subsys_virtual_register(&memory_tier_subsys, memtier_subsys_groups);
> >> +       if (ret)
> >> +               panic("%s() failed to register memory tier subsystem\n", __func__);
> >> +
> >>  #ifdef CONFIG_MIGRATION
> >>         node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
> >>                                 GFP_KERNEL);
> >> --
> >> 2.37.2
> >>
>
Aneesh Kumar K.V Aug. 30, 2022, 7:28 a.m. UTC | #5
On 8/30/22 12:47 PM, Wei Xu wrote:
> On Mon, Aug 29, 2022 at 11:46 PM Aneesh Kumar K V
> <aneesh.kumar@linux.ibm.com> wrote:
>>
>> On 8/30/22 12:01 PM, Wei Xu wrote:
>>> On Sun, Aug 28, 2022 at 11:08 PM Aneesh Kumar K.V
>>> <aneesh.kumar@linux.ibm.com> wrote:
>>>>
>>>> This patch adds /sys/devices/virtual/memory_tiering/ where all memory tier
>>>> related details can be found. All allocated memory tiers will be listed
>>>> there as /sys/devices/virtual/memory_tiering/memory_tierN/
>>>>
>>>> The nodes which are part of a specific memory tier can be listed via
>>>> /sys/devices/virtual/memory_tiering/memory_tierN/nodes
>>>>
>>>> The abstract distance range value of a specific memory tier can be listed via
>>>> /sys/devices/virtual/memory_tiering/memory_tierN/abstract_distance
>>>>
>>>> A directory hierarchy looks like
>>>> :/sys/devices/virtual/memory_tiering$ tree memory_tier4/
>>>> memory_tier4/
>>>> ├── abstract_distance
>>>> ├── nodes
>>>> ├── subsystem -> ../../../../bus/memory_tiering
>>>> └── uevent
>>>>
>>>> All toptier nodes are listed via
>>>> /sys/devices/virtual/memory_tiering/toptier_nodes
>>>>
>>>> :/sys/devices/virtual/memory_tiering$ cat toptier_nodes
>>>> 0,2
>>>> :/sys/devices/virtual/memory_tiering$ cat memory_tier4/nodes
>>>> 0,2
>>>> :/sys/devices/virtual/memory_tiering$ cat memory_tier4/abstract_distance
>>>> 512 - 639
>>>>
>>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>>> ---
>>>>  .../ABI/testing/sysfs-kernel-mm-memory-tiers  |  41 +++++
>>>>  mm/memory-tiers.c                             | 155 +++++++++++++++---
>>>>  2 files changed, 174 insertions(+), 22 deletions(-)
>>>>  create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers
>>>>
>>>> diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers b/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers
>>>> new file mode 100644
>>>> index 000000000000..6955f69a4423
>>>> --- /dev/null
>>>> +++ b/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers
>>>> @@ -0,0 +1,41 @@
>>>> +What:          /sys/devices/virtual/memory_tiering/
>>>> +Date:          August 2022
>>>> +Contact:       Linux memory management mailing list <linux-mm@kvack.org>
>>>> +Description:   A collection of all the memory tiers allocated.
>>>> +
>>>> +               Individual memory tier details are contained in subdirectories
>>>> +               named by the abstract distance of the memory tier.
>>>> +
>>>> +               /sys/devices/virtual/memory_tiering/memory_tierN/
>>>> +
>>>> +
>>>> +What:          /sys/devices/virtual/memory_tiering/memory_tierN/
>>>> +               /sys/devices/virtual/memory_tiering/memory_tierN/abstract_distance
>>>> +               /sys/devices/virtual/memory_tiering/memory_tierN/nodes
>>>> +Date:          August 2022
>>>> +Contact:       Linux memory management mailing list <linux-mm@kvack.org>
>>>> +Description:   Directory with details of a specific memory tier
>>>> +
>>>> +               This is the directory containing information about a particular
>>>> +               memory tier, memtierN, where N is derived based on abstract distance.
>>>> +
>>>> +               A smaller value of N implies a higher (faster) memory tier in the
>>>> +               hierarchy.
>>>
>>> Given that abstract_distance is provided, it would be more flexible if
>>> we don't commit to the interface where N in memtierN also indicates
>>> the memory tier ordering.
>>
>>
>> IIUC this is one of the request that Johannes had ie, to be able to understand the
>> memory tier hierarchy based on memtier name.
>>
>>>> +
>>>> +               abstract_distance: The abstract distance range this specific memory
>>>> +               tier maps to.
>>>
>>> I still think the name of "abstract distance" is kind of confusing
>>> because it is not clear what is the other object that this distance
>>> value is relative to.  Do we have to expose this value at this point
>>> if N in memtierN can already indicate the memory tier ordering?
>>>
>>
>> I do agree that abstract distance is confusing. But IIUC we agreed that it is much better
>> than other names suggested and is closer to already understood "numa distance" term.
>>
>> https://lore.kernel.org/linux-mm/YuLF%2FGG8x5lQvg%2Ff@cmpxchg.org/
>>
> 
> "NUMA distance" measures the distance between two NUMA nodes.
> 
> I bring it up again because this name will become a user visible
> kernel interface, which we will need to live with for a long time.
> Even if we decide to keep the name, it would be better if we can
> define between which two (abstract) points the abstract distance
> reports.
> 
> Another option is to remove this interface for now until it becomes
> necessary to report abstract distances to userspace.
> 

Ok I will send a v3 with abstract_distance dropped.

-aneesh
Huang, Ying Sept. 1, 2022, 6:37 a.m. UTC | #6
Wei Xu <weixugc@google.com> writes:

> On Mon, Aug 29, 2022 at 11:46 PM Aneesh Kumar K V
> <aneesh.kumar@linux.ibm.com> wrote:
>>
>> On 8/30/22 12:01 PM, Wei Xu wrote:
>> > On Sun, Aug 28, 2022 at 11:08 PM Aneesh Kumar K.V
>> > <aneesh.kumar@linux.ibm.com> wrote:

[snip]

>> >>
>> >> +
>> >> +What:          /sys/devices/virtual/memory_tiering/memory_tierN/
>> >> +               /sys/devices/virtual/memory_tiering/memory_tierN/abstract_distance
>> >> +               /sys/devices/virtual/memory_tiering/memory_tierN/nodes
>> >> +Date:          August 2022
>> >> +Contact:       Linux memory management mailing list <linux-mm@kvack.org>
>> >> +Description:   Directory with details of a specific memory tier
>> >> +
>> >> +               This is the directory containing information about a particular
>> >> +               memory tier, memtierN, where N is derived based on abstract distance.
>> >> +
>> >> +               A smaller value of N implies a higher (faster) memory tier in the
>> >> +               hierarchy.
>> >
>> > Given that abstract_distance is provided, it would be more flexible if
>> > we don't commit to the interface where N in memtierN also indicates
>> > the memory tier ordering.
>>
>>
>> IIUC this is one of the request that Johannes had ie, to be able to understand the
>> memory tier hierarchy based on memtier name.
>>
>> >> +
>> >> +               abstract_distance: The abstract distance range this specific memory
>> >> +               tier maps to.
>> >
>> > I still think the name of "abstract distance" is kind of confusing
>> > because it is not clear what is the other object that this distance
>> > value is relative to.  Do we have to expose this value at this point
>> > if N in memtierN can already indicate the memory tier ordering?
>> >
>>
>> I do agree that abstract distance is confusing. But IIUC we agreed that it is much better
>> than other names suggested and is closer to already understood "numa distance" term.
>>
>> https://lore.kernel.org/linux-mm/YuLF%2FGG8x5lQvg%2Ff@cmpxchg.org/
>>
>
> "NUMA distance" measures the distance between two NUMA nodes.

Per my understanding, "NUMA distance" measures the distance between the
CPUs of one NUMA node to the memory of another NUMA node.

> I bring it up again because this name will become a user visible
> kernel interface, which we will need to live with for a long time.
> Even if we decide to keep the name, it would be better if we can
> define between which two (abstract) points the abstract distance
> reports.

My opinion is that "abstract distance" reflects the distance
(latency+bandwidth) between the CPUs of one NUMA socket to a type of
memory in the same NUMA socket.

Hi, Johannes,

What do you think about this?  You are the inventor of the "abstract
distance".  Can you elaborate its definition?

Best Regards,
Huang, Ying

> Another option is to remove this interface for now until it becomes
> necessary to report abstract distances to userspace.

[snip]
diff mbox series

Patch

diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers b/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers
new file mode 100644
index 000000000000..6955f69a4423
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers
@@ -0,0 +1,41 @@ 
+What:		/sys/devices/virtual/memory_tiering/
+Date:		August 2022
+Contact:	Linux memory management mailing list <linux-mm@kvack.org>
+Description:	A collection of all the memory tiers allocated.
+
+		Individual memory tier details are contained in subdirectories
+		named by the abstract distance of the memory tier.
+
+		/sys/devices/virtual/memory_tiering/memory_tierN/
+
+
+What:		/sys/devices/virtual/memory_tiering/memory_tierN/
+		/sys/devices/virtual/memory_tiering/memory_tierN/abstract_distance
+		/sys/devices/virtual/memory_tiering/memory_tierN/nodes
+Date:		August 2022
+Contact:	Linux memory management mailing list <linux-mm@kvack.org>
+Description:	Directory with details of a specific memory tier
+
+		This is the directory containing information about a particular
+		memory tier, memtierN, where N is derived based on abstract distance.
+
+		A smaller value of N implies a higher (faster) memory tier in the
+		hierarchy.
+
+		abstract_distance: The abstract distance range this specific memory
+		tier maps to.
+
+		nodes: NUMA nodes that are part of this memory tier.
+
+
+What:		/sys/devices/virtual/memory_tiering/toptier_nodes
+Date:		August 2022
+Contact:	Linux memory management mailing list <linux-mm@kvack.org>
+Description:	Toptier node mask
+
+		A toptier is defined as the memory tier from which memory promotion
+		is not done by the kernel.
+
+		toptier_nodes: NUMA nodes that are part of all the memory tiers
+		above a topier tier.
+
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index c4bd6d052a33..d4648d4e4d54 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -19,6 +19,7 @@  struct memory_tier {
 	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
 	 */
 	int adistance_start;
+	struct device dev;
 	/* All the nodes that are part of all the lower memory tiers. */
 	nodemask_t lower_tier_mask;
 };
@@ -36,6 +37,13 @@  static DEFINE_MUTEX(memory_tier_lock);
 static LIST_HEAD(memory_tiers);
 static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
 static struct memory_dev_type *default_dram_type;
+
+#define to_memory_tier(device) container_of(device, struct memory_tier, dev)
+static struct bus_type memory_tier_subsys = {
+	.name = "memory_tiering",
+	.dev_name = "memory_tier",
+};
+
 #ifdef CONFIG_MIGRATION
 static int top_tier_adistance;
 /*
@@ -98,8 +106,73 @@  static int top_tier_adistance;
 static struct demotion_nodes *node_demotion __read_mostly;
 #endif /* CONFIG_MIGRATION */
 
+static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
+{
+	nodemask_t nodes = NODE_MASK_NONE;
+	struct memory_dev_type *memtype;
+
+	list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
+		nodes_or(nodes, nodes, memtype->nodes);
+
+	return nodes;
+}
+
+static void memory_tier_device_release(struct device *dev)
+{
+	struct memory_tier *tier = to_memory_tier(dev);
+	/*
+	 * synchronize_rcu in clear_node_memory_tier makes sure
+	 * we don't have rcu access to this memory tier.
+	 */
+	kfree(tier);
+}
+
+static ssize_t nodes_show(struct device *dev,
+			  struct device_attribute *attr, char *buf)
+{
+	int ret;
+	nodemask_t nmask;
+
+	mutex_lock(&memory_tier_lock);
+	nmask = get_memtier_nodemask(to_memory_tier(dev));
+	ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
+	mutex_unlock(&memory_tier_lock);
+	return ret;
+}
+static DEVICE_ATTR_RO(nodes);
+
+static ssize_t abstract_distance_show(struct device *dev,
+				      struct device_attribute *attr, char *buf)
+{
+	int ret;
+	struct memory_tier *memtier = to_memory_tier(dev);
+
+	mutex_lock(&memory_tier_lock);
+	ret = sysfs_emit(buf, "%d - %d\n", memtier->adistance_start,
+			 memtier->adistance_start + MEMTIER_CHUNK_SIZE - 1);
+	mutex_unlock(&memory_tier_lock);
+	return ret;
+}
+static DEVICE_ATTR_RO(abstract_distance);
+
+static struct attribute *memtier_dev_attrs[] = {
+	&dev_attr_nodes.attr,
+	&dev_attr_abstract_distance.attr,
+	NULL
+};
+
+static const struct attribute_group memtier_dev_group = {
+	.attrs = memtier_dev_attrs,
+};
+
+static const struct attribute_group *memtier_dev_groups[] = {
+	&memtier_dev_group,
+	NULL
+};
+
 static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
 {
+	int ret;
 	bool found_slot = false;
 	struct memory_tier *memtier, *new_memtier;
 	int adistance = memtype->adistance;
@@ -123,15 +196,14 @@  static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
 
 	list_for_each_entry(memtier, &memory_tiers, list) {
 		if (adistance == memtier->adistance_start) {
-			list_add(&memtype->tier_sibiling, &memtier->memory_types);
-			return memtier;
+			goto link_memtype;
 		} else if (adistance < memtier->adistance_start) {
 			found_slot = true;
 			break;
 		}
 	}
 
-	new_memtier = kmalloc(sizeof(struct memory_tier), GFP_KERNEL);
+	new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL);
 	if (!new_memtier)
 		return ERR_PTR(-ENOMEM);
 
@@ -142,8 +214,23 @@  static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
 		list_add_tail(&new_memtier->list, &memtier->list);
 	else
 		list_add_tail(&new_memtier->list, &memory_tiers);
-	list_add(&memtype->tier_sibiling, &new_memtier->memory_types);
-	return new_memtier;
+
+	new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS;
+	new_memtier->dev.bus = &memory_tier_subsys;
+	new_memtier->dev.release = memory_tier_device_release;
+	new_memtier->dev.groups = memtier_dev_groups;
+
+	ret = device_register(&new_memtier->dev);
+	if (ret) {
+		list_del(&memtier->list);
+		put_device(&memtier->dev);
+		return ERR_PTR(ret);
+	}
+	memtier = new_memtier;
+
+link_memtype:
+	list_add(&memtype->tier_sibiling, &memtier->memory_types);
+	return memtier;
 }
 
 static struct memory_tier *__node_get_memory_tier(int node)
@@ -275,17 +362,6 @@  static void disable_all_demotion_targets(void)
 	synchronize_rcu();
 }
 
-static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
-{
-	nodemask_t nodes = NODE_MASK_NONE;
-	struct memory_dev_type *memtype;
-
-	list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
-		nodes_or(nodes, nodes, memtype->nodes);
-
-	return nodes;
-}
-
 /*
  * Find an automatic demotion target for all memory
  * nodes. Failing here is OK.  It might just indicate
@@ -432,11 +508,7 @@  static struct memory_tier *set_node_memory_tier(int node)
 static void destroy_memory_tier(struct memory_tier *memtier)
 {
 	list_del(&memtier->list);
-	/*
-	 * synchronize_rcu in clear_node_memory_tier makes sure
-	 * we don't have rcu access to this memory tier.
-	 */
-	kfree(memtier);
+	device_unregister(&memtier->dev);
 }
 
 static bool clear_node_memory_tier(int node)
@@ -563,11 +635,50 @@  static int __meminit memtier_hotplug_callback(struct notifier_block *self,
 	return notifier_from_errno(0);
 }
 
+static ssize_t toptier_nodes_show(struct device *dev,
+				     struct device_attribute *attr, char *buf)
+{
+	int ret;
+	nodemask_t nmask, top_tier_mask = NODE_MASK_NONE;
+	struct memory_tier *memtier = to_memory_tier(dev);
+
+	mutex_lock(&memory_tier_lock);
+	list_for_each_entry(memtier, &memory_tiers, list) {
+		if (memtier->adistance_start >= top_tier_adistance)
+			break;
+		nmask = get_memtier_nodemask(memtier);
+		nodes_or(top_tier_mask, top_tier_mask, nmask);
+	}
+
+	ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&top_tier_mask));
+	mutex_unlock(&memory_tier_lock);
+	return ret;
+}
+static DEVICE_ATTR_RO(toptier_nodes);
+
+static struct attribute *memtier_subsys_attrs[] = {
+	&dev_attr_toptier_nodes.attr,
+	NULL
+};
+
+static const struct attribute_group memtier_subsys_group = {
+	.attrs = memtier_subsys_attrs,
+};
+
+static const struct attribute_group *memtier_subsys_groups[] = {
+	&memtier_subsys_group,
+	NULL
+};
+
 static int __init memory_tier_init(void)
 {
-	int node;
+	int ret, node;
 	struct memory_tier *memtier;
 
+	ret = subsys_virtual_register(&memory_tier_subsys, memtier_subsys_groups);
+	if (ret)
+		panic("%s() failed to register memory tier subsystem\n", __func__);
+
 #ifdef CONFIG_MIGRATION
 	node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
 				GFP_KERNEL);