diff mbox series

[v11,4/8] mm/demotion/dax/kmem: Set node's abstract distance to MEMTIER_ADISTANCE_PMEM

Message ID 20220728190436.858458-5-aneesh.kumar@linux.ibm.com (mailing list archive)
State New
Headers show
Series mm/demotion: Memory tiers and demotion | expand

Commit Message

Aneesh Kumar K.V July 28, 2022, 7:04 p.m. UTC
By default, all nodes are assigned to the default memory tier which
is the memory tier designated for nodes with DRAM

Set dax kmem device node's tier to slower memory tier by assigning
abstract distance to MEMTIER_ADISTANCE_PMEM. PMEM tier
appears below the default memory tier in demotion order.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 drivers/dax/kmem.c           |  9 +++++++++
 include/linux/memory-tiers.h | 19 ++++++++++++++++++-
 mm/memory-tiers.c            | 28 ++++++++++++++++------------
 3 files changed, 43 insertions(+), 13 deletions(-)

Comments

Huang, Ying July 29, 2022, 6:20 a.m. UTC | #1
"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:

> By default, all nodes are assigned to the default memory tier which
> is the memory tier designated for nodes with DRAM
>
> Set dax kmem device node's tier to slower memory tier by assigning
> abstract distance to MEMTIER_ADISTANCE_PMEM. PMEM tier
> appears below the default memory tier in demotion order.
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  drivers/dax/kmem.c           |  9 +++++++++
>  include/linux/memory-tiers.h | 19 ++++++++++++++++++-
>  mm/memory-tiers.c            | 28 ++++++++++++++++------------
>  3 files changed, 43 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
> index a37622060fff..6b0d5de9a3e9 100644
> --- a/drivers/dax/kmem.c
> +++ b/drivers/dax/kmem.c
> @@ -11,6 +11,7 @@
>  #include <linux/fs.h>
>  #include <linux/mm.h>
>  #include <linux/mman.h>
> +#include <linux/memory-tiers.h>
>  #include "dax-private.h"
>  #include "bus.h"
>  
> @@ -41,6 +42,12 @@ struct dax_kmem_data {
>  	struct resource *res[];
>  };
>  
> +static struct memory_dev_type default_pmem_type  = {

Why is this named as default_pmem_type?  We will not change the memory
type of a node usually.

> +	.adistance = MEMTIER_ADISTANCE_PMEM,
> +	.tier_sibiling = LIST_HEAD_INIT(default_pmem_type.tier_sibiling),
> +	.nodes  = NODE_MASK_NONE,
> +};
> +
>  static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>  {
>  	struct device *dev = &dev_dax->dev;
> @@ -62,6 +69,8 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>  		return -EINVAL;
>  	}
>  
> +	init_node_memory_type(numa_node, &default_pmem_type);
> +

The memory hot-add below may fail.  So the error handling needs to be
added.

And, it appears that the memory type and memory tier of a node may be
fully initialized here before NUMA hot-adding started.  So I suggest to
set node_memory_types[] here only.  And set memory_dev_type->nodes in
node hot-add callback.  I think there is the proper place to complete
the initialization.

And, in theory dax/kmem.c can be unloaded.  So we need to clear
node_memory_types[] for nodes somewhere.

Best Regards,
Huang, Ying

>  	for (i = 0; i < dev_dax->nr_range; i++) {
>  		struct range range;
>  
> diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
> index 976f43a5e3be..4f4baf0bf430 100644
> --- a/include/linux/memory-tiers.h
> +++ b/include/linux/memory-tiers.h
> @@ -2,6 +2,8 @@
>  #ifndef _LINUX_MEMORY_TIERS_H
>  #define _LINUX_MEMORY_TIERS_H
>  
> +#include <linux/types.h>
> +#include <linux/nodemask.h>
>  /*
>   * Each tier cover a abstrace distance chunk size of 128
>   */
> @@ -15,12 +17,27 @@
>  #define MEMTIER_ADISTANCE_PMEM	(1 << MEMTIER_CHUNK_BITS)
>  #define MEMTIER_HOTPLUG_PRIO	100
>  
> +struct memory_tier;
> +struct memory_dev_type {
> +	/* list of memory types that are are part of same tier as this type */
> +	struct list_head tier_sibiling;
> +	/* abstract distance for this specific memory type */
> +	int adistance;
> +	/* Nodes of same abstract distance */
> +	nodemask_t nodes;
> +	struct memory_tier *memtier;
> +};
> +
>  #ifdef CONFIG_NUMA
> -#include <linux/types.h>
>  extern bool numa_demotion_enabled;
> +struct memory_dev_type *init_node_memory_type(int node, struct memory_dev_type *default_type);
>  
>  #else
>  
>  #define numa_demotion_enabled	false
> +static inline struct memory_dev_type *init_node_memory_type(int node, struct memory_dev_type *default_type)
> +{
> +	return ERR_PTR(-EINVAL);
> +}
>  #endif	/* CONFIG_NUMA */
>  #endif  /* _LINUX_MEMORY_TIERS_H */
> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
> index c9854a394d9b..109be75fa554 100644
> --- a/mm/memory-tiers.c
> +++ b/mm/memory-tiers.c
> @@ -1,6 +1,4 @@
>  // SPDX-License-Identifier: GPL-2.0
> -#include <linux/types.h>
> -#include <linux/nodemask.h>
>  #include <linux/slab.h>
>  #include <linux/lockdep.h>
>  #include <linux/memory.h>
> @@ -19,16 +17,6 @@ struct memory_tier {
>  	int adistance_start;
>  };
>  
> -struct memory_dev_type {
> -	/* list of memory types that are are part of same tier as this type */
> -	struct list_head tier_sibiling;
> -	/* abstract distance for this specific memory type */
> -	int adistance;
> -	/* Nodes of same abstract distance */
> -	nodemask_t nodes;
> -	struct memory_tier *memtier;
> -};
> -
>  static DEFINE_MUTEX(memory_tier_lock);
>  static LIST_HEAD(memory_tiers);
>  struct memory_dev_type *node_memory_types[MAX_NUMNODES];
> @@ -141,6 +129,22 @@ static void clear_node_memory_tier(int node)
>  	mutex_unlock(&memory_tier_lock);
>  }
>  
> +struct memory_dev_type *init_node_memory_type(int node, struct memory_dev_type *default_type)
> +{
> +	struct memory_dev_type *mem_type;
> +
> +	mutex_lock(&memory_tier_lock);
> +	if (node_memory_types[node]) {
> +		mem_type = node_memory_types[node];
> +	} else {
> +		node_memory_types[node] = default_type;
> +		node_set(node, default_type->nodes);
> +		mem_type = default_type;
> +	}
> +	mutex_unlock(&memory_tier_lock);
> +	return mem_type;
> +}
> +
>  static int __meminit memtier_hotplug_callback(struct notifier_block *self,
>  					      unsigned long action, void *_arg)
>  {
Aneesh Kumar K.V July 29, 2022, 7:19 a.m. UTC | #2
"Huang, Ying" <ying.huang@intel.com> writes:

> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>
>> By default, all nodes are assigned to the default memory tier which
>> is the memory tier designated for nodes with DRAM
>>
>> Set dax kmem device node's tier to slower memory tier by assigning
>> abstract distance to MEMTIER_ADISTANCE_PMEM. PMEM tier
>> appears below the default memory tier in demotion order.
>>
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>>  drivers/dax/kmem.c           |  9 +++++++++
>>  include/linux/memory-tiers.h | 19 ++++++++++++++++++-
>>  mm/memory-tiers.c            | 28 ++++++++++++++++------------
>>  3 files changed, 43 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
>> index a37622060fff..6b0d5de9a3e9 100644
>> --- a/drivers/dax/kmem.c
>> +++ b/drivers/dax/kmem.c
>> @@ -11,6 +11,7 @@
>>  #include <linux/fs.h>
>>  #include <linux/mm.h>
>>  #include <linux/mman.h>
>> +#include <linux/memory-tiers.h>
>>  #include "dax-private.h"
>>  #include "bus.h"
>>  
>> @@ -41,6 +42,12 @@ struct dax_kmem_data {
>>  	struct resource *res[];
>>  };
>>  
>> +static struct memory_dev_type default_pmem_type  = {
>
> Why is this named as default_pmem_type?  We will not change the memory
> type of a node usually.
>

Any other suggestion? pmem_dev_type? 


>> +	.adistance = MEMTIER_ADISTANCE_PMEM,
>> +	.tier_sibiling = LIST_HEAD_INIT(default_pmem_type.tier_sibiling),
>> +	.nodes  = NODE_MASK_NONE,
>> +};
>> +
>>  static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>>  {
>>  	struct device *dev = &dev_dax->dev;
>> @@ -62,6 +69,8 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>>  		return -EINVAL;
>>  	}
>>  
>> +	init_node_memory_type(numa_node, &default_pmem_type);
>> +
>
> The memory hot-add below may fail.  So the error handling needs to be
> added.
>
> And, it appears that the memory type and memory tier of a node may be
> fully initialized here before NUMA hot-adding started.  So I suggest to
> set node_memory_types[] here only.  And set memory_dev_type->nodes in
> node hot-add callback.  I think there is the proper place to complete
> the initialization.
>
> And, in theory dax/kmem.c can be unloaded.  So we need to clear
> node_memory_types[] for nodes somewhere.
>

I guess by module exit we can be sure that all the memory managed
by dax/kmem is hotplugged out. How about something like below?

diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index 6b0d5de9a3e9..eb4e158012a9 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -248,6 +248,7 @@ static void __exit dax_kmem_exit(void)
 	dax_driver_unregister(&device_dax_kmem_driver);
 	if (!any_hotremove_failed)
 		kfree_const(kmem_name);
+	unregister_memory_type(&default_pmem_type);
 }
 
 MODULE_AUTHOR("Intel Corporation");
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index fc6b7a14da51..8355baf5b8b4 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -31,6 +31,7 @@ struct memory_dev_type {
 #ifdef CONFIG_NUMA
 extern bool numa_demotion_enabled;
 void init_node_memory_type(int node, struct memory_dev_type *default_type);
+void unregister_memory_type(struct memory_dev_type *memtype);
 #ifdef CONFIG_MIGRATION
 int next_demotion_node(int node);
 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
@@ -57,6 +58,10 @@ static inline bool node_is_toptier(int node)
 #define numa_demotion_enabled	false
 static inline void init_node_memory_type(int node, struct memory_dev_type *default_type)
 {
+}
+
+static inline void unregister_memory_type(struct memory_dev_type *memtype)
+{
 
 }
 
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 064e0f932795..4d29ebd4c4f3 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -500,6 +500,28 @@ void init_node_memory_type(int node, struct memory_dev_type *default_type)
 	mutex_unlock(&memory_tier_lock);
 }
 
+void unregister_memory_type(struct memory_dev_type *memtype)
+{
+	int node;
+	struct memory_tier *memtier = memtype->memtier;
+
+	mutex_lock(&memory_tier_lock);
+	for(node = 0; node < MAX_NUMNODES; node++) {
+		if (node_memory_types[node] == memtype) {
+			if (!nodes_empty(memtype->nodes))
+				WARN_ON(1);
+			node_memory_types[node] = NULL;
+		}
+	}
+
+	list_del(&memtype->tier_sibiling);
+	memtype->memtier = NULL;
+	if (list_empty(&memtier->memory_types))
+		destroy_memory_tier(memtier);
+
+	mutex_unlock(&memory_tier_lock);
+}
+
 void update_node_adistance(int node, struct memory_dev_type *memtype)
 {
 	pg_data_t *pgdat;
Huang, Ying Aug. 1, 2022, 2:06 a.m. UTC | #3
"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:

> "Huang, Ying" <ying.huang@intel.com> writes:
>
>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>>
>>> By default, all nodes are assigned to the default memory tier which
>>> is the memory tier designated for nodes with DRAM
>>>
>>> Set dax kmem device node's tier to slower memory tier by assigning
>>> abstract distance to MEMTIER_ADISTANCE_PMEM. PMEM tier
>>> appears below the default memory tier in demotion order.
>>>
>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>> ---
>>>  drivers/dax/kmem.c           |  9 +++++++++
>>>  include/linux/memory-tiers.h | 19 ++++++++++++++++++-
>>>  mm/memory-tiers.c            | 28 ++++++++++++++++------------
>>>  3 files changed, 43 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
>>> index a37622060fff..6b0d5de9a3e9 100644
>>> --- a/drivers/dax/kmem.c
>>> +++ b/drivers/dax/kmem.c
>>> @@ -11,6 +11,7 @@
>>>  #include <linux/fs.h>
>>>  #include <linux/mm.h>
>>>  #include <linux/mman.h>
>>> +#include <linux/memory-tiers.h>
>>>  #include "dax-private.h"
>>>  #include "bus.h"
>>>  
>>> @@ -41,6 +42,12 @@ struct dax_kmem_data {
>>>  	struct resource *res[];
>>>  };
>>>  
>>> +static struct memory_dev_type default_pmem_type  = {
>>
>> Why is this named as default_pmem_type?  We will not change the memory
>> type of a node usually.
>>
>
> Any other suggestion? pmem_dev_type? 

Or dax_pmem_type?

DAX is used to enumerate the memory device.

>
>>> +	.adistance = MEMTIER_ADISTANCE_PMEM,
>>> +	.tier_sibiling = LIST_HEAD_INIT(default_pmem_type.tier_sibiling),
>>> +	.nodes  = NODE_MASK_NONE,
>>> +};
>>> +
>>>  static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>>>  {
>>>  	struct device *dev = &dev_dax->dev;
>>> @@ -62,6 +69,8 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>>>  		return -EINVAL;
>>>  	}
>>>  
>>> +	init_node_memory_type(numa_node, &default_pmem_type);
>>> +
>>
>> The memory hot-add below may fail.  So the error handling needs to be
>> added.
>>
>> And, it appears that the memory type and memory tier of a node may be
>> fully initialized here before NUMA hot-adding started.  So I suggest to
>> set node_memory_types[] here only.  And set memory_dev_type->nodes in
>> node hot-add callback.  I think there is the proper place to complete
>> the initialization.
>>
>> And, in theory dax/kmem.c can be unloaded.  So we need to clear
>> node_memory_types[] for nodes somewhere.
>>
>
> I guess by module exit we can be sure that all the memory managed
> by dax/kmem is hotplugged out. How about something like below?

Because we set node_memorty_types[] in dev_dax_kmem_probe(), it's
natural to clear it in dev_dax_kmem_remove().

Best Regards,
Huang, Ying

> diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
> index 6b0d5de9a3e9..eb4e158012a9 100644
> --- a/drivers/dax/kmem.c
> +++ b/drivers/dax/kmem.c
> @@ -248,6 +248,7 @@ static void __exit dax_kmem_exit(void)
>  	dax_driver_unregister(&device_dax_kmem_driver);
>  	if (!any_hotremove_failed)
>  		kfree_const(kmem_name);
> +	unregister_memory_type(&default_pmem_type);
>  }
>  
>  MODULE_AUTHOR("Intel Corporation");
> diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
> index fc6b7a14da51..8355baf5b8b4 100644
> --- a/include/linux/memory-tiers.h
> +++ b/include/linux/memory-tiers.h
> @@ -31,6 +31,7 @@ struct memory_dev_type {
>  #ifdef CONFIG_NUMA
>  extern bool numa_demotion_enabled;
>  void init_node_memory_type(int node, struct memory_dev_type *default_type);
> +void unregister_memory_type(struct memory_dev_type *memtype);
>  #ifdef CONFIG_MIGRATION
>  int next_demotion_node(int node);
>  void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
> @@ -57,6 +58,10 @@ static inline bool node_is_toptier(int node)
>  #define numa_demotion_enabled	false
>  static inline void init_node_memory_type(int node, struct memory_dev_type *default_type)
>  {
> +}
> +
> +static inline void unregister_memory_type(struct memory_dev_type *memtype)
> +{
>  
>  }
>  
> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
> index 064e0f932795..4d29ebd4c4f3 100644
> --- a/mm/memory-tiers.c
> +++ b/mm/memory-tiers.c
> @@ -500,6 +500,28 @@ void init_node_memory_type(int node, struct memory_dev_type *default_type)
>  	mutex_unlock(&memory_tier_lock);
>  }
>  
> +void unregister_memory_type(struct memory_dev_type *memtype)
> +{
> +	int node;
> +	struct memory_tier *memtier = memtype->memtier;
> +
> +	mutex_lock(&memory_tier_lock);
> +	for(node = 0; node < MAX_NUMNODES; node++) {
> +		if (node_memory_types[node] == memtype) {
> +			if (!nodes_empty(memtype->nodes))
> +				WARN_ON(1);
> +			node_memory_types[node] = NULL;
> +		}
> +	}
> +
> +	list_del(&memtype->tier_sibiling);
> +	memtype->memtier = NULL;
> +	if (list_empty(&memtier->memory_types))
> +		destroy_memory_tier(memtier);
> +
> +	mutex_unlock(&memory_tier_lock);
> +}
> +
>  void update_node_adistance(int node, struct memory_dev_type *memtype)
>  {
>  	pg_data_t *pgdat;
Aneesh Kumar K.V Aug. 1, 2022, 4:40 a.m. UTC | #4
On 8/1/22 7:36 AM, Huang, Ying wrote:
> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
> 
>> "Huang, Ying" <ying.huang@intel.com> writes:
>>
>>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>>>
>>>> By default, all nodes are assigned to the default memory tier which
>>>> is the memory tier designated for nodes with DRAM
>>>>
>>>> Set dax kmem device node's tier to slower memory tier by assigning
>>>> abstract distance to MEMTIER_ADISTANCE_PMEM. PMEM tier
>>>> appears below the default memory tier in demotion order.
>>>>
>>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>>> ---
>>>>  drivers/dax/kmem.c           |  9 +++++++++
>>>>  include/linux/memory-tiers.h | 19 ++++++++++++++++++-
>>>>  mm/memory-tiers.c            | 28 ++++++++++++++++------------
>>>>  3 files changed, 43 insertions(+), 13 deletions(-)
>>>>
>>>> diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
>>>> index a37622060fff..6b0d5de9a3e9 100644
>>>> --- a/drivers/dax/kmem.c
>>>> +++ b/drivers/dax/kmem.c
>>>> @@ -11,6 +11,7 @@
>>>>  #include <linux/fs.h>
>>>>  #include <linux/mm.h>
>>>>  #include <linux/mman.h>
>>>> +#include <linux/memory-tiers.h>
>>>>  #include "dax-private.h"
>>>>  #include "bus.h"
>>>>  
>>>> @@ -41,6 +42,12 @@ struct dax_kmem_data {
>>>>  	struct resource *res[];
>>>>  };
>>>>  
>>>> +static struct memory_dev_type default_pmem_type  = {
>>>
>>> Why is this named as default_pmem_type?  We will not change the memory
>>> type of a node usually.
>>>
>>
>> Any other suggestion? pmem_dev_type? 
> 
> Or dax_pmem_type?
> 
> DAX is used to enumerate the memory device.
> 
>>
>>>> +	.adistance = MEMTIER_ADISTANCE_PMEM,
>>>> +	.tier_sibiling = LIST_HEAD_INIT(default_pmem_type.tier_sibiling),
>>>> +	.nodes  = NODE_MASK_NONE,
>>>> +};
>>>> +
>>>>  static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>>>>  {
>>>>  	struct device *dev = &dev_dax->dev;
>>>> @@ -62,6 +69,8 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>>>>  		return -EINVAL;
>>>>  	}
>>>>  
>>>> +	init_node_memory_type(numa_node, &default_pmem_type);
>>>> +
>>>
>>> The memory hot-add below may fail.  So the error handling needs to be
>>> added.
>>>
>>> And, it appears that the memory type and memory tier of a node may be
>>> fully initialized here before NUMA hot-adding started.  So I suggest to
>>> set node_memory_types[] here only.  And set memory_dev_type->nodes in
>>> node hot-add callback.  I think there is the proper place to complete
>>> the initialization.
>>>
>>> And, in theory dax/kmem.c can be unloaded.  So we need to clear
>>> node_memory_types[] for nodes somewhere.
>>>
>>
>> I guess by module exit we can be sure that all the memory managed
>> by dax/kmem is hotplugged out. How about something like below?
> 
> Because we set node_memorty_types[] in dev_dax_kmem_probe(), it's
> natural to clear it in dev_dax_kmem_remove().
> 

Most of required reset/clear is done as part of memory hotunplug. So
if we did manage to successfully unplug the memory, everything except
node_memory_types[node] should be reset. That makes the clear_node_memory_type
the below. 

void clear_node_memory_type(int node, struct memory_dev_type *memtype)
{

	mutex_lock(&memory_tier_lock);
	/*
	 * memory unplug did clear the node from the memtype and
	 * dax/kem did initialize this node's memory type.
	 */
	if (!node_isset(node, memtype->nodes) && node_memory_types[node]  == memtype){
		node_memory_types[node] = NULL;
	}
	mutex_unlock(&memory_tier_lock);
}

With the module unload, it is kind of force removing the usage of the specific memtype.
Considering module unload will remove the usage of specific memtype from other parts
of the kernel and we already do all the required reset in memory hot unplug, do we
need to do the clear_node_memory_type above? 

-aneesh
Huang, Ying Aug. 1, 2022, 5:10 a.m. UTC | #5
Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> writes:

> On 8/1/22 7:36 AM, Huang, Ying wrote:
>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>> 
>>> "Huang, Ying" <ying.huang@intel.com> writes:
>>>
>>>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>>>>
>>>>> By default, all nodes are assigned to the default memory tier which
>>>>> is the memory tier designated for nodes with DRAM
>>>>>
>>>>> Set dax kmem device node's tier to slower memory tier by assigning
>>>>> abstract distance to MEMTIER_ADISTANCE_PMEM. PMEM tier
>>>>> appears below the default memory tier in demotion order.
>>>>>
>>>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>>>> ---
>>>>>  drivers/dax/kmem.c           |  9 +++++++++
>>>>>  include/linux/memory-tiers.h | 19 ++++++++++++++++++-
>>>>>  mm/memory-tiers.c            | 28 ++++++++++++++++------------
>>>>>  3 files changed, 43 insertions(+), 13 deletions(-)
>>>>>
>>>>> diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
>>>>> index a37622060fff..6b0d5de9a3e9 100644
>>>>> --- a/drivers/dax/kmem.c
>>>>> +++ b/drivers/dax/kmem.c
>>>>> @@ -11,6 +11,7 @@
>>>>>  #include <linux/fs.h>
>>>>>  #include <linux/mm.h>
>>>>>  #include <linux/mman.h>
>>>>> +#include <linux/memory-tiers.h>
>>>>>  #include "dax-private.h"
>>>>>  #include "bus.h"
>>>>>  
>>>>> @@ -41,6 +42,12 @@ struct dax_kmem_data {
>>>>>  	struct resource *res[];
>>>>>  };
>>>>>  
>>>>> +static struct memory_dev_type default_pmem_type  = {
>>>>
>>>> Why is this named as default_pmem_type?  We will not change the memory
>>>> type of a node usually.
>>>>
>>>
>>> Any other suggestion? pmem_dev_type? 
>> 
>> Or dax_pmem_type?
>> 
>> DAX is used to enumerate the memory device.
>> 
>>>
>>>>> +	.adistance = MEMTIER_ADISTANCE_PMEM,
>>>>> +	.tier_sibiling = LIST_HEAD_INIT(default_pmem_type.tier_sibiling),
>>>>> +	.nodes  = NODE_MASK_NONE,
>>>>> +};
>>>>> +
>>>>>  static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>>>>>  {
>>>>>  	struct device *dev = &dev_dax->dev;
>>>>> @@ -62,6 +69,8 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>>>>>  		return -EINVAL;
>>>>>  	}
>>>>>  
>>>>> +	init_node_memory_type(numa_node, &default_pmem_type);
>>>>> +
>>>>
>>>> The memory hot-add below may fail.  So the error handling needs to be
>>>> added.
>>>>
>>>> And, it appears that the memory type and memory tier of a node may be
>>>> fully initialized here before NUMA hot-adding started.  So I suggest to
>>>> set node_memory_types[] here only.  And set memory_dev_type->nodes in
>>>> node hot-add callback.  I think there is the proper place to complete
>>>> the initialization.
>>>>
>>>> And, in theory dax/kmem.c can be unloaded.  So we need to clear
>>>> node_memory_types[] for nodes somewhere.
>>>>
>>>
>>> I guess by module exit we can be sure that all the memory managed
>>> by dax/kmem is hotplugged out. How about something like below?
>> 
>> Because we set node_memorty_types[] in dev_dax_kmem_probe(), it's
>> natural to clear it in dev_dax_kmem_remove().
>> 
>
> Most of required reset/clear is done as part of memory hotunplug. So
> if we did manage to successfully unplug the memory, everything except
> node_memory_types[node] should be reset. That makes the clear_node_memory_type
> the below. 
>
> void clear_node_memory_type(int node, struct memory_dev_type *memtype)
> {
>
> 	mutex_lock(&memory_tier_lock);
> 	/*
> 	 * memory unplug did clear the node from the memtype and
> 	 * dax/kem did initialize this node's memory type.
> 	 */
> 	if (!node_isset(node, memtype->nodes) && node_memory_types[node]  == memtype){
> 		node_memory_types[node] = NULL;
> 	}
> 	mutex_unlock(&memory_tier_lock);
> }
>
> With the module unload, it is kind of force removing the usage of the specific memtype.
> Considering module unload will remove the usage of specific memtype from other parts
> of the kernel and we already do all the required reset in memory hot unplug, do we
> need to do the clear_node_memory_type above? 

Per my understanding, we need to call clear_node_memory_type() in
dev_dax_kmem_remove().  After that, we have nothing to do in
dax_kmem_exit().

Best Regards,
Huang, Ying
Aneesh Kumar K.V Aug. 1, 2022, 5:38 a.m. UTC | #6
On 8/1/22 10:40 AM, Huang, Ying wrote:
> Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> writes:
> 
>> On 8/1/22 7:36 AM, Huang, Ying wrote:
>>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>>>
>>>> "Huang, Ying" <ying.huang@intel.com> writes:
>>>>
>>>>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>>>>>
>>>>>> By default, all nodes are assigned to the default memory tier which
>>>>>> is the memory tier designated for nodes with DRAM
>>>>>>
>>>>>> Set dax kmem device node's tier to slower memory tier by assigning
>>>>>> abstract distance to MEMTIER_ADISTANCE_PMEM. PMEM tier
>>>>>> appears below the default memory tier in demotion order.
>>>>>>
>>>>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>>>>> ---
>>>>>>  drivers/dax/kmem.c           |  9 +++++++++
>>>>>>  include/linux/memory-tiers.h | 19 ++++++++++++++++++-
>>>>>>  mm/memory-tiers.c            | 28 ++++++++++++++++------------
>>>>>>  3 files changed, 43 insertions(+), 13 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
>>>>>> index a37622060fff..6b0d5de9a3e9 100644
>>>>>> --- a/drivers/dax/kmem.c
>>>>>> +++ b/drivers/dax/kmem.c
>>>>>> @@ -11,6 +11,7 @@
>>>>>>  #include <linux/fs.h>
>>>>>>  #include <linux/mm.h>
>>>>>>  #include <linux/mman.h>
>>>>>> +#include <linux/memory-tiers.h>
>>>>>>  #include "dax-private.h"
>>>>>>  #include "bus.h"
>>>>>>  
>>>>>> @@ -41,6 +42,12 @@ struct dax_kmem_data {
>>>>>>  	struct resource *res[];
>>>>>>  };
>>>>>>  
>>>>>> +static struct memory_dev_type default_pmem_type  = {
>>>>>
>>>>> Why is this named as default_pmem_type?  We will not change the memory
>>>>> type of a node usually.
>>>>>
>>>>
>>>> Any other suggestion? pmem_dev_type? 
>>>
>>> Or dax_pmem_type?
>>>
>>> DAX is used to enumerate the memory device.
>>>
>>>>
>>>>>> +	.adistance = MEMTIER_ADISTANCE_PMEM,
>>>>>> +	.tier_sibiling = LIST_HEAD_INIT(default_pmem_type.tier_sibiling),
>>>>>> +	.nodes  = NODE_MASK_NONE,
>>>>>> +};
>>>>>> +
>>>>>>  static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>>>>>>  {
>>>>>>  	struct device *dev = &dev_dax->dev;
>>>>>> @@ -62,6 +69,8 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>>>>>>  		return -EINVAL;
>>>>>>  	}
>>>>>>  
>>>>>> +	init_node_memory_type(numa_node, &default_pmem_type);
>>>>>> +
>>>>>
>>>>> The memory hot-add below may fail.  So the error handling needs to be
>>>>> added.
>>>>>
>>>>> And, it appears that the memory type and memory tier of a node may be
>>>>> fully initialized here before NUMA hot-adding started.  So I suggest to
>>>>> set node_memory_types[] here only.  And set memory_dev_type->nodes in
>>>>> node hot-add callback.  I think there is the proper place to complete
>>>>> the initialization.
>>>>>
>>>>> And, in theory dax/kmem.c can be unloaded.  So we need to clear
>>>>> node_memory_types[] for nodes somewhere.
>>>>>
>>>>
>>>> I guess by module exit we can be sure that all the memory managed
>>>> by dax/kmem is hotplugged out. How about something like below?
>>>
>>> Because we set node_memorty_types[] in dev_dax_kmem_probe(), it's
>>> natural to clear it in dev_dax_kmem_remove().
>>>
>>
>> Most of required reset/clear is done as part of memory hotunplug. So
>> if we did manage to successfully unplug the memory, everything except
>> node_memory_types[node] should be reset. That makes the clear_node_memory_type
>> the below. 
>>
>> void clear_node_memory_type(int node, struct memory_dev_type *memtype)
>> {
>>
>> 	mutex_lock(&memory_tier_lock);
>> 	/*
>> 	 * memory unplug did clear the node from the memtype and
>> 	 * dax/kem did initialize this node's memory type.
>> 	 */
>> 	if (!node_isset(node, memtype->nodes) && node_memory_types[node]  == memtype){
>> 		node_memory_types[node] = NULL;
>> 	}
>> 	mutex_unlock(&memory_tier_lock);
>> }
>>
>> With the module unload, it is kind of force removing the usage of the specific memtype.
>> Considering module unload will remove the usage of specific memtype from other parts
>> of the kernel and we already do all the required reset in memory hot unplug, do we
>> need to do the clear_node_memory_type above? 
> 
> Per my understanding, we need to call clear_node_memory_type() in
> dev_dax_kmem_remove().  After that, we have nothing to do in
> dax_kmem_exit().
> 

Ok, I guess you are suggesting to do the clear_node_memory_type even if we fail the memory remove. 
Should we also rebuild demotion order? On a successful memory remove we do rebuild demotion order.
This is what i ended up with.

modified   drivers/dax/kmem.c
@@ -171,6 +171,7 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
 static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
 {
 	int i, success = 0;
+	int node = dev_dax->target_node;
 	struct device *dev = &dev_dax->dev;
 	struct dax_kmem_data *data = dev_get_drvdata(dev);
 
@@ -208,6 +209,12 @@ static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
 		kfree(data);
 		dev_set_drvdata(dev, NULL);
 	}
+	/*
+	 * Clear the memtype association, even if the memory
+	 * remove failed.
+	 */
+	clear_node_memory_type(node, dax_pmem_type);
+
 }
 #else
 static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
modified   include/linux/memory-tiers.h
@@ -31,6 +31,7 @@ struct memory_dev_type {
 #ifdef CONFIG_NUMA
 extern bool numa_demotion_enabled;
 void init_node_memory_type(int node, struct memory_dev_type *default_type);
+void clear_node_memory_type(int node, struct memory_dev_type *memtype);
 #ifdef CONFIG_MIGRATION
 int next_demotion_node(int node);
 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
@@ -57,6 +58,10 @@ static inline bool node_is_toptier(int node)
 #define numa_demotion_enabled	false
 static inline void init_node_memory_type(int node, struct memory_dev_type *default_type)
 {
+}
+
+static inline void unregister_memory_type(struct memory_dev_type *memtype)
+{
 
 }
 
modified   mm/memory-tiers.c
@@ -501,6 +501,36 @@ void init_node_memory_type(int node, struct memory_dev_type *default_type)
 }
 EXPORT_SYMBOL_GPL(init_node_memory_type);
 
+void clear_node_memory_type(int node, struct memory_dev_type *memtype)
+{
+	struct memory_tier *memtier;
+
+	mutex_lock(&memory_tier_lock);
+	/*
+	 * Even if we fail to unplug memory, clear the association of
+	 * this node to this specific memory type.
+	 */
+	if (node_memory_types[node] == memtype) {
+
+		memtier = __node_get_memory_tier(node);
+		if (memtier) {
+			rcu_assign_pointer(pgdat->memtier, NULL);
+			synchronize_rcu();
+		}
+		node_clear(node, memtype->nodes);
+		if (nodes_empty(memtype->nodes)) {
+			list_del(&memtype->tier_sibiling);
+			memtype->memtier = NULL;
+			if (current_memtier && list_empty(&current_memtier->memory_types))
+				destroy_memory_tier(current_memtier);
+
+		}
+		node_memory_types[node] = NULL;
+	}
+	mutex_unlock(&memory_tier_lock);
+}
+EXPORT_SYMBOL_GPL(init_node_memory_type);
+
 void update_node_adistance(int node, struct memory_dev_type *memtype)
 {
 	pg_data_t *pgdat;

[back
Huang, Ying Aug. 1, 2022, 6:37 a.m. UTC | #7
Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> writes:

> On 8/1/22 10:40 AM, Huang, Ying wrote:
>> Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> writes:
>> 
>>> On 8/1/22 7:36 AM, Huang, Ying wrote:
>>>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>>>>
>>>>> "Huang, Ying" <ying.huang@intel.com> writes:
>>>>>
>>>>>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>>>>>>
>>>>>>> By default, all nodes are assigned to the default memory tier which
>>>>>>> is the memory tier designated for nodes with DRAM
>>>>>>>
>>>>>>> Set dax kmem device node's tier to slower memory tier by assigning
>>>>>>> abstract distance to MEMTIER_ADISTANCE_PMEM. PMEM tier
>>>>>>> appears below the default memory tier in demotion order.
>>>>>>>
>>>>>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>>>>>> ---
>>>>>>>  drivers/dax/kmem.c           |  9 +++++++++
>>>>>>>  include/linux/memory-tiers.h | 19 ++++++++++++++++++-
>>>>>>>  mm/memory-tiers.c            | 28 ++++++++++++++++------------
>>>>>>>  3 files changed, 43 insertions(+), 13 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
>>>>>>> index a37622060fff..6b0d5de9a3e9 100644
>>>>>>> --- a/drivers/dax/kmem.c
>>>>>>> +++ b/drivers/dax/kmem.c
>>>>>>> @@ -11,6 +11,7 @@
>>>>>>>  #include <linux/fs.h>
>>>>>>>  #include <linux/mm.h>
>>>>>>>  #include <linux/mman.h>
>>>>>>> +#include <linux/memory-tiers.h>
>>>>>>>  #include "dax-private.h"
>>>>>>>  #include "bus.h"
>>>>>>>  
>>>>>>> @@ -41,6 +42,12 @@ struct dax_kmem_data {
>>>>>>>  	struct resource *res[];
>>>>>>>  };
>>>>>>>  
>>>>>>> +static struct memory_dev_type default_pmem_type  = {
>>>>>>
>>>>>> Why is this named as default_pmem_type?  We will not change the memory
>>>>>> type of a node usually.
>>>>>>
>>>>>
>>>>> Any other suggestion? pmem_dev_type? 
>>>>
>>>> Or dax_pmem_type?
>>>>
>>>> DAX is used to enumerate the memory device.
>>>>
>>>>>
>>>>>>> +	.adistance = MEMTIER_ADISTANCE_PMEM,
>>>>>>> +	.tier_sibiling = LIST_HEAD_INIT(default_pmem_type.tier_sibiling),
>>>>>>> +	.nodes  = NODE_MASK_NONE,
>>>>>>> +};
>>>>>>> +
>>>>>>>  static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>>>>>>>  {
>>>>>>>  	struct device *dev = &dev_dax->dev;
>>>>>>> @@ -62,6 +69,8 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>>>>>>>  		return -EINVAL;
>>>>>>>  	}
>>>>>>>  
>>>>>>> +	init_node_memory_type(numa_node, &default_pmem_type);
>>>>>>> +
>>>>>>
>>>>>> The memory hot-add below may fail.  So the error handling needs to be
>>>>>> added.
>>>>>>
>>>>>> And, it appears that the memory type and memory tier of a node may be
>>>>>> fully initialized here before NUMA hot-adding started.  So I suggest to
>>>>>> set node_memory_types[] here only.  And set memory_dev_type->nodes in
>>>>>> node hot-add callback.  I think there is the proper place to complete
>>>>>> the initialization.
>>>>>>
>>>>>> And, in theory dax/kmem.c can be unloaded.  So we need to clear
>>>>>> node_memory_types[] for nodes somewhere.
>>>>>>
>>>>>
>>>>> I guess by module exit we can be sure that all the memory managed
>>>>> by dax/kmem is hotplugged out. How about something like below?
>>>>
>>>> Because we set node_memorty_types[] in dev_dax_kmem_probe(), it's
>>>> natural to clear it in dev_dax_kmem_remove().
>>>>
>>>
>>> Most of required reset/clear is done as part of memory hotunplug. So
>>> if we did manage to successfully unplug the memory, everything except
>>> node_memory_types[node] should be reset. That makes the clear_node_memory_type
>>> the below. 
>>>
>>> void clear_node_memory_type(int node, struct memory_dev_type *memtype)
>>> {
>>>
>>> 	mutex_lock(&memory_tier_lock);
>>> 	/*
>>> 	 * memory unplug did clear the node from the memtype and
>>> 	 * dax/kem did initialize this node's memory type.
>>> 	 */
>>> 	if (!node_isset(node, memtype->nodes) && node_memory_types[node]  == memtype){
>>> 		node_memory_types[node] = NULL;
>>> 	}
>>> 	mutex_unlock(&memory_tier_lock);
>>> }
>>>
>>> With the module unload, it is kind of force removing the usage of the specific memtype.
>>> Considering module unload will remove the usage of specific memtype from other parts
>>> of the kernel and we already do all the required reset in memory hot unplug, do we
>>> need to do the clear_node_memory_type above? 
>> 
>> Per my understanding, we need to call clear_node_memory_type() in
>> dev_dax_kmem_remove().  After that, we have nothing to do in
>> dax_kmem_exit().
>> 
>
> Ok, I guess you are suggesting to do the clear_node_memory_type even if we fail the memory remove. 

Can we use node_memory_types[] to indicate whether a node is managed by
a driver?

Regardless being succeeded or failed, dev_dax_kmem_remove() will set
node_memory_types[] = NULL.  But until node is offlined, we will still
keep the node in the memory_dev_type (dax_pmem_type).

And we will prevent dax/kmem from unloading via try_module_get() and add
"struct module *" to struct memory_dev_type.

Best Regards,
Huang, Ying

> Should we also rebuild demotion order? On a successful memory remove we do rebuild demotion order.
> This is what i ended up with.
>
> modified   drivers/dax/kmem.c
> @@ -171,6 +171,7 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>  static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
>  {
>  	int i, success = 0;
> +	int node = dev_dax->target_node;
>  	struct device *dev = &dev_dax->dev;
>  	struct dax_kmem_data *data = dev_get_drvdata(dev);
>  
> @@ -208,6 +209,12 @@ static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
>  		kfree(data);
>  		dev_set_drvdata(dev, NULL);
>  	}
> +	/*
> +	 * Clear the memtype association, even if the memory
> +	 * remove failed.
> +	 */
> +	clear_node_memory_type(node, dax_pmem_type);
> +
>  }
>  #else
>  static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
> modified   include/linux/memory-tiers.h
> @@ -31,6 +31,7 @@ struct memory_dev_type {
>  #ifdef CONFIG_NUMA
>  extern bool numa_demotion_enabled;
>  void init_node_memory_type(int node, struct memory_dev_type *default_type);
> +void clear_node_memory_type(int node, struct memory_dev_type *memtype);
>  #ifdef CONFIG_MIGRATION
>  int next_demotion_node(int node);
>  void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
> @@ -57,6 +58,10 @@ static inline bool node_is_toptier(int node)
>  #define numa_demotion_enabled	false
>  static inline void init_node_memory_type(int node, struct memory_dev_type *default_type)
>  {
> +}
> +
> +static inline void unregister_memory_type(struct memory_dev_type *memtype)
> +{
>  
>  }
>  
> modified   mm/memory-tiers.c
> @@ -501,6 +501,36 @@ void init_node_memory_type(int node, struct memory_dev_type *default_type)
>  }
>  EXPORT_SYMBOL_GPL(init_node_memory_type);
>  
> +void clear_node_memory_type(int node, struct memory_dev_type *memtype)
> +{
> +	struct memory_tier *memtier;
> +
> +	mutex_lock(&memory_tier_lock);
> +	/*
> +	 * Even if we fail to unplug memory, clear the association of
> +	 * this node to this specific memory type.
> +	 */
> +	if (node_memory_types[node] == memtype) {
> +
> +		memtier = __node_get_memory_tier(node);
> +		if (memtier) {
> +			rcu_assign_pointer(pgdat->memtier, NULL);
> +			synchronize_rcu();
> +		}
> +		node_clear(node, memtype->nodes);
> +		if (nodes_empty(memtype->nodes)) {
> +			list_del(&memtype->tier_sibiling);
> +			memtype->memtier = NULL;
> +			if (current_memtier && list_empty(&current_memtier->memory_types))
> +				destroy_memory_tier(current_memtier);
> +
> +		}
> +		node_memory_types[node] = NULL;
> +	}
> +	mutex_unlock(&memory_tier_lock);
> +}
> +EXPORT_SYMBOL_GPL(init_node_memory_type);
> +
>  void update_node_adistance(int node, struct memory_dev_type *memtype)
>  {
>  	pg_data_t *pgdat;
>
> [back
Aneesh Kumar K.V Aug. 1, 2022, 6:55 a.m. UTC | #8
On 8/1/22 12:07 PM, Huang, Ying wrote:
> Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> writes:
> 
>> On 8/1/22 10:40 AM, Huang, Ying wrote:
>>> Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> writes:
>>>
>>>> On 8/1/22 7:36 AM, Huang, Ying wrote:
>>>>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>>>>>
>>>>>> "Huang, Ying" <ying.huang@intel.com> writes:
>>>>>>
>>>>>>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:

....

>>>>
>>>> With the module unload, it is kind of force removing the usage of the specific memtype.
>>>> Considering module unload will remove the usage of specific memtype from other parts
>>>> of the kernel and we already do all the required reset in memory hot unplug, do we
>>>> need to do the clear_node_memory_type above? 
>>>
>>> Per my understanding, we need to call clear_node_memory_type() in
>>> dev_dax_kmem_remove().  After that, we have nothing to do in
>>> dax_kmem_exit().
>>>
>>
>> Ok, I guess you are suggesting to do the clear_node_memory_type even if we fail the memory remove. 
> 
> Can we use node_memory_types[] to indicate whether a node is managed by
> a driver?
> 
> Regardless being succeeded or failed, dev_dax_kmem_remove() will set
> node_memory_types[] = NULL.  But until node is offlined, we will still
> keep the node in the memory_dev_type (dax_pmem_type).
> 
> And we will prevent dax/kmem from unloading via try_module_get() and add
> "struct module *" to struct memory_dev_type.
> 

Current dax/kmem driver is not holding any module reference and allows the module to be unloaded
anytime. Even if the memory onlined by the driver fails to be unplugged. Addition of memory_dev_type
as suggested by you will be different than that. Page demotion can continue to work without the
support of dax_pmem_type as long as we keep the older demotion order. Any new demotion order
rebuild will remove the the memory node which was not hotunplugged  from the demotion order. Isn't that
a much simpler implementation? 

-aneesh
Huang, Ying Aug. 1, 2022, 7:13 a.m. UTC | #9
Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> writes:

> On 8/1/22 12:07 PM, Huang, Ying wrote:
>> Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> writes:
>> 
>>> On 8/1/22 10:40 AM, Huang, Ying wrote:
>>>> Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> writes:
>>>>
>>>>> On 8/1/22 7:36 AM, Huang, Ying wrote:
>>>>>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>>>>>>
>>>>>>> "Huang, Ying" <ying.huang@intel.com> writes:
>>>>>>>
>>>>>>>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>
> ....
>
>>>>>
>>>>> With the module unload, it is kind of force removing the usage of the specific memtype.
>>>>> Considering module unload will remove the usage of specific memtype from other parts
>>>>> of the kernel and we already do all the required reset in memory hot unplug, do we
>>>>> need to do the clear_node_memory_type above? 
>>>>
>>>> Per my understanding, we need to call clear_node_memory_type() in
>>>> dev_dax_kmem_remove().  After that, we have nothing to do in
>>>> dax_kmem_exit().
>>>>
>>>
>>> Ok, I guess you are suggesting to do the clear_node_memory_type even if we fail the memory remove. 
>> 
>> Can we use node_memory_types[] to indicate whether a node is managed by
>> a driver?
>> 
>> Regardless being succeeded or failed, dev_dax_kmem_remove() will set
>> node_memory_types[] = NULL.  But until node is offlined, we will still
>> keep the node in the memory_dev_type (dax_pmem_type).
>> 
>> And we will prevent dax/kmem from unloading via try_module_get() and add
>> "struct module *" to struct memory_dev_type.
>> 
>
> Current dax/kmem driver is not holding any module reference and allows the module to be unloaded
> anytime. Even if the memory onlined by the driver fails to be unplugged. Addition of memory_dev_type
> as suggested by you will be different than that. Page demotion can continue to work without the
> support of dax_pmem_type as long as we keep the older demotion order. Any new demotion order
> rebuild will remove the the memory node which was not hotunplugged  from the demotion order. Isn't that
> a much simpler implementation? 

Per my understanding, unbinding/binding the dax/kmem driver means
changing the memory type of a memory device.  For example, unbinding
dax/kmem driver may mean changing the memory type from dax_pmem_type to
default_memory_type (or default_dram_type).  That appears strange.  But
if we force the NUMA node to be offlined for unbinding, we can avoid to
change the memory type to default_memory_type.

Best Regards,
Huang, Ying
Aneesh Kumar K.V Aug. 1, 2022, 7:41 a.m. UTC | #10
On 8/1/22 12:43 PM, Huang, Ying wrote:
> Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> writes:
> 
>> On 8/1/22 12:07 PM, Huang, Ying wrote:
>>> Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> writes:
>>>
>>>> On 8/1/22 10:40 AM, Huang, Ying wrote:
>>>>> Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> writes:
>>>>>
>>>>>> On 8/1/22 7:36 AM, Huang, Ying wrote:
>>>>>>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>>>>>>>
>>>>>>>> "Huang, Ying" <ying.huang@intel.com> writes:
>>>>>>>>
>>>>>>>>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>>
>> ....
>>
>>>>>>
>>>>>> With the module unload, it is kind of force removing the usage of the specific memtype.
>>>>>> Considering module unload will remove the usage of specific memtype from other parts
>>>>>> of the kernel and we already do all the required reset in memory hot unplug, do we
>>>>>> need to do the clear_node_memory_type above? 
>>>>>
>>>>> Per my understanding, we need to call clear_node_memory_type() in
>>>>> dev_dax_kmem_remove().  After that, we have nothing to do in
>>>>> dax_kmem_exit().
>>>>>
>>>>
>>>> Ok, I guess you are suggesting to do the clear_node_memory_type even if we fail the memory remove. 
>>>
>>> Can we use node_memory_types[] to indicate whether a node is managed by
>>> a driver?
>>>
>>> Regardless being succeeded or failed, dev_dax_kmem_remove() will set
>>> node_memory_types[] = NULL.  But until node is offlined, we will still
>>> keep the node in the memory_dev_type (dax_pmem_type).
>>>
>>> And we will prevent dax/kmem from unloading via try_module_get() and add
>>> "struct module *" to struct memory_dev_type.
>>>
>>
>> Current dax/kmem driver is not holding any module reference and allows the module to be unloaded
>> anytime. Even if the memory onlined by the driver fails to be unplugged. Addition of memory_dev_type
>> as suggested by you will be different than that. Page demotion can continue to work without the
>> support of dax_pmem_type as long as we keep the older demotion order. Any new demotion order
>> rebuild will remove the the memory node which was not hotunplugged  from the demotion order. Isn't that
>> a much simpler implementation? 
> 
> Per my understanding, unbinding/binding the dax/kmem driver means
> changing the memory type of a memory device.  For example, unbinding
> dax/kmem driver may mean changing the memory type from dax_pmem_type to
> default_memory_type (or default_dram_type).  That appears strange.  But
> if we force the NUMA node to be offlined for unbinding, we can avoid to
> change the memory type to default_memory_type.
> 

If we are able to unplug all the memory, we do remove the node from N_MEMORY.
If we fail to unplug the memory, we have two options. 

1) Keep the same demotion order
2) Rebuild the demotion order which results in memory NUMA node not participating
   in demotion. 

I agree with you that we should not switch to default memory type. 

The below code demonstrate how it can be done. If we want to keep
the same demotion order, we can remove establish_demotion_target() from
the below code. 

void clear_node_memory_type(int node, struct memory_dev_type *memtype)
{
	struct memory_tier *memtier;
	pg_data_t *pgdat = NODE_DATA(node);

	mutex_lock(&memory_tier_lock);
	/*
	 * Even if we fail to unplug memory, clear the association of
	 * this node to this specific memory type.
	 */
	if (node_isset(node, memtype->nodes) && node_memory_types[node] == memtype) {

		memtier = __node_get_memory_tier(node);
		if (memtier) {
			rcu_assign_pointer(pgdat->memtier, NULL);
			synchronize_rcu();
		}
		node_clear(node, memtype->nodes);
		if (nodes_empty(memtype->nodes)) {
			list_del(&memtype->tier_sibiling);
			memtype->memtier = NULL;
			if (memtier && list_empty(&memtier->memory_types))
				destroy_memory_tier(memtier);

		}
		establish_demotion_targets();
	}
	node_memory_types[node] = NULL;
	mutex_unlock(&memory_tier_lock);
}


If we agree that we want to keep the current behavior (that is to allow kmem driver unload
even on memory unplug failure) we can go with the above change. If we are suggesting we
should prevent a driver unload, then IMHO it should be independent of memory_dev_type
(or this patch series). We should make sure we take a module reference on successful
memory online and drop it only on successful offline. 


-aneesh
Huang, Ying Aug. 2, 2022, 1:58 a.m. UTC | #11
Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> writes:

> On 8/1/22 12:43 PM, Huang, Ying wrote:
>> Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> writes:
>> 
>>> On 8/1/22 12:07 PM, Huang, Ying wrote:
>>>> Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> writes:
>>>>
>>>>> On 8/1/22 10:40 AM, Huang, Ying wrote:
>>>>>> Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> writes:
>>>>>>
>>>>>>> On 8/1/22 7:36 AM, Huang, Ying wrote:
>>>>>>>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>>>>>>>>
>>>>>>>>> "Huang, Ying" <ying.huang@intel.com> writes:
>>>>>>>>>
>>>>>>>>>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>>>
>>> ....
>>>
>>>>>>>
>>>>>>> With the module unload, it is kind of force removing the usage of the specific memtype.
>>>>>>> Considering module unload will remove the usage of specific memtype from other parts
>>>>>>> of the kernel and we already do all the required reset in memory hot unplug, do we
>>>>>>> need to do the clear_node_memory_type above? 
>>>>>>
>>>>>> Per my understanding, we need to call clear_node_memory_type() in
>>>>>> dev_dax_kmem_remove().  After that, we have nothing to do in
>>>>>> dax_kmem_exit().
>>>>>>
>>>>>
>>>>> Ok, I guess you are suggesting to do the clear_node_memory_type even if we fail the memory remove. 
>>>>
>>>> Can we use node_memory_types[] to indicate whether a node is managed by
>>>> a driver?
>>>>
>>>> Regardless being succeeded or failed, dev_dax_kmem_remove() will set
>>>> node_memory_types[] = NULL.  But until node is offlined, we will still
>>>> keep the node in the memory_dev_type (dax_pmem_type).
>>>>
>>>> And we will prevent dax/kmem from unloading via try_module_get() and add
>>>> "struct module *" to struct memory_dev_type.
>>>>
>>>
>>> Current dax/kmem driver is not holding any module reference and allows the module to be unloaded
>>> anytime. Even if the memory onlined by the driver fails to be unplugged. Addition of memory_dev_type
>>> as suggested by you will be different than that. Page demotion can continue to work without the
>>> support of dax_pmem_type as long as we keep the older demotion order. Any new demotion order
>>> rebuild will remove the the memory node which was not hotunplugged  from the demotion order. Isn't that
>>> a much simpler implementation? 
>> 
>> Per my understanding, unbinding/binding the dax/kmem driver means
>> changing the memory type of a memory device.  For example, unbinding
>> dax/kmem driver may mean changing the memory type from dax_pmem_type to
>> default_memory_type (or default_dram_type).  That appears strange.  But
>> if we force the NUMA node to be offlined for unbinding, we can avoid to
>> change the memory type to default_memory_type.
>> 
>
> If we are able to unplug all the memory, we do remove the node from N_MEMORY.
> If we fail to unplug the memory, we have two options. 
>
> 1) Keep the same demotion order
> 2) Rebuild the demotion order which results in memory NUMA node not participating
>    in demotion. 
>
> I agree with you that we should not switch to default memory type. 
>
> The below code demonstrate how it can be done. If we want to keep
> the same demotion order, we can remove establish_demotion_target() from
> the below code. 
>
> void clear_node_memory_type(int node, struct memory_dev_type *memtype)
> {
> 	struct memory_tier *memtier;
> 	pg_data_t *pgdat = NODE_DATA(node);
>
> 	mutex_lock(&memory_tier_lock);
> 	/*
> 	 * Even if we fail to unplug memory, clear the association of
> 	 * this node to this specific memory type.
> 	 */
> 	if (node_isset(node, memtype->nodes) && node_memory_types[node] == memtype) {
>
> 		memtier = __node_get_memory_tier(node);
> 		if (memtier) {
> 			rcu_assign_pointer(pgdat->memtier, NULL);
> 			synchronize_rcu();
> 		}
> 		node_clear(node, memtype->nodes);
> 		if (nodes_empty(memtype->nodes)) {
> 			list_del(&memtype->tier_sibiling);
> 			memtype->memtier = NULL;
> 			if (memtier && list_empty(&memtier->memory_types))
> 				destroy_memory_tier(memtier);
>
> 		}
> 		establish_demotion_targets();
> 	}
> 	node_memory_types[node] = NULL;
> 	mutex_unlock(&memory_tier_lock);
> }
>
>
> If we agree that we want to keep the current behavior (that is to allow kmem driver unload
> even on memory unplug failure) we can go with the above change. If we are suggesting we
> should prevent a driver unload, then IMHO it should be independent of memory_dev_type
> (or this patch series). We should make sure we take a module reference on successful
> memory online and drop it only on successful offline. 

I suggest to keep a NUMA node in the memory_dev_type (dax_pmem_type)
until the node is offlined.

Yes.  The dax/kmem driver may be unbound to the dax device even if
memory offlining fails.  But we can still find someway to keep
the memory_dev_type of the NUMA node unchanged.

Solution 1 is to prevent dax/kmem driver from unloading via module
reference.  I think we do that in this series.

Solution 2 is to allocate dax_pmem_type dynamically, and manage it like
"kmem_name".  We may need some kind of reference counting to manage it.

Best Regards,
Huang, Ying
diff mbox series

Patch

diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index a37622060fff..6b0d5de9a3e9 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -11,6 +11,7 @@ 
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
+#include <linux/memory-tiers.h>
 #include "dax-private.h"
 #include "bus.h"
 
@@ -41,6 +42,12 @@  struct dax_kmem_data {
 	struct resource *res[];
 };
 
+static struct memory_dev_type default_pmem_type  = {
+	.adistance = MEMTIER_ADISTANCE_PMEM,
+	.tier_sibiling = LIST_HEAD_INIT(default_pmem_type.tier_sibiling),
+	.nodes  = NODE_MASK_NONE,
+};
+
 static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
 {
 	struct device *dev = &dev_dax->dev;
@@ -62,6 +69,8 @@  static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
 		return -EINVAL;
 	}
 
+	init_node_memory_type(numa_node, &default_pmem_type);
+
 	for (i = 0; i < dev_dax->nr_range; i++) {
 		struct range range;
 
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 976f43a5e3be..4f4baf0bf430 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -2,6 +2,8 @@ 
 #ifndef _LINUX_MEMORY_TIERS_H
 #define _LINUX_MEMORY_TIERS_H
 
+#include <linux/types.h>
+#include <linux/nodemask.h>
 /*
  * Each tier cover a abstrace distance chunk size of 128
  */
@@ -15,12 +17,27 @@ 
 #define MEMTIER_ADISTANCE_PMEM	(1 << MEMTIER_CHUNK_BITS)
 #define MEMTIER_HOTPLUG_PRIO	100
 
+struct memory_tier;
+struct memory_dev_type {
+	/* list of memory types that are are part of same tier as this type */
+	struct list_head tier_sibiling;
+	/* abstract distance for this specific memory type */
+	int adistance;
+	/* Nodes of same abstract distance */
+	nodemask_t nodes;
+	struct memory_tier *memtier;
+};
+
 #ifdef CONFIG_NUMA
-#include <linux/types.h>
 extern bool numa_demotion_enabled;
+struct memory_dev_type *init_node_memory_type(int node, struct memory_dev_type *default_type);
 
 #else
 
 #define numa_demotion_enabled	false
+static inline struct memory_dev_type *init_node_memory_type(int node, struct memory_dev_type *default_type)
+{
+	return ERR_PTR(-EINVAL);
+}
 #endif	/* CONFIG_NUMA */
 #endif  /* _LINUX_MEMORY_TIERS_H */
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index c9854a394d9b..109be75fa554 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -1,6 +1,4 @@ 
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/types.h>
-#include <linux/nodemask.h>
 #include <linux/slab.h>
 #include <linux/lockdep.h>
 #include <linux/memory.h>
@@ -19,16 +17,6 @@  struct memory_tier {
 	int adistance_start;
 };
 
-struct memory_dev_type {
-	/* list of memory types that are are part of same tier as this type */
-	struct list_head tier_sibiling;
-	/* abstract distance for this specific memory type */
-	int adistance;
-	/* Nodes of same abstract distance */
-	nodemask_t nodes;
-	struct memory_tier *memtier;
-};
-
 static DEFINE_MUTEX(memory_tier_lock);
 static LIST_HEAD(memory_tiers);
 struct memory_dev_type *node_memory_types[MAX_NUMNODES];
@@ -141,6 +129,22 @@  static void clear_node_memory_tier(int node)
 	mutex_unlock(&memory_tier_lock);
 }
 
+struct memory_dev_type *init_node_memory_type(int node, struct memory_dev_type *default_type)
+{
+	struct memory_dev_type *mem_type;
+
+	mutex_lock(&memory_tier_lock);
+	if (node_memory_types[node]) {
+		mem_type = node_memory_types[node];
+	} else {
+		node_memory_types[node] = default_type;
+		node_set(node, default_type->nodes);
+		mem_type = default_type;
+	}
+	mutex_unlock(&memory_tier_lock);
+	return mem_type;
+}
+
 static int __meminit memtier_hotplug_callback(struct notifier_block *self,
 					      unsigned long action, void *_arg)
 {