diff mbox series

[v3,4/7] mm/hotplug: Allow pageblock alignment via altmap reservation

Message ID 20230711044834.72809-5-aneesh.kumar@linux.ibm.com (mailing list archive)
State New
Headers show
Series Add support for memmap on memory feature on ppc64 | expand

Commit Message

Aneesh Kumar K.V July 11, 2023, 4:48 a.m. UTC
Add a new kconfig option that can be selected if we want to allow
pageblock alignment by reserving pages in the vmemmap altmap area.
This implies we will be reserving some pages for every memoryblock
This also allows the memmap on memory feature to be widely useful
with different memory block size values.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 mm/Kconfig          |  9 +++++++
 mm/memory_hotplug.c | 59 +++++++++++++++++++++++++++++++++++++--------
 2 files changed, 58 insertions(+), 10 deletions(-)

Comments

Huang, Ying July 11, 2023, 6:21 a.m. UTC | #1
"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:

> Add a new kconfig option that can be selected if we want to allow
> pageblock alignment by reserving pages in the vmemmap altmap area.
> This implies we will be reserving some pages for every memoryblock
> This also allows the memmap on memory feature to be widely useful
> with different memory block size values.
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  mm/Kconfig          |  9 +++++++
>  mm/memory_hotplug.c | 59 +++++++++++++++++++++++++++++++++++++--------
>  2 files changed, 58 insertions(+), 10 deletions(-)
>
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 932349271e28..88a1472b2086 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -570,6 +570,15 @@ config MHP_MEMMAP_ON_MEMORY
>  	depends on MEMORY_HOTPLUG && SPARSEMEM_VMEMMAP
>  	depends on ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
>  
> +config MHP_RESERVE_PAGES_MEMMAP_ON_MEMORY
> +       bool "Allow Reserving pages for page block aligment"
> +       depends on MHP_MEMMAP_ON_MEMORY
> +       help
> +	This option allows memmap on memory feature to be more useful
> +	with different memory block sizes. This is achieved by marking some pages
> +	in each memory block as reserved so that we can get page-block alignment
> +	for the remaining pages.
> +
>  endif # MEMORY_HOTPLUG
>  
>  config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 07c99b0cc371..f36aec1f7626 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1252,15 +1252,17 @@ static inline bool arch_supports_memmap_on_memory(unsigned long size)
>  {
>  	unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
>  	unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
> -	unsigned long remaining_size = size - vmemmap_size;
>  
> -	return IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
> -		IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
> +	return IS_ALIGNED(vmemmap_size, PMD_SIZE);
>  }
>  #endif
>  
>  static bool mhp_supports_memmap_on_memory(unsigned long size)
>  {
> +	unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
> +	unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
> +	unsigned long remaining_size = size - vmemmap_size;
> +
>  	/*
>  	 * Besides having arch support and the feature enabled at runtime, we
>  	 * need a few more assumptions to hold true:
> @@ -1287,9 +1289,30 @@ static bool mhp_supports_memmap_on_memory(unsigned long size)
>  	 *       altmap as an alternative source of memory, and we do not exactly
>  	 *       populate a single PMD.
>  	 */
> -	return mhp_memmap_on_memory() &&
> -		size == memory_block_size_bytes() &&
> -		arch_supports_memmap_on_memory(size);
> +	if (!mhp_memmap_on_memory() || size != memory_block_size_bytes())
> +		return false;
> +	 /*
> +	  * Without page reservation remaining pages should be pageblock aligned.
> +	  */
> +	if (!IS_ENABLED(CONFIG_MHP_RESERVE_PAGES_MEMMAP_ON_MEMORY) &&
> +	    !IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)))
> +		return false;
> +
> +	return arch_supports_memmap_on_memory(size);
> +}
> +
> +static inline unsigned long memory_block_align_base(unsigned long size)
> +{
> +	if (IS_ENABLED(CONFIG_MHP_RESERVE_PAGES_MEMMAP_ON_MEMORY)) {
> +		unsigned long align;
> +		unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
> +		unsigned long vmemmap_size;
> +
> +		vmemmap_size = (nr_vmemmap_pages * sizeof(struct page)) >> PAGE_SHIFT;

DIV_ROUND_UP()?

> +		align = pageblock_align(vmemmap_size) - vmemmap_size;
> +		return align;
> +	} else
> +		return 0;
>  }
>  
>  /*
> @@ -1302,7 +1325,11 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
>  {
>  	struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
>  	enum memblock_flags memblock_flags = MEMBLOCK_NONE;
> -	struct vmem_altmap mhp_altmap = {};
> +	struct vmem_altmap mhp_altmap = {
> +		.base_pfn =  PHYS_PFN(res->start),
> +		.end_pfn  =  PHYS_PFN(res->end),
> +		.reserve  = memory_block_align_base(resource_size(res)),
> +	};
>  	struct memory_group *group = NULL;
>  	u64 start, size;
>  	bool new_node = false;
> @@ -1347,8 +1374,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
>  	 */
>  	if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
>  		if (mhp_supports_memmap_on_memory(size)) {
> -			mhp_altmap.free = PHYS_PFN(size);
> -			mhp_altmap.base_pfn = PHYS_PFN(start);
> +			mhp_altmap.free = PHYS_PFN(size) - mhp_altmap.reserve;
>  			params.altmap = &mhp_altmap;
>  		}
>  		/* fallback to not using altmap  */
> @@ -1360,7 +1386,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
>  		goto error;
>  
>  	/* create memory block devices after memory was added */
> -	ret = create_memory_block_devices(start, size, mhp_altmap.alloc,
> +	ret = create_memory_block_devices(start, size, mhp_altmap.alloc + mhp_altmap.reserve,
>  					  group);
>  	if (ret) {
>  		arch_remove_memory(start, size, NULL);
> @@ -2260,3 +2286,16 @@ int offline_and_remove_memory(u64 start, u64 size)
>  }
>  EXPORT_SYMBOL_GPL(offline_and_remove_memory);
>  #endif /* CONFIG_MEMORY_HOTREMOVE */
> +
> +static int __init memory_hotplug_init(void)
> +{
> +
> +	if (IS_ENABLED(CONFIG_MHP_RESERVE_PAGES_MEMMAP_ON_MEMORY) &&
> +	    mhp_memmap_on_memory()) {
> +		pr_info("Memory hotplug will reserve %ld pages in each memory block\n",
> +			memory_block_align_base(memory_block_size_bytes()));
> +
> +	}
> +	return 0;
> +}
> +module_init(memory_hotplug_init);
Aneesh Kumar K.V July 11, 2023, 8:20 a.m. UTC | #2
On 7/11/23 11:51 AM, Huang, Ying wrote:
> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
> 
>> Add a new kconfig option that can be selected if we want to allow
>> pageblock alignment by reserving pages in the vmemmap altmap area.
>> This implies we will be reserving some pages for every memoryblock
>> This also allows the memmap on memory feature to be widely useful
>> with different memory block size values.
>>
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>>  mm/Kconfig          |  9 +++++++
>>  mm/memory_hotplug.c | 59 +++++++++++++++++++++++++++++++++++++--------
>>  2 files changed, 58 insertions(+), 10 deletions(-)
>>
>> diff --git a/mm/Kconfig b/mm/Kconfig
>> index 932349271e28..88a1472b2086 100644
>> --- a/mm/Kconfig
>> +++ b/mm/Kconfig
>> @@ -570,6 +570,15 @@ config MHP_MEMMAP_ON_MEMORY
>>  	depends on MEMORY_HOTPLUG && SPARSEMEM_VMEMMAP
>>  	depends on ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
>>  
>> +config MHP_RESERVE_PAGES_MEMMAP_ON_MEMORY
>> +       bool "Allow Reserving pages for page block aligment"
>> +       depends on MHP_MEMMAP_ON_MEMORY
>> +       help
>> +	This option allows memmap on memory feature to be more useful
>> +	with different memory block sizes. This is achieved by marking some pages
>> +	in each memory block as reserved so that we can get page-block alignment
>> +	for the remaining pages.
>> +
>>  endif # MEMORY_HOTPLUG
>>  
>>  config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
>> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>> index 07c99b0cc371..f36aec1f7626 100644
>> --- a/mm/memory_hotplug.c
>> +++ b/mm/memory_hotplug.c
>> @@ -1252,15 +1252,17 @@ static inline bool arch_supports_memmap_on_memory(unsigned long size)
>>  {
>>  	unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
>>  	unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
>> -	unsigned long remaining_size = size - vmemmap_size;
>>  
>> -	return IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
>> -		IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
>> +	return IS_ALIGNED(vmemmap_size, PMD_SIZE);
>>  }
>>  #endif
>>  
>>  static bool mhp_supports_memmap_on_memory(unsigned long size)
>>  {
>> +	unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
>> +	unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
>> +	unsigned long remaining_size = size - vmemmap_size;
>> +
>>  	/*
>>  	 * Besides having arch support and the feature enabled at runtime, we
>>  	 * need a few more assumptions to hold true:
>> @@ -1287,9 +1289,30 @@ static bool mhp_supports_memmap_on_memory(unsigned long size)
>>  	 *       altmap as an alternative source of memory, and we do not exactly
>>  	 *       populate a single PMD.
>>  	 */
>> -	return mhp_memmap_on_memory() &&
>> -		size == memory_block_size_bytes() &&
>> -		arch_supports_memmap_on_memory(size);
>> +	if (!mhp_memmap_on_memory() || size != memory_block_size_bytes())
>> +		return false;
>> +	 /*
>> +	  * Without page reservation remaining pages should be pageblock aligned.
>> +	  */
>> +	if (!IS_ENABLED(CONFIG_MHP_RESERVE_PAGES_MEMMAP_ON_MEMORY) &&
>> +	    !IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)))
>> +		return false;
>> +
>> +	return arch_supports_memmap_on_memory(size);
>> +}
>> +
>> +static inline unsigned long memory_block_align_base(unsigned long size)
>> +{
>> +	if (IS_ENABLED(CONFIG_MHP_RESERVE_PAGES_MEMMAP_ON_MEMORY)) {
>> +		unsigned long align;
>> +		unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
>> +		unsigned long vmemmap_size;
>> +
>> +		vmemmap_size = (nr_vmemmap_pages * sizeof(struct page)) >> PAGE_SHIFT;
> 
> DIV_ROUND_UP()?
> 
>

yes. Will update.

		vmemmap_size = DIV_ROUND_UP(nr_vmemmap_pages * sizeof(struct page), PAGE_SIZE);

-aneesh
David Hildenbrand July 11, 2023, 5:19 p.m. UTC | #3
On 11.07.23 06:48, Aneesh Kumar K.V wrote:
> Add a new kconfig option that can be selected if we want to allow
> pageblock alignment by reserving pages in the vmemmap altmap area.
> This implies we will be reserving some pages for every memoryblock
> This also allows the memmap on memory feature to be widely useful
> with different memory block size values.

"reserving pages" is a nice way of saying "wasting memory". :) Let's 
spell that out.

I think we have to find a better name for this, and I think we should 
have a toggle similar to memory_hotplug.memmap_on_memory. This should be 
an admin decision, not some kernel config option.


memory_hotplug.force_memmap_on_memory

"Enable the memmap on memory feature even if it could result in memory 
waste due to memmap size limitations. For example, if the memmap for a 
memory block requires 1 MiB, but the pageblock size is 2 MiB, 1 MiB
of hotplugged memory will be wasted. Note that there are still cases 
where the feature cannot be enforced: for example, if the memmap is 
smaller than a single page, or if the architecture does not support the 
forced mode in all configurations."

Thoughts?

> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>   mm/Kconfig          |  9 +++++++
>   mm/memory_hotplug.c | 59 +++++++++++++++++++++++++++++++++++++--------
>   2 files changed, 58 insertions(+), 10 deletions(-)
> 
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 932349271e28..88a1472b2086 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -570,6 +570,15 @@ config MHP_MEMMAP_ON_MEMORY
>   	depends on MEMORY_HOTPLUG && SPARSEMEM_VMEMMAP
>   	depends on ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
>   
> +config MHP_RESERVE_PAGES_MEMMAP_ON_MEMORY
> +       bool "Allow Reserving pages for page block aligment"
> +       depends on MHP_MEMMAP_ON_MEMORY
> +       help
> +	This option allows memmap on memory feature to be more useful
> +	with different memory block sizes. This is achieved by marking some pages
> +	in each memory block as reserved so that we can get page-block alignment
> +	for the remaining pages.
> +



>   endif # MEMORY_HOTPLUG
>   
>   config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 07c99b0cc371..f36aec1f7626 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1252,15 +1252,17 @@ static inline bool arch_supports_memmap_on_memory(unsigned long size)
>   {
>   	unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
>   	unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
> -	unsigned long remaining_size = size - vmemmap_size;
>   
> -	return IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
> -		IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
> +	return IS_ALIGNED(vmemmap_size, PMD_SIZE);
>   }
>   #endif
>   
>   static bool mhp_supports_memmap_on_memory(unsigned long size)
>   {
> +	unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
> +	unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
> +	unsigned long remaining_size = size - vmemmap_size;
> +
>   	/*
>   	 * Besides having arch support and the feature enabled at runtime, we
>   	 * need a few more assumptions to hold true:
> @@ -1287,9 +1289,30 @@ static bool mhp_supports_memmap_on_memory(unsigned long size)
>   	 *       altmap as an alternative source of memory, and we do not exactly
>   	 *       populate a single PMD.
>   	 */
> -	return mhp_memmap_on_memory() &&
> -		size == memory_block_size_bytes() &&
> -		arch_supports_memmap_on_memory(size);
> +	if (!mhp_memmap_on_memory() || size != memory_block_size_bytes())
> +		return false;
> +	 /*
> +	  * Without page reservation remaining pages should be pageblock aligned.
> +	  */
> +	if (!IS_ENABLED(CONFIG_MHP_RESERVE_PAGES_MEMMAP_ON_MEMORY) &&
> +	    !IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)))
> +		return false;
> +
> +	return arch_supports_memmap_on_memory(size);
> +}
> +
> +static inline unsigned long memory_block_align_base(unsigned long size)
> +{
> +	if (IS_ENABLED(CONFIG_MHP_RESERVE_PAGES_MEMMAP_ON_MEMORY)) {
> +		unsigned long align;
> +		unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
> +		unsigned long vmemmap_size;
> +
> +		vmemmap_size = (nr_vmemmap_pages * sizeof(struct page)) >> PAGE_SHIFT;
> +		align = pageblock_align(vmemmap_size) - vmemmap_size;

We should probably have a helper to calculate

a) the unaligned vmemmap size, for example used in 
arch_supports_memmap_on_memory()

b) the pageblock-aligned vmemmap size.
Aneesh Kumar K.V July 12, 2023, 3:16 a.m. UTC | #4
On 7/11/23 10:49 PM, David Hildenbrand wrote:
> On 11.07.23 06:48, Aneesh Kumar K.V wrote:
>> Add a new kconfig option that can be selected if we want to allow
>> pageblock alignment by reserving pages in the vmemmap altmap area.
>> This implies we will be reserving some pages for every memoryblock
>> This also allows the memmap on memory feature to be widely useful
>> with different memory block size values.
> 
> "reserving pages" is a nice way of saying "wasting memory". :) Let's spell that out.
> 
> I think we have to find a better name for this, and I think we should have a toggle similar to memory_hotplug.memmap_on_memory. This should be an admin decision, not some kernel config option.
> 
> 
> memory_hotplug.force_memmap_on_memory
> 
> "Enable the memmap on memory feature even if it could result in memory waste due to memmap size limitations. For example, if the memmap for a memory block requires 1 MiB, but the pageblock size is 2 MiB, 1 MiB
> of hotplugged memory will be wasted. Note that there are still cases where the feature cannot be enforced: for example, if the memmap is smaller than a single page, or if the architecture does not support the forced mode in all configurations."
> 
> Thoughts?
> 

With module parameter, do we still need the Kconfig option? 

-aneesh
David Hildenbrand July 12, 2023, 7:22 a.m. UTC | #5
On 12.07.23 05:16, Aneesh Kumar K V wrote:
> On 7/11/23 10:49 PM, David Hildenbrand wrote:
>> On 11.07.23 06:48, Aneesh Kumar K.V wrote:
>>> Add a new kconfig option that can be selected if we want to allow
>>> pageblock alignment by reserving pages in the vmemmap altmap area.
>>> This implies we will be reserving some pages for every memoryblock
>>> This also allows the memmap on memory feature to be widely useful
>>> with different memory block size values.
>>
>> "reserving pages" is a nice way of saying "wasting memory". :) Let's spell that out.
>>
>> I think we have to find a better name for this, and I think we should have a toggle similar to memory_hotplug.memmap_on_memory. This should be an admin decision, not some kernel config option.
>>
>>
>> memory_hotplug.force_memmap_on_memory
>>
>> "Enable the memmap on memory feature even if it could result in memory waste due to memmap size limitations. For example, if the memmap for a memory block requires 1 MiB, but the pageblock size is 2 MiB, 1 MiB
>> of hotplugged memory will be wasted. Note that there are still cases where the feature cannot be enforced: for example, if the memmap is smaller than a single page, or if the architecture does not support the forced mode in all configurations."
>>
>> Thoughts?
>>
> 
> With module parameter, do we still need the Kconfig option?

No.

Sleeping over this, maybe we can convert the existing 
memory_hotplug.memmap_on_memory parameter to also accept "force".
Aneesh Kumar K.V July 12, 2023, 1:50 p.m. UTC | #6
David Hildenbrand <david@redhat.com> writes:

> On 12.07.23 05:16, Aneesh Kumar K V wrote:
>> On 7/11/23 10:49 PM, David Hildenbrand wrote:
>>> On 11.07.23 06:48, Aneesh Kumar K.V wrote:
>>>> Add a new kconfig option that can be selected if we want to allow
>>>> pageblock alignment by reserving pages in the vmemmap altmap area.
>>>> This implies we will be reserving some pages for every memoryblock
>>>> This also allows the memmap on memory feature to be widely useful
>>>> with different memory block size values.
>>>
>>> "reserving pages" is a nice way of saying "wasting memory". :) Let's spell that out.
>>>
>>> I think we have to find a better name for this, and I think we should have a toggle similar to memory_hotplug.memmap_on_memory. This should be an admin decision, not some kernel config option.
>>>
>>>
>>> memory_hotplug.force_memmap_on_memory
>>>
>>> "Enable the memmap on memory feature even if it could result in memory waste due to memmap size limitations. For example, if the memmap for a memory block requires 1 MiB, but the pageblock size is 2 MiB, 1 MiB
>>> of hotplugged memory will be wasted. Note that there are still cases where the feature cannot be enforced: for example, if the memmap is smaller than a single page, or if the architecture does not support the forced mode in all configurations."
>>>
>>> Thoughts?
>>>
>> 
>> With module parameter, do we still need the Kconfig option?
>
> No.
>
> Sleeping over this, maybe we can convert the existing 
> memory_hotplug.memmap_on_memory parameter to also accept "force".
>

How about this? 

modified   mm/memory_hotplug.c
@@ -45,13 +45,67 @@
 /*
  * memory_hotplug.memmap_on_memory parameter
  */
-static bool memmap_on_memory __ro_after_init;
-module_param(memmap_on_memory, bool, 0444);
-MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
+enum {
+	MEMMAP_ON_MEMORY_DISABLE = 0,
+	MEMMAP_ON_MEMORY_ENABLE,
+	FORCE_MEMMAP_ON_MEMORY,
+};
+static int memmap_mode __read_mostly = MEMMAP_ON_MEMORY_DISABLE;
+static const char *memmap_on_memory_to_str[] = {
+	[MEMMAP_ON_MEMORY_DISABLE]  = "disable",
+	[MEMMAP_ON_MEMORY_ENABLE]   = "enable",
+	[FORCE_MEMMAP_ON_MEMORY]    = "force",
+};
+
+static inline unsigned long memory_block_align_base(unsigned long size)
+{
+	if (memmap_mode == FORCE_MEMMAP_ON_MEMORY) {
+		unsigned long align;
+		unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
+		unsigned long vmemmap_size;
+
+		vmemmap_size = DIV_ROUND_UP(nr_vmemmap_pages * sizeof(struct page), PAGE_SIZE);
+		align = pageblock_align(vmemmap_size) - vmemmap_size;
+		return align;
+	} else
+		return 0;
+}
+
+static int set_memmap_mode(const char *val, const struct kernel_param *kp)
+{
+	int ret = sysfs_match_string(memmap_on_memory_to_str, val);
+
+	if (ret < 0)
+		return ret;
+	*((int *)kp->arg) = ret;
+	if (ret == FORCE_MEMMAP_ON_MEMORY) {
+		pr_info("Memory hotplug will reserve %ld pages in each memory block\n",
+			memory_block_align_base(memory_block_size_bytes()));
+	}
+	return 0;
+}
+
+static int get_memmap_mode(char *buffer, const struct kernel_param *kp)
+{
+	return sprintf(buffer, "%s\n", memmap_on_memory_to_str[*((int *)kp->arg)]);
+}
+
+static const struct kernel_param_ops memmap_mode_ops = {
+	.set = set_memmap_mode,
+	.get = get_memmap_mode,
+};
+module_param_cb(memmap_on_memory, &memmap_mode_ops, &memmap_mode, 0644);
+MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug\n"
+	"With value \"force\" it could result in memory waste due to memmap size limitations \n"
+	"For example, if the memmap for a memory block requires 1 MiB, but the pageblock \n"
+	"size is 2 MiB, 1 MiB of hotplugged memory will be wasted. Note that there are \n"
+	"still cases where the feature cannot be enforced: for example, if the memmap is \n"
+	"smaller than a single page, or if the architecture does not support the forced \n"
+	"mode in all configurations. (disable/enable/force)");
 
 static inline bool mhp_memmap_on_memory(void)
 {
-	return memmap_on_memory;
+	return !!memmap_mode;
 }
 #else

We can also enable runtime enable/disable/force the feature. We just
need to make sure on try_remove_memory we lookup for altmap correctly.
David Hildenbrand July 12, 2023, 7:06 p.m. UTC | #7
On 12.07.23 15:50, Aneesh Kumar K.V wrote:
> David Hildenbrand <david@redhat.com> writes:
> 
>> On 12.07.23 05:16, Aneesh Kumar K V wrote:
>>> On 7/11/23 10:49 PM, David Hildenbrand wrote:
>>>> On 11.07.23 06:48, Aneesh Kumar K.V wrote:
>>>>> Add a new kconfig option that can be selected if we want to allow
>>>>> pageblock alignment by reserving pages in the vmemmap altmap area.
>>>>> This implies we will be reserving some pages for every memoryblock
>>>>> This also allows the memmap on memory feature to be widely useful
>>>>> with different memory block size values.
>>>>
>>>> "reserving pages" is a nice way of saying "wasting memory". :) Let's spell that out.
>>>>
>>>> I think we have to find a better name for this, and I think we should have a toggle similar to memory_hotplug.memmap_on_memory. This should be an admin decision, not some kernel config option.
>>>>
>>>>
>>>> memory_hotplug.force_memmap_on_memory
>>>>
>>>> "Enable the memmap on memory feature even if it could result in memory waste due to memmap size limitations. For example, if the memmap for a memory block requires 1 MiB, but the pageblock size is 2 MiB, 1 MiB
>>>> of hotplugged memory will be wasted. Note that there are still cases where the feature cannot be enforced: for example, if the memmap is smaller than a single page, or if the architecture does not support the forced mode in all configurations."
>>>>
>>>> Thoughts?
>>>>
>>>
>>> With module parameter, do we still need the Kconfig option?
>>
>> No.
>>
>> Sleeping over this, maybe we can convert the existing
>> memory_hotplug.memmap_on_memory parameter to also accept "force".
>>
> 
> How about this?
> 
> modified   mm/memory_hotplug.c
> @@ -45,13 +45,67 @@
>   /*
>    * memory_hotplug.memmap_on_memory parameter
>    */
> -static bool memmap_on_memory __ro_after_init;
> -module_param(memmap_on_memory, bool, 0444);
> -MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
> +enum {
> +	MEMMAP_ON_MEMORY_DISABLE = 0,
> +	MEMMAP_ON_MEMORY_ENABLE,
> +	FORCE_MEMMAP_ON_MEMORY,

MEMMAP_ON_MEMORY_FORCE ?

> +};
> +static int memmap_mode __read_mostly = MEMMAP_ON_MEMORY_DISABLE;
> +static const char *memmap_on_memory_to_str[] = {
> +	[MEMMAP_ON_MEMORY_DISABLE]  = "disable",
> +	[MEMMAP_ON_MEMORY_ENABLE]   = "enable",
> +	[FORCE_MEMMAP_ON_MEMORY]    = "force",
> +};
> +
> +static inline unsigned long memory_block_align_base(unsigned long size)
> +{
> +	if (memmap_mode == FORCE_MEMMAP_ON_MEMORY) {
> +		unsigned long align;
> +		unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
> +		unsigned long vmemmap_size;
> +
> +		vmemmap_size = DIV_ROUND_UP(nr_vmemmap_pages * sizeof(struct page), PAGE_SIZE);
> +		align = pageblock_align(vmemmap_size) - vmemmap_size;
> +		return align;
> +	} else
> +		return 0;

^ have to see that in action :)

> +}
> +
> +static int set_memmap_mode(const char *val, const struct kernel_param *kp)
> +{
> +	int ret = sysfs_match_string(memmap_on_memory_to_str, val);
> +

That would break existing cmdlines that eat Y/N/0/..., no?

Maybe try parsing "force/FORCE" first and then fallback to the common 
bool parsing (kstrtobool).

Same when printing: handle "force" separately and then just print Y/N 
like param_get_bool() used to do.

So you'd end up with Y/N/FORCE as output and Y/N/0/.../FORCE/force as input.

But I'm happy to hear about alternatives. Maybe a second parameter is 
better ... but what name should it have "memmap_on_memory_force" sounds 
wrong. We'd need a name that expresses that we might be wasting memory, 
hm ...

> +	if (ret < 0)
> +		return ret;
> +	*((int *)kp->arg) = ret;
> +	if (ret == FORCE_MEMMAP_ON_MEMORY) {
> +		pr_info("Memory hotplug will reserve %ld pages in each memory block\n",
> +			memory_block_align_base(memory_block_size_bytes()));
> +	}
> +	return 0;
> +}
> +
> +static int get_memmap_mode(char *buffer, const struct kernel_param *kp)
> +{
> +	return sprintf(buffer, "%s\n", memmap_on_memory_to_str[*((int *)kp->arg)]);
> +}
> +
> +static const struct kernel_param_ops memmap_mode_ops = {
> +	.set = set_memmap_mode,
> +	.get = get_memmap_mode,
> +};
> +module_param_cb(memmap_on_memory, &memmap_mode_ops, &memmap_mode, 0644);
> +MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug\n"
> +	"With value \"force\" it could result in memory waste due to memmap size limitations \n"
> +	"For example, if the memmap for a memory block requires 1 MiB, but the pageblock \n"
> +	"size is 2 MiB, 1 MiB of hotplugged memory will be wasted. Note that there are \n"
> +	"still cases where the feature cannot be enforced: for example, if the memmap is \n"
> +	"smaller than a single page, or if the architecture does not support the forced \n"
> +	"mode in all configurations. (disable/enable/force)");
>   
>   static inline bool mhp_memmap_on_memory(void)
>   {
> -	return memmap_on_memory;
> +	return !!memmap_mode;
>   }
>   #else
> 
> We can also enable runtime enable/disable/force the feature. We just
> need to make sure on try_remove_memory we lookup for altmap correctly.
> 

Yes, that's already been asked for. But let's do that as a separate 
change later.
diff mbox series

Patch

diff --git a/mm/Kconfig b/mm/Kconfig
index 932349271e28..88a1472b2086 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -570,6 +570,15 @@  config MHP_MEMMAP_ON_MEMORY
 	depends on MEMORY_HOTPLUG && SPARSEMEM_VMEMMAP
 	depends on ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
 
+config MHP_RESERVE_PAGES_MEMMAP_ON_MEMORY
+       bool "Allow Reserving pages for page block aligment"
+       depends on MHP_MEMMAP_ON_MEMORY
+       help
+	This option allows memmap on memory feature to be more useful
+	with different memory block sizes. This is achieved by marking some pages
+	in each memory block as reserved so that we can get page-block alignment
+	for the remaining pages.
+
 endif # MEMORY_HOTPLUG
 
 config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 07c99b0cc371..f36aec1f7626 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1252,15 +1252,17 @@  static inline bool arch_supports_memmap_on_memory(unsigned long size)
 {
 	unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
 	unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
-	unsigned long remaining_size = size - vmemmap_size;
 
-	return IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
-		IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
+	return IS_ALIGNED(vmemmap_size, PMD_SIZE);
 }
 #endif
 
 static bool mhp_supports_memmap_on_memory(unsigned long size)
 {
+	unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
+	unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
+	unsigned long remaining_size = size - vmemmap_size;
+
 	/*
 	 * Besides having arch support and the feature enabled at runtime, we
 	 * need a few more assumptions to hold true:
@@ -1287,9 +1289,30 @@  static bool mhp_supports_memmap_on_memory(unsigned long size)
 	 *       altmap as an alternative source of memory, and we do not exactly
 	 *       populate a single PMD.
 	 */
-	return mhp_memmap_on_memory() &&
-		size == memory_block_size_bytes() &&
-		arch_supports_memmap_on_memory(size);
+	if (!mhp_memmap_on_memory() || size != memory_block_size_bytes())
+		return false;
+	 /*
+	  * Without page reservation remaining pages should be pageblock aligned.
+	  */
+	if (!IS_ENABLED(CONFIG_MHP_RESERVE_PAGES_MEMMAP_ON_MEMORY) &&
+	    !IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)))
+		return false;
+
+	return arch_supports_memmap_on_memory(size);
+}
+
+static inline unsigned long memory_block_align_base(unsigned long size)
+{
+	if (IS_ENABLED(CONFIG_MHP_RESERVE_PAGES_MEMMAP_ON_MEMORY)) {
+		unsigned long align;
+		unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
+		unsigned long vmemmap_size;
+
+		vmemmap_size = (nr_vmemmap_pages * sizeof(struct page)) >> PAGE_SHIFT;
+		align = pageblock_align(vmemmap_size) - vmemmap_size;
+		return align;
+	} else
+		return 0;
 }
 
 /*
@@ -1302,7 +1325,11 @@  int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 {
 	struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
 	enum memblock_flags memblock_flags = MEMBLOCK_NONE;
-	struct vmem_altmap mhp_altmap = {};
+	struct vmem_altmap mhp_altmap = {
+		.base_pfn =  PHYS_PFN(res->start),
+		.end_pfn  =  PHYS_PFN(res->end),
+		.reserve  = memory_block_align_base(resource_size(res)),
+	};
 	struct memory_group *group = NULL;
 	u64 start, size;
 	bool new_node = false;
@@ -1347,8 +1374,7 @@  int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 	 */
 	if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
 		if (mhp_supports_memmap_on_memory(size)) {
-			mhp_altmap.free = PHYS_PFN(size);
-			mhp_altmap.base_pfn = PHYS_PFN(start);
+			mhp_altmap.free = PHYS_PFN(size) - mhp_altmap.reserve;
 			params.altmap = &mhp_altmap;
 		}
 		/* fallback to not using altmap  */
@@ -1360,7 +1386,7 @@  int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 		goto error;
 
 	/* create memory block devices after memory was added */
-	ret = create_memory_block_devices(start, size, mhp_altmap.alloc,
+	ret = create_memory_block_devices(start, size, mhp_altmap.alloc + mhp_altmap.reserve,
 					  group);
 	if (ret) {
 		arch_remove_memory(start, size, NULL);
@@ -2260,3 +2286,16 @@  int offline_and_remove_memory(u64 start, u64 size)
 }
 EXPORT_SYMBOL_GPL(offline_and_remove_memory);
 #endif /* CONFIG_MEMORY_HOTREMOVE */
+
+static int __init memory_hotplug_init(void)
+{
+
+	if (IS_ENABLED(CONFIG_MHP_RESERVE_PAGES_MEMMAP_ON_MEMORY) &&
+	    mhp_memmap_on_memory()) {
+		pr_info("Memory hotplug will reserve %ld pages in each memory block\n",
+			memory_block_align_base(memory_block_size_bytes()));
+
+	}
+	return 0;
+}
+module_init(memory_hotplug_init);