diff mbox series

[v4,4/6] mm/hotplug: Allow pageblock alignment via altmap reservation

Message ID 20230718024409.95742-5-aneesh.kumar@linux.ibm.com (mailing list archive)
State New
Headers show
Series Add support for memmap on memory feature on ppc64 | expand

Commit Message

Aneesh Kumar K.V July 18, 2023, 2:44 a.m. UTC
Add a new kconfig option that can be selected if we want to allow
pageblock alignment by reserving pages in the vmemmap altmap area.
This implies we will be reserving some pages for every memoryblock
This also allows the memmap on memory feature to be widely useful
with different memory block size values.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 mm/memory_hotplug.c | 109 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 96 insertions(+), 13 deletions(-)

Comments

David Hildenbrand July 24, 2023, 2:33 p.m. UTC | #1
On 18.07.23 04:44, Aneesh Kumar K.V wrote:
> Add a new kconfig option that can be selected if we want to allow

That description seems outdated.

> pageblock alignment by reserving pages in the vmemmap altmap area.
> This implies we will be reserving some pages for every memoryblock
> This also allows the memmap on memory feature to be widely useful
> with different memory block size values.

Can you add some more meat to the description, and especially, in
which cases this might be desired and in which cases it might be
completely undesired?


Let's assume we hotplug a 1 GiB DIMM on arm64/64k. With 512 MiB pageblocks,
we'd waste 50% of the hotplugged memory.

Also, see below on the case where we could end up with 100% wasted memory,
which we want to block compeltely.


Also, I wonder if we can avoid talking about "page reservation" or "altmap reservation",
that's rather an implementation detail.

For example, I'd call this patch

"mm/hotplug: Support memmap_on_memory when memmap is not aligned to pageblocks"



> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>   mm/memory_hotplug.c | 109 ++++++++++++++++++++++++++++++++++++++------
>   1 file changed, 96 insertions(+), 13 deletions(-)
> 
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 5921c81fcb70..c409f5ff6a59 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -41,17 +41,85 @@
>   #include "internal.h"
>   #include "shuffle.h"
>   
> +enum {
> +	MEMMAP_ON_MEMORY_DISABLE = 0,
> +	MEMMAP_ON_MEMORY_ENABLE,
> +	MEMMAP_ON_MEMORY_FORCE,
> +};
> +
> +static int memmap_mode __read_mostly = MEMMAP_ON_MEMORY_DISABLE;
> +
> +static inline unsigned long memory_block_align_base(unsigned long size)
> +{

Can we start with something like this instead?

memory_block_memmap_size() might be reasonable to put into the previous patch.


static inline unsigned long memory_block_memmap_size(void)
{
	return PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page);
}

/*
  * In "forced" memmap_on_memory mode, we always align the vmemmap size up to cover
  * full pageblocks. That way, we can add memory even if the vmemmap size is not properly
  * aligned, however, we might waste memory.
  */
static inline unsigned long memory_block_memmap_on_memory_size(void)
{
	unsigned long size = memory_block_memmap_size();

	if (memmap_mode != MEMMAP_ON_MEMORY_FORCE)
		return size;
	return ALIGN(size, PFN_PHYS(pageblock_nr_pages));
}
	


> +	if (memmap_mode == MEMMAP_ON_MEMORY_FORCE) {
> +		unsigned long align;
> +		unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
> +		unsigned long vmemmap_size;
> +
> +		vmemmap_size = DIV_ROUND_UP(nr_vmemmap_pages * sizeof(struct page), PAGE_SIZE);
> +		align = pageblock_align(vmemmap_size) - vmemmap_size;
> +		return align;
> +	} else
> +		return 0;
> +}
> +
>   #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
>   /*
>    * memory_hotplug.memmap_on_memory parameter
>    */
> -static bool memmap_on_memory __ro_after_init;
> -module_param(memmap_on_memory, bool, 0444);
> -MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
> +static int set_memmap_mode(const char *val, const struct kernel_param *kp)
> +{
> +	int ret, mode;
> +	bool enabled;
> +
> +	if (sysfs_streq(val, "force") ||  sysfs_streq(val, "FORCE")) {
> +		mode =  MEMMAP_ON_MEMORY_FORCE;
> +		goto matched;
> +	}
> +
> +	ret = kstrtobool(val, &enabled);
> +	if (ret < 0)
> +		return ret;
> +	if (enabled)
> +		mode =  MEMMAP_ON_MEMORY_ENABLE;
> +	else
> +		mode =  MEMMAP_ON_MEMORY_DISABLE;
> +
> +matched:
> +	*((int *)kp->arg) =  mode;
> +	if (mode == MEMMAP_ON_MEMORY_FORCE) {
> +		pr_info("Memory hotplug will reserve %ld pages in each memory block\n",
> +			memory_block_align_base(memory_block_size_bytes()));
> +	}
> +	return 0;
> +}
> +
> +static int get_memmap_mode(char *buffer, const struct kernel_param *kp)
> +{
> +	if (*((int *)kp->arg) == MEMMAP_ON_MEMORY_FORCE)
> +		return sprintf(buffer,  "force\n");
> +	if (*((int *)kp->arg) == MEMMAP_ON_MEMORY_ENABLE)
> +		return sprintf(buffer,  "y\n");
> +
> +	return sprintf(buffer,  "n\n");

param_get_bool() uses uppercase Y / N. Maybe just return the uppercase variants here as well.

> +}
> +
> +static const struct kernel_param_ops memmap_mode_ops = {
> +	.set = set_memmap_mode,
> +	.get = get_memmap_mode,
> +};
> +module_param_cb(memmap_on_memory, &memmap_mode_ops, &memmap_mode, 0444);
> +MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug\n"
> +	"With value \"force\" it could result in memory wastage due to memmap size limitations \n"
> +	"For example, if the memmap for a memory block requires 1 MiB, but the pageblock \n"
> +	"size is 2 MiB, 1 MiB of hotplugged memory will be wasted. Note that there are \n"
> +	"still cases where the feature cannot be enforced: for example, if the memmap is \n"
> +	"smaller than a single page, or if the architecture does not support the forced \n"
> +	"mode in all configurations. (y/n/force)");

That's a bit mouthful. Can we simplify and put the full doc into

Documentation/admin-guide/mm/memory-hotplug.rst

?

>   
>   static inline bool mhp_memmap_on_memory(void)
>   {
> -	return memmap_on_memory;
> +	return !!memmap_mode;

Maybe better  "memmap_mode != MEMMAP_ON_MEMORY_DISABLE"

>   }
>   #else
>   static inline bool mhp_memmap_on_memory(void)
> @@ -1264,7 +1332,6 @@ static inline bool arch_supports_memmap_on_memory(unsigned long size)
>   
>   static bool mhp_supports_memmap_on_memory(unsigned long size)
>   {
> -
>   	unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
>   	unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
>   	unsigned long remaining_size = size - vmemmap_size;
> @@ -1295,10 +1362,23 @@ static bool mhp_supports_memmap_on_memory(unsigned long size)
>   	 *       altmap as an alternative source of memory, and we do not exactly
>   	 *       populate a single PMD.
>   	 */
> -	return mhp_memmap_on_memory() &&
> -	       size == memory_block_size_bytes() &&
> -	       IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)) &&
> -	       arch_supports_memmap_on_memory(size);
> +	if (!mhp_memmap_on_memory() || size != memory_block_size_bytes())
> +		return false;
> +
> +	/*
> +	 * Make sure the vmemmap allocation is fully contained
> +	 * so that we always allocate vmemmap memory from altmap area.
> +	 */
> +	if (!IS_ALIGNED(vmemmap_size,  PAGE_SIZE))
> +		return false;
> +	 /*
> +	  * Without page reservation remaining pages should be pageblock aligned.
> +	  */
> +	if (memmap_mode != MEMMAP_ON_MEMORY_FORCE &&
> +	    !IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)))
> +		return false;

With our new helper, this becomes the following:

memmap_on_memory_size = memory_block_memmap_on_memory_size();

if (!IS_ALIGNED(memmap_on_memory_size, PFN_PHYS(pageblock_nr_pages))
	/* We're not allowed to waste any memory for the memmap. */
	return false;

if (memmap_on_memory_size == memory_block_size_bytes())
	/* No effective hotplugged memory doesn't make sense. */
	return false;	

> +	return arch_supports_memmap_on_memory(size);
>   }
>   
>   /*
> @@ -1311,7 +1391,11 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
>   {
>   	struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
>   	enum memblock_flags memblock_flags = MEMBLOCK_NONE;
> -	struct vmem_altmap mhp_altmap = {};
> +	struct vmem_altmap mhp_altmap = {
> +		.base_pfn =  PHYS_PFN(res->start),
> +		.end_pfn  =  PHYS_PFN(res->end),
> +		.reserve  = memory_block_align_base(resource_size(res)),

Can you remind me why we have to set reserve here at all?

IOW, can't we simply set

.free = memory_block_memmap_on_memory_size();

end then pass

mhp_altmap.alloc + mhp_altmap.free

to create_memory_block_devices() instead?
Aneesh Kumar K.V July 24, 2023, 3:16 p.m. UTC | #2
On 7/24/23 8:03 PM, David Hildenbrand wrote:
> On 18.07.23 04:44, Aneesh Kumar K.V wrote:
>> Add a new kconfig option that can be selected if we want to allow
> 
> That description seems outdated.
> 


Will update

>> pageblock alignment by reserving pages in the vmemmap altmap area.
>> This implies we will be reserving some pages for every memoryblock
>> This also allows the memmap on memory feature to be widely useful
>> with different memory block size values.
> 
> Can you add some more meat to the description, and especially, in
> which cases this might be desired and in which cases it might be
> completely undesired?
> 
> 
> Let's assume we hotplug a 1 GiB DIMM on arm64/64k. With 512 MiB pageblocks,
> we'd waste 50% of the hotplugged memory.
> 
> Also, see below on the case where we could end up with 100% wasted memory,
> which we want to block compeltely.
> 
> 
> Also, I wonder if we can avoid talking about "page reservation" or "altmap reservation",
> that's rather an implementation detail.
> 
> For example, I'd call this patch
> 
> "mm/hotplug: Support memmap_on_memory when memmap is not aligned to pageblocks"
> 
> 

Ok will update

> 
>>
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>>   mm/memory_hotplug.c | 109 ++++++++++++++++++++++++++++++++++++++------
>>   1 file changed, 96 insertions(+), 13 deletions(-)
>>
>> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>> index 5921c81fcb70..c409f5ff6a59 100644
>> --- a/mm/memory_hotplug.c
>> +++ b/mm/memory_hotplug.c
>> @@ -41,17 +41,85 @@
>>   #include "internal.h"
>>   #include "shuffle.h"
>>   +enum {
>> +    MEMMAP_ON_MEMORY_DISABLE = 0,
>> +    MEMMAP_ON_MEMORY_ENABLE,
>> +    MEMMAP_ON_MEMORY_FORCE,
>> +};
>> +
>> +static int memmap_mode __read_mostly = MEMMAP_ON_MEMORY_DISABLE;
>> +
>> +static inline unsigned long memory_block_align_base(unsigned long size)
>> +{
> 
> Can we start with something like this instead?
> 
> memory_block_memmap_size() might be reasonable to put into the previous patch.
> 
> 
> static inline unsigned long memory_block_memmap_size(void)
> {
>     return PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page);
> }
> 
> /*
>  * In "forced" memmap_on_memory mode, we always align the vmemmap size up to cover
>  * full pageblocks. That way, we can add memory even if the vmemmap size is not properly
>  * aligned, however, we might waste memory.
>  */

I am finding that confusing. We do want things to be pageblock_nr_pages aligned both ways. 
With MEMMAP_ON_MEMORY_FORCE, we do that by allocating more space for memmap and
in the default case we do that by making sure only memory blocks of specific size supporting
that alignment can use MEMMAP_ON_MEMORY feature. 

> static inline unsigned long memory_block_memmap_on_memory_size(void)
> {
>     unsigned long size = memory_block_memmap_size();
> 
>     if (memmap_mode != MEMMAP_ON_MEMORY_FORCE)
>         return size;
>     return ALIGN(size, PFN_PHYS(pageblock_nr_pages));
> }
>     
> 
> 
>> +    if (memmap_mode == MEMMAP_ON_MEMORY_FORCE) {
>> +        unsigned long align;
>> +        unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
>> +        unsigned long vmemmap_size;
>> +
>> +        vmemmap_size = DIV_ROUND_UP(nr_vmemmap_pages * sizeof(struct page), PAGE_SIZE);
>> +        align = pageblock_align(vmemmap_size) - vmemmap_size;
>> +        return align;
>> +    } else
>> +        return 0;
>> +}
>> +
>>   #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
>>   /*
>>    * memory_hotplug.memmap_on_memory parameter
>>    */
>> -static bool memmap_on_memory __ro_after_init;
>> -module_param(memmap_on_memory, bool, 0444);
>> -MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
>> +static int set_memmap_mode(const char *val, const struct kernel_param *kp)
>> +{
>> +    int ret, mode;
>> +    bool enabled;
>> +
>> +    if (sysfs_streq(val, "force") ||  sysfs_streq(val, "FORCE")) {
>> +        mode =  MEMMAP_ON_MEMORY_FORCE;
>> +        goto matched;
>> +    }
>> +
>> +    ret = kstrtobool(val, &enabled);
>> +    if (ret < 0)
>> +        return ret;
>> +    if (enabled)
>> +        mode =  MEMMAP_ON_MEMORY_ENABLE;
>> +    else
>> +        mode =  MEMMAP_ON_MEMORY_DISABLE;
>> +
>> +matched:
>> +    *((int *)kp->arg) =  mode;
>> +    if (mode == MEMMAP_ON_MEMORY_FORCE) {
>> +        pr_info("Memory hotplug will reserve %ld pages in each memory block\n",
>> +            memory_block_align_base(memory_block_size_bytes()));
>> +    }
>> +    return 0;
>> +}
>> +
>> +static int get_memmap_mode(char *buffer, const struct kernel_param *kp)
>> +{
>> +    if (*((int *)kp->arg) == MEMMAP_ON_MEMORY_FORCE)
>> +        return sprintf(buffer,  "force\n");
>> +    if (*((int *)kp->arg) == MEMMAP_ON_MEMORY_ENABLE)
>> +        return sprintf(buffer,  "y\n");
>> +
>> +    return sprintf(buffer,  "n\n");
> 
> param_get_bool() uses uppercase Y / N. Maybe just return the uppercase variants here as well.
> 
>> +}
>> +
>> +static const struct kernel_param_ops memmap_mode_ops = {
>> +    .set = set_memmap_mode,
>> +    .get = get_memmap_mode,
>> +};
>> +module_param_cb(memmap_on_memory, &memmap_mode_ops, &memmap_mode, 0444);
>> +MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug\n"
>> +    "With value \"force\" it could result in memory wastage due to memmap size limitations \n"
>> +    "For example, if the memmap for a memory block requires 1 MiB, but the pageblock \n"
>> +    "size is 2 MiB, 1 MiB of hotplugged memory will be wasted. Note that there are \n"
>> +    "still cases where the feature cannot be enforced: for example, if the memmap is \n"
>> +    "smaller than a single page, or if the architecture does not support the forced \n"
>> +    "mode in all configurations. (y/n/force)");
> 
> That's a bit mouthful. Can we simplify and put the full doc into
> 
> Documentation/admin-guide/mm/memory-hotplug.rst
> 
> ?


Will update

> 
>>     static inline bool mhp_memmap_on_memory(void)
>>   {
>> -    return memmap_on_memory;
>> +    return !!memmap_mode;
> 
> Maybe better  "memmap_mode != MEMMAP_ON_MEMORY_DISABLE"
> 

Will update

>>   }
>>   #else
>>   static inline bool mhp_memmap_on_memory(void)
>> @@ -1264,7 +1332,6 @@ static inline bool arch_supports_memmap_on_memory(unsigned long size)
>>     static bool mhp_supports_memmap_on_memory(unsigned long size)
>>   {
>> -
>>       unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
>>       unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
>>       unsigned long remaining_size = size - vmemmap_size;
>> @@ -1295,10 +1362,23 @@ static bool mhp_supports_memmap_on_memory(unsigned long size)
>>        *       altmap as an alternative source of memory, and we do not exactly
>>        *       populate a single PMD.
>>        */
>> -    return mhp_memmap_on_memory() &&
>> -           size == memory_block_size_bytes() &&
>> -           IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)) &&
>> -           arch_supports_memmap_on_memory(size);
>> +    if (!mhp_memmap_on_memory() || size != memory_block_size_bytes())
>> +        return false;
>> +
>> +    /*
>> +     * Make sure the vmemmap allocation is fully contained
>> +     * so that we always allocate vmemmap memory from altmap area.
>> +     */
>> +    if (!IS_ALIGNED(vmemmap_size,  PAGE_SIZE))
>> +        return false;
>> +     /*
>> +      * Without page reservation remaining pages should be pageblock aligned.
>> +      */
>> +    if (memmap_mode != MEMMAP_ON_MEMORY_FORCE &&
>> +        !IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)))
>> +        return false;
> 
> With our new helper, this becomes the following:
> 
> memmap_on_memory_size = memory_block_memmap_on_memory_size();
> 
> if (!IS_ALIGNED(memmap_on_memory_size, PFN_PHYS(pageblock_nr_pages))
>     /* We're not allowed to waste any memory for the memmap. */
>     return false;
> 
> if (memmap_on_memory_size == memory_block_size_bytes())
>     /* No effective hotplugged memory doesn't make sense. */
>     return false;   
> 

Wil update

>> +    return arch_supports_memmap_on_memory(size);
>>   }
>>     /*
>> @@ -1311,7 +1391,11 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
>>   {
>>       struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
>>       enum memblock_flags memblock_flags = MEMBLOCK_NONE;
>> -    struct vmem_altmap mhp_altmap = {};
>> +    struct vmem_altmap mhp_altmap = {
>> +        .base_pfn =  PHYS_PFN(res->start),
>> +        .end_pfn  =  PHYS_PFN(res->end),
>> +        .reserve  = memory_block_align_base(resource_size(res)),
> 
> Can you remind me why we have to set reserve here at all?
> 
> IOW, can't we simply set
> 
> .free = memory_block_memmap_on_memory_size();
> 
> end then pass
> 
> mhp_altmap.alloc + mhp_altmap.free
> 
> to create_memory_block_devices() instead?
> 

But with the dax usage of altmap, altmap->reserve is what we use to reserve things to get
the required alignment. One difference is where we allocate the struct page at. For this specific
case it should not matter. 

static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap)
{
	return altmap->base_pfn + altmap->reserve + altmap->alloc
		+ altmap->align;
}

And other is where we online a memory block

We find the start pfn using mem->altmap->alloc + mem->altmap->reserve;  

Considering altmap->reserve is what dax pfn_dev use, is there a reason you want to use altmap->free for this? 
I find it confusing to update free when we haven't allocated any altmap blocks yet.

-aneesh
David Hildenbrand July 24, 2023, 3:41 p.m. UTC | #3
On 24.07.23 17:16, Aneesh Kumar K V wrote:

>>
>> /*
>>   * In "forced" memmap_on_memory mode, we always align the vmemmap size up to cover
>>   * full pageblocks. That way, we can add memory even if the vmemmap size is not properly
>>   * aligned, however, we might waste memory.
>>   */
> 
> I am finding that confusing. We do want things to be pageblock_nr_pages aligned both ways.
> With MEMMAP_ON_MEMORY_FORCE, we do that by allocating more space for memmap and
> in the default case we do that by making sure only memory blocks of specific size supporting
> that alignment can use MEMMAP_ON_MEMORY feature.

See the usage inm hp_supports_memmap_on_memory(), I guess that makes 
sense then.

But if you have any ideas on how to clarify that (terminology), I'm all 
ears!

[...]

>>> +    return arch_supports_memmap_on_memory(size);
>>>    }
>>>      /*
>>> @@ -1311,7 +1391,11 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
>>>    {
>>>        struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
>>>        enum memblock_flags memblock_flags = MEMBLOCK_NONE;
>>> -    struct vmem_altmap mhp_altmap = {};
>>> +    struct vmem_altmap mhp_altmap = {
>>> +        .base_pfn =  PHYS_PFN(res->start),
>>> +        .end_pfn  =  PHYS_PFN(res->end),
>>> +        .reserve  = memory_block_align_base(resource_size(res)),
>>
>> Can you remind me why we have to set reserve here at all?
>>
>> IOW, can't we simply set
>>
>> .free = memory_block_memmap_on_memory_size();
>>
>> end then pass
>>
>> mhp_altmap.alloc + mhp_altmap.free
>>
>> to create_memory_block_devices() instead?
>>
> 
> But with the dax usage of altmap, altmap->reserve is what we use to reserve things to get
> the required alignment. One difference is where we allocate the struct page at. For this specific
> case it should not matter.
> 
> static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap)
> {
> 	return altmap->base_pfn + altmap->reserve + altmap->alloc
> 		+ altmap->align;
> }
> 
> And other is where we online a memory block
> 
> We find the start pfn using mem->altmap->alloc + mem->altmap->reserve;
> 
> Considering altmap->reserve is what dax pfn_dev use, is there a reason you want to use altmap->free for this?

"Reserve" is all about "reserving that much memory for driver usage".

We don't care about that. We simply want vmemmap allocations coming from 
the pageblock(s) we set aside. Where exactly, we don't care.

> I find it confusing to update free when we haven't allocated any altmap blocks yet.

"
@reserve: pages mapped, but reserved for driver use (relative to @base)"
@free: free pages set aside in the mapping for memmap storage
@alloc: track pages consumed, private to vmemmap_populate()
"

To me, that implies that we can ignore "reserve". We set @free to the 
aligned value and let the vmemmap get allocated from anything in there.

free + alloc should always sum up to our set-aside pageblock(s), no?
Aneesh Kumar K.V July 24, 2023, 4:02 p.m. UTC | #4
On 7/24/23 9:11 PM, David Hildenbrand wrote:
> On 24.07.23 17:16, Aneesh Kumar K V wrote:
> 
>>>
>>> /*
>>>   * In "forced" memmap_on_memory mode, we always align the vmemmap size up to cover
>>>   * full pageblocks. That way, we can add memory even if the vmemmap size is not properly
>>>   * aligned, however, we might waste memory.
>>>   */
>>
>> I am finding that confusing. We do want things to be pageblock_nr_pages aligned both ways.
>> With MEMMAP_ON_MEMORY_FORCE, we do that by allocating more space for memmap and
>> in the default case we do that by making sure only memory blocks of specific size supporting
>> that alignment can use MEMMAP_ON_MEMORY feature.
> 
> See the usage inm hp_supports_memmap_on_memory(), I guess that makes sense then.
> 
> But if you have any ideas on how to clarify that (terminology), I'm all ears!
> 


I updated the commit message 

mm/hotplug: Support memmap_on_memory when memmap is not aligned to pageblocks

Currently, memmap_on_memory feature is only supported with memory block
sizes that result in vmemmap pages covering full page blocks. This is
because memory onlining/offlining code requires applicable ranges to be
pageblock-aligned, for example, to set the migratetypes properly.

This patch helps to lift that restriction by reserving more pages than
required for vmemmap space. This helps to align the start addr to be
page block aligned with different memory block sizes. This implies the
kernel will be reserving some pages for every memoryblock. This also
allows the memmap on memory feature to be widely useful with different
memory block size values.

For ex: with 64K page size and 256MiB memory block size, we require 4
pages to map vmemmap pages, To align things correctly we end up adding a
reserve of 28 pages. ie, for every 4096 pages 28 pages get reserved.


Also while implementing your  suggestion to use memory_block_memmap_on_memory_size()
I am finding it not really useful because in mhp_supports_memmap_on_memory() we are checking
if remaining_size is pageblock_nr_pages aligned (dax_kmem may want to use that helper
later). Also I still think altmap.reserve is easier because of the start_pfn calculation.
(more on this below)



> [...]
> 
>>>> +    return arch_supports_memmap_on_memory(size);
>>>>    }
>>>>      /*
>>>> @@ -1311,7 +1391,11 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
>>>>    {
>>>>        struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
>>>>        enum memblock_flags memblock_flags = MEMBLOCK_NONE;
>>>> -    struct vmem_altmap mhp_altmap = {};
>>>> +    struct vmem_altmap mhp_altmap = {
>>>> +        .base_pfn =  PHYS_PFN(res->start),
>>>> +        .end_pfn  =  PHYS_PFN(res->end),
>>>> +        .reserve  = memory_block_align_base(resource_size(res)),
>>>
>>> Can you remind me why we have to set reserve here at all?
>>>
>>> IOW, can't we simply set
>>>
>>> .free = memory_block_memmap_on_memory_size();
>>>
>>> end then pass
>>>
>>> mhp_altmap.alloc + mhp_altmap.free
>>>
>>> to create_memory_block_devices() instead?
>>>
>>
>> But with the dax usage of altmap, altmap->reserve is what we use to reserve things to get
>> the required alignment. One difference is where we allocate the struct page at. For this specific
>> case it should not matter.
>>
>> static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap)
>> {
>>     return altmap->base_pfn + altmap->reserve + altmap->alloc
>>         + altmap->align;
>> }
>>
>> And other is where we online a memory block
>>
>> We find the start pfn using mem->altmap->alloc + mem->altmap->reserve;
>>
>> Considering altmap->reserve is what dax pfn_dev use, is there a reason you want to use altmap->free for this?
> 
> "Reserve" is all about "reserving that much memory for driver usage".
> 
> We don't care about that. We simply want vmemmap allocations coming from the pageblock(s) we set aside. Where exactly, we don't care.
> 
>> I find it confusing to update free when we haven't allocated any altmap blocks yet.
> 
> "
> @reserve: pages mapped, but reserved for driver use (relative to @base)"
> @free: free pages set aside in the mapping for memmap storage
> @alloc: track pages consumed, private to vmemmap_populate()
> "
> 
> To me, that implies that we can ignore "reserve". We set @free to the aligned value and let the vmemmap get allocated from anything in there.
> 
> free + alloc should always sum up to our set-aside pageblock(s), no?
> 
>

The difference is 

 mhp_altmap.free = PHYS_PFN(size) - reserved blocks;

ie, with 256MiB memory block size with 64K pages, we need 4 memmap pages and we reserve 28 pages for aligment.

mhp_altmap.free = PHYS_PFN(size) - 28. 

So that 4 pages from which we are allocating the memmap pages are still counted in free page.

We could all make it work by doing

mhp_altmap.free = PHYS_PFN(size) -  (memory_block_memmap_on_memory_size() - memory_block_memmap_size())

But is that any better than what we have now? I understand the term "reserved for driver use" is confusing for this use case.
But it is really reserving things for required alignment. 

-aneesh
David Hildenbrand July 24, 2023, 4:24 p.m. UTC | #5
On 24.07.23 18:02, Aneesh Kumar K V wrote:
> On 7/24/23 9:11 PM, David Hildenbrand wrote:
>> On 24.07.23 17:16, Aneesh Kumar K V wrote:
>>
>>>>
>>>> /*
>>>>    * In "forced" memmap_on_memory mode, we always align the vmemmap size up to cover
>>>>    * full pageblocks. That way, we can add memory even if the vmemmap size is not properly
>>>>    * aligned, however, we might waste memory.
>>>>    */
>>>
>>> I am finding that confusing. We do want things to be pageblock_nr_pages aligned both ways.
>>> With MEMMAP_ON_MEMORY_FORCE, we do that by allocating more space for memmap and
>>> in the default case we do that by making sure only memory blocks of specific size supporting
>>> that alignment can use MEMMAP_ON_MEMORY feature.
>>
>> See the usage inm hp_supports_memmap_on_memory(), I guess that makes sense then.
>>
>> But if you have any ideas on how to clarify that (terminology), I'm all ears!
>>
> 
> 
> I updated the commit message
> 
> mm/hotplug: Support memmap_on_memory when memmap is not aligned to pageblocks
> 
> Currently, memmap_on_memory feature is only supported with memory block
> sizes that result in vmemmap pages covering full page blocks. This is
> because memory onlining/offlining code requires applicable ranges to be
> pageblock-aligned, for example, to set the migratetypes properly.
> 
> This patch helps to lift that restriction by reserving more pages than
> required for vmemmap space. This helps to align the start addr to be
> page block aligned with different memory block sizes. This implies the
> kernel will be reserving some pages for every memoryblock. This also
> allows the memmap on memory feature to be widely useful with different
> memory block size values.
> 
> For ex: with 64K page size and 256MiB memory block size, we require 4
> pages to map vmemmap pages, To align things correctly we end up adding a
> reserve of 28 pages. ie, for every 4096 pages 28 pages get reserved.
> 
> 

Much better.

> Also while implementing your  suggestion to use memory_block_memmap_on_memory_size()
> I am finding it not really useful because in mhp_supports_memmap_on_memory() we are checking
> if remaining_size is pageblock_nr_pages aligned (dax_kmem may want to use that helper
> later).

Let's focus on this patchset here first.

Factoring out how manye memmap pages we actually need vs. how many pages 
we need when aligning up sound very reasonable to me.


Can you elaborate what the problem is?

> Also I still think altmap.reserve is easier because of the start_pfn calculation.
> (more on this below)

Can you elaborate? Do you mean the try_remove_memory() change?

> 
> 
>> [...]
>>
>>>>> +    return arch_supports_memmap_on_memory(size);
>>>>>     }
>>>>>       /*
>>>>> @@ -1311,7 +1391,11 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
>>>>>     {
>>>>>         struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
>>>>>         enum memblock_flags memblock_flags = MEMBLOCK_NONE;
>>>>> -    struct vmem_altmap mhp_altmap = {};
>>>>> +    struct vmem_altmap mhp_altmap = {
>>>>> +        .base_pfn =  PHYS_PFN(res->start),
>>>>> +        .end_pfn  =  PHYS_PFN(res->end),
>>>>> +        .reserve  = memory_block_align_base(resource_size(res)),
>>>>
>>>> Can you remind me why we have to set reserve here at all?
>>>>
>>>> IOW, can't we simply set
>>>>
>>>> .free = memory_block_memmap_on_memory_size();
>>>>
>>>> end then pass
>>>>
>>>> mhp_altmap.alloc + mhp_altmap.free
>>>>
>>>> to create_memory_block_devices() instead?
>>>>
>>>
>>> But with the dax usage of altmap, altmap->reserve is what we use to reserve things to get
>>> the required alignment. One difference is where we allocate the struct page at. For this specific
>>> case it should not matter.
>>>
>>> static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap)
>>> {
>>>      return altmap->base_pfn + altmap->reserve + altmap->alloc
>>>          + altmap->align;
>>> }
>>>
>>> And other is where we online a memory block
>>>
>>> We find the start pfn using mem->altmap->alloc + mem->altmap->reserve;
>>>
>>> Considering altmap->reserve is what dax pfn_dev use, is there a reason you want to use altmap->free for this?
>>
>> "Reserve" is all about "reserving that much memory for driver usage".
>>
>> We don't care about that. We simply want vmemmap allocations coming from the pageblock(s) we set aside. Where exactly, we don't care.
>>
>>> I find it confusing to update free when we haven't allocated any altmap blocks yet.
>>
>> "
>> @reserve: pages mapped, but reserved for driver use (relative to @base)"
>> @free: free pages set aside in the mapping for memmap storage
>> @alloc: track pages consumed, private to vmemmap_populate()
>> "
>>
>> To me, that implies that we can ignore "reserve". We set @free to the aligned value and let the vmemmap get allocated from anything in there.
>>
>> free + alloc should always sum up to our set-aside pageblock(s), no?
>>
>>
> 
> The difference is
> 
>   mhp_altmap.free = PHYS_PFN(size) - reserved blocks;
> 
> ie, with 256MiB memory block size with 64K pages, we need 4 memmap pages and we reserve 28 pages for aligment.
> 
> mhp_altmap.free = PHYS_PFN(size) - 28.
> 
> So that 4 pages from which we are allocating the memmap pages are still counted in free page.
> 
> We could all make it work by doing
> 
> mhp_altmap.free = PHYS_PFN(size) -  (memory_block_memmap_on_memory_size() - memory_block_memmap_size())
> 
> But is that any better than what we have now? I understand the term "reserved for driver use" is confusing for this use case.
> But it is really reserving things for required alignment.


Let's take a step back.

altmap->alloc tells us how much was already allocated.

altmap->free tells us how much memory we can allocate at max (confusing, 
but see vmem_altmap_nr_free()).

altmap->free should actually have been called differently.


I think it's currently even *wrong* to set free = PHYS_PFN(size). We 
don't want to allocate beyond the first pageblock(s) we selected.


Can't we set:

1) add_memory_resource():

	.base_pfn = PHYS_PFN(start);
	.free = PHYS_PFN(memory_block_memmap_on_memory_size());

2) try_remove_memory():
	.base_pfn = PHYS_PFN(start);
	.alloc = PHYS_PFN(memory_block_memmap_on_memory_size());

Faking that all was allocated and avoiding any reservation terminology?
Aneesh Kumar K.V July 24, 2023, 5:29 p.m. UTC | #6
David Hildenbrand <david@redhat.com> writes:

> On 24.07.23 18:02, Aneesh Kumar K V wrote:
>> On 7/24/23 9:11 PM, David Hildenbrand wrote:
>>> On 24.07.23 17:16, Aneesh Kumar K V wrote:
>>>
>>>>>
>>>>> /*
>>>>>    * In "forced" memmap_on_memory mode, we always align the vmemmap size up to cover
>>>>>    * full pageblocks. That way, we can add memory even if the vmemmap size is not properly
>>>>>    * aligned, however, we might waste memory.
>>>>>    */
>>>>
>>>> I am finding that confusing. We do want things to be pageblock_nr_pages aligned both ways.
>>>> With MEMMAP_ON_MEMORY_FORCE, we do that by allocating more space for memmap and
>>>> in the default case we do that by making sure only memory blocks of specific size supporting
>>>> that alignment can use MEMMAP_ON_MEMORY feature.
>>>
>>> See the usage inm hp_supports_memmap_on_memory(), I guess that makes sense then.
>>>
>>> But if you have any ideas on how to clarify that (terminology), I'm all ears!
>>>
>> 
>> 
>> I updated the commit message
>> 
>> mm/hotplug: Support memmap_on_memory when memmap is not aligned to pageblocks
>> 
>> Currently, memmap_on_memory feature is only supported with memory block
>> sizes that result in vmemmap pages covering full page blocks. This is
>> because memory onlining/offlining code requires applicable ranges to be
>> pageblock-aligned, for example, to set the migratetypes properly.
>> 
>> This patch helps to lift that restriction by reserving more pages than
>> required for vmemmap space. This helps to align the start addr to be
>> page block aligned with different memory block sizes. This implies the
>> kernel will be reserving some pages for every memoryblock. This also
>> allows the memmap on memory feature to be widely useful with different
>> memory block size values.
>> 
>> For ex: with 64K page size and 256MiB memory block size, we require 4
>> pages to map vmemmap pages, To align things correctly we end up adding a
>> reserve of 28 pages. ie, for every 4096 pages 28 pages get reserved.
>> 
>> 
>
> Much better.
>
>> Also while implementing your  suggestion to use memory_block_memmap_on_memory_size()
>> I am finding it not really useful because in mhp_supports_memmap_on_memory() we are checking
>> if remaining_size is pageblock_nr_pages aligned (dax_kmem may want to use that helper
>> later).
>
> Let's focus on this patchset here first.
>
> Factoring out how manye memmap pages we actually need vs. how many pages 
> we need when aligning up sound very reasonable to me.
>
>
> Can you elaborate what the problem is?
>
>> Also I still think altmap.reserve is easier because of the start_pfn calculation.
>> (more on this below)
>
> Can you elaborate? Do you mean the try_remove_memory() change?
>
>> 
>> 
>>> [...]
>>>
>>>>>> +    return arch_supports_memmap_on_memory(size);
>>>>>>     }
>>>>>>       /*
>>>>>> @@ -1311,7 +1391,11 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
>>>>>>     {
>>>>>>         struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
>>>>>>         enum memblock_flags memblock_flags = MEMBLOCK_NONE;
>>>>>> -    struct vmem_altmap mhp_altmap = {};
>>>>>> +    struct vmem_altmap mhp_altmap = {
>>>>>> +        .base_pfn =  PHYS_PFN(res->start),
>>>>>> +        .end_pfn  =  PHYS_PFN(res->end),
>>>>>> +        .reserve  = memory_block_align_base(resource_size(res)),
>>>>>
>>>>> Can you remind me why we have to set reserve here at all?
>>>>>
>>>>> IOW, can't we simply set
>>>>>
>>>>> .free = memory_block_memmap_on_memory_size();
>>>>>
>>>>> end then pass
>>>>>
>>>>> mhp_altmap.alloc + mhp_altmap.free
>>>>>
>>>>> to create_memory_block_devices() instead?
>>>>>
>>>>
>>>> But with the dax usage of altmap, altmap->reserve is what we use to reserve things to get
>>>> the required alignment. One difference is where we allocate the struct page at. For this specific
>>>> case it should not matter.
>>>>
>>>> static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap)
>>>> {
>>>>      return altmap->base_pfn + altmap->reserve + altmap->alloc
>>>>          + altmap->align;
>>>> }
>>>>
>>>> And other is where we online a memory block
>>>>
>>>> We find the start pfn using mem->altmap->alloc + mem->altmap->reserve;
>>>>
>>>> Considering altmap->reserve is what dax pfn_dev use, is there a reason you want to use altmap->free for this?
>>>
>>> "Reserve" is all about "reserving that much memory for driver usage".
>>>
>>> We don't care about that. We simply want vmemmap allocations coming from the pageblock(s) we set aside. Where exactly, we don't care.
>>>
>>>> I find it confusing to update free when we haven't allocated any altmap blocks yet.
>>>
>>> "
>>> @reserve: pages mapped, but reserved for driver use (relative to @base)"
>>> @free: free pages set aside in the mapping for memmap storage
>>> @alloc: track pages consumed, private to vmemmap_populate()
>>> "
>>>
>>> To me, that implies that we can ignore "reserve". We set @free to the aligned value and let the vmemmap get allocated from anything in there.
>>>
>>> free + alloc should always sum up to our set-aside pageblock(s), no?
>>>
>>>
>> 
>> The difference is
>> 
>>   mhp_altmap.free = PHYS_PFN(size) - reserved blocks;
>> 
>> ie, with 256MiB memory block size with 64K pages, we need 4 memmap pages and we reserve 28 pages for aligment.
>> 
>> mhp_altmap.free = PHYS_PFN(size) - 28.
>> 
>> So that 4 pages from which we are allocating the memmap pages are still counted in free page.
>> 
>> We could all make it work by doing
>> 
>> mhp_altmap.free = PHYS_PFN(size) -  (memory_block_memmap_on_memory_size() - memory_block_memmap_size())
>> 
>> But is that any better than what we have now? I understand the term "reserved for driver use" is confusing for this use case.
>> But it is really reserving things for required alignment.
>
>
> Let's take a step back.
>
> altmap->alloc tells us how much was already allocated.
>
> altmap->free tells us how much memory we can allocate at max (confusing, 
> but see vmem_altmap_nr_free()).
>
> altmap->free should actually have been called differently.
>
>
> I think it's currently even *wrong* to set free = PHYS_PFN(size). We 
> don't want to allocate beyond the first pageblock(s) we selected.
>

You are correct. The calculation of altmap.free was wrong.
It was wrong upstream and also had wrong computation in the ppc64
upstream code.

modified   arch/powerpc/mm/init_64.c
@@ -326,8 +326,7 @@ void __ref __vmemmap_free(unsigned long start, unsigned long end,
 	start = ALIGN_DOWN(start, page_size);
 	if (altmap) {
 		alt_start = altmap->base_pfn;
-		alt_end = altmap->base_pfn + altmap->reserve +
-			  altmap->free + altmap->alloc + altmap->align;
+		alt_end = altmap->base_pfn + altmap->reserve + altmap->free ;
 	}
 
 	pr_debug("vmemmap_free %lx...%lx\n", start, end);


Fixing all that up the patch is now updated as below

1 file changed, 109 insertions(+), 15 deletions(-)
mm/memory_hotplug.c | 124 +++++++++++++++++++++++++++++++++++++++++++++-------

modified   mm/memory_hotplug.c
@@ -41,17 +41,91 @@
 #include "internal.h"
 #include "shuffle.h"
 
+enum {
+	MEMMAP_ON_MEMORY_DISABLE = 0,
+	MEMMAP_ON_MEMORY_ENABLE,
+	MEMMAP_ON_MEMORY_FORCE,
+};
+
+static int memmap_mode __read_mostly = MEMMAP_ON_MEMORY_DISABLE;
+
+static inline unsigned long memory_block_memmap_size(void)
+{
+    return PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page);
+}
+
+static inline unsigned long memory_block_memmap_on_memory_size(void)
+{
+    unsigned long size = memory_block_memmap_size();
+
+    /*
+     * In "forced" memmap_on_memory mode, we add extra pages to align the
+     * vmemmap size up to cover full pageblocks. That way, we can add memory
+     * even if the vmemmap size is not properly aligned, however, we might waste
+     * memory.
+     */
+    if (memmap_mode == MEMMAP_ON_MEMORY_FORCE)
+	    return ALIGN(size, PFN_PHYS(pageblock_nr_pages));
+    return size;
+}
+
 #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
 /*
  * memory_hotplug.memmap_on_memory parameter
  */
-static bool memmap_on_memory __ro_after_init;
-module_param(memmap_on_memory, bool, 0444);
-MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
+static int set_memmap_mode(const char *val, const struct kernel_param *kp)
+{
+	int ret, mode;
+	bool enabled;
+
+	if (sysfs_streq(val, "force") ||  sysfs_streq(val, "FORCE")) {
+		mode =  MEMMAP_ON_MEMORY_FORCE;
+		goto matched;
+	}
+
+	ret = kstrtobool(val, &enabled);
+	if (ret < 0)
+		return ret;
+	if (enabled)
+		mode =  MEMMAP_ON_MEMORY_ENABLE;
+	else
+		mode =  MEMMAP_ON_MEMORY_DISABLE;
+
+matched:
+	*((int *)kp->arg) =  mode;
+	if (mode == MEMMAP_ON_MEMORY_FORCE) {
+		pr_info("Memory hotplug will reserve %ld pages in each memory block\n",
+			memory_block_memmap_on_memory_size() - memory_block_memmap_size());
+	}
+	return 0;
+}
+
+static int get_memmap_mode(char *buffer, const struct kernel_param *kp)
+{
+	if (*((int *)kp->arg) == MEMMAP_ON_MEMORY_FORCE)
+		return sprintf(buffer,  "force\n");
+	if (*((int *)kp->arg) == MEMMAP_ON_MEMORY_ENABLE)
+		return sprintf(buffer,  "y\n");
+
+	return sprintf(buffer,  "n\n");
+}
+
+static const struct kernel_param_ops memmap_mode_ops = {
+	.set = set_memmap_mode,
+	.get = get_memmap_mode,
+};
+module_param_cb(memmap_on_memory, &memmap_mode_ops, &memmap_mode, 0444);
+MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug\n"
+	"With value \"force\" it could result in memory wastage due to memmap size limitations \n"
+	"For example, if the memmap for a memory block requires 1 MiB, but the pageblock \n"
+	"size is 2 MiB, 1 MiB of hotplugged memory will be wasted. Note that there are \n"
+	"still cases where the feature cannot be enforced: for example, if the memmap is \n"
+	"smaller than a single page, or if the architecture does not support the forced \n"
+	"mode in all configurations. (y/n/force)");
 
 static inline bool mhp_memmap_on_memory(void)
 {
-	return memmap_on_memory;
+	return memmap_mode != MEMMAP_ON_MEMORY_DISABLE;
 }
 #else
 static inline bool mhp_memmap_on_memory(void)
@@ -1266,7 +1340,7 @@ static bool mhp_supports_memmap_on_memory(unsigned long size)
 {
 	unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
 	unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
-	unsigned long remaining_size = size - vmemmap_size;
+	unsigned long memmap_on_memory_size = memory_block_memmap_on_memory_size();
 
 	/*
 	 * Besides having arch support and the feature enabled at runtime, we
@@ -1294,10 +1368,28 @@ static bool mhp_supports_memmap_on_memory(unsigned long size)
 	 *       altmap as an alternative source of memory, and we do not exactly
 	 *       populate a single PMD.
 	 */
-	return mhp_memmap_on_memory() &&
-	       size == memory_block_size_bytes() &&
-	       IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)) &&
-	       arch_supports_memmap_on_memory(size);
+	if (!mhp_memmap_on_memory() || size != memory_block_size_bytes())
+		return false;
+
+	/*
+	 * Make sure the vmemmap allocation is fully contained
+	 * so that we always allocate vmemmap memory from altmap area.
+	 */
+	if (!IS_ALIGNED(vmemmap_size,  PAGE_SIZE))
+		return false;
+
+	/*
+	 * start pfn should be pageblock_nr_pages aligned for correctly
+	 * setting migrate types
+	 */
+	if (!IS_ALIGNED(memmap_on_memory_size, PFN_PHYS(pageblock_nr_pages)))
+		return false;
+
+	if (memmap_on_memory_size == memory_block_size_bytes())
+		/* No effective hotplugged memory doesn't make sense. */
+		return false;
+
+	return arch_supports_memmap_on_memory(size);
 }
 
 /*
@@ -1310,7 +1402,10 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 {
 	struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
 	enum memblock_flags memblock_flags = MEMBLOCK_NONE;
-	struct vmem_altmap mhp_altmap = {};
+	struct vmem_altmap mhp_altmap = {
+		.base_pfn =  PHYS_PFN(res->start),
+		.end_pfn  =  PHYS_PFN(res->end),
+	};
 	struct memory_group *group = NULL;
 	u64 start, size;
 	bool new_node = false;
@@ -1355,8 +1450,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 	 */
 	if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
 		if (mhp_supports_memmap_on_memory(size)) {
-			mhp_altmap.free = PHYS_PFN(size);
-			mhp_altmap.base_pfn = PHYS_PFN(start);
+			mhp_altmap.free = PHYS_PFN(memory_block_memmap_on_memory_size());
 			params.altmap = &mhp_altmap;
 		}
 		/* fallback to not using altmap  */
@@ -1368,8 +1462,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 		goto error;
 
 	/* create memory block devices after memory was added */
-	ret = create_memory_block_devices(start, size, mhp_altmap.alloc,
-					  group);
+	ret = create_memory_block_devices(start, size, mhp_altmap.free, group);
 	if (ret) {
 		arch_remove_memory(start, size, NULL);
 		goto error;
@@ -2090,7 +2183,8 @@ static int __ref try_remove_memory(u64 start, u64 size)
 			 * right thing if we used vmem_altmap when hot-adding
 			 * the range.
 			 */
-			mhp_altmap.alloc = nr_vmemmap_pages;
+			mhp_altmap.base_pfn = PHYS_PFN(start);
+			mhp_altmap.free = nr_vmemmap_pages;
 			altmap = &mhp_altmap;
 		}
 	}
diff mbox series

Patch

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 5921c81fcb70..c409f5ff6a59 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -41,17 +41,85 @@ 
 #include "internal.h"
 #include "shuffle.h"
 
+enum {
+	MEMMAP_ON_MEMORY_DISABLE = 0,
+	MEMMAP_ON_MEMORY_ENABLE,
+	MEMMAP_ON_MEMORY_FORCE,
+};
+
+static int memmap_mode __read_mostly = MEMMAP_ON_MEMORY_DISABLE;
+
+static inline unsigned long memory_block_align_base(unsigned long size)
+{
+	if (memmap_mode == MEMMAP_ON_MEMORY_FORCE) {
+		unsigned long align;
+		unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
+		unsigned long vmemmap_size;
+
+		vmemmap_size = DIV_ROUND_UP(nr_vmemmap_pages * sizeof(struct page), PAGE_SIZE);
+		align = pageblock_align(vmemmap_size) - vmemmap_size;
+		return align;
+	} else
+		return 0;
+}
+
 #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
 /*
  * memory_hotplug.memmap_on_memory parameter
  */
-static bool memmap_on_memory __ro_after_init;
-module_param(memmap_on_memory, bool, 0444);
-MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
+static int set_memmap_mode(const char *val, const struct kernel_param *kp)
+{
+	int ret, mode;
+	bool enabled;
+
+	if (sysfs_streq(val, "force") ||  sysfs_streq(val, "FORCE")) {
+		mode =  MEMMAP_ON_MEMORY_FORCE;
+		goto matched;
+	}
+
+	ret = kstrtobool(val, &enabled);
+	if (ret < 0)
+		return ret;
+	if (enabled)
+		mode =  MEMMAP_ON_MEMORY_ENABLE;
+	else
+		mode =  MEMMAP_ON_MEMORY_DISABLE;
+
+matched:
+	*((int *)kp->arg) =  mode;
+	if (mode == MEMMAP_ON_MEMORY_FORCE) {
+		pr_info("Memory hotplug will reserve %ld pages in each memory block\n",
+			memory_block_align_base(memory_block_size_bytes()));
+	}
+	return 0;
+}
+
+static int get_memmap_mode(char *buffer, const struct kernel_param *kp)
+{
+	if (*((int *)kp->arg) == MEMMAP_ON_MEMORY_FORCE)
+		return sprintf(buffer,  "force\n");
+	if (*((int *)kp->arg) == MEMMAP_ON_MEMORY_ENABLE)
+		return sprintf(buffer,  "y\n");
+
+	return sprintf(buffer,  "n\n");
+}
+
+static const struct kernel_param_ops memmap_mode_ops = {
+	.set = set_memmap_mode,
+	.get = get_memmap_mode,
+};
+module_param_cb(memmap_on_memory, &memmap_mode_ops, &memmap_mode, 0444);
+MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug\n"
+	"With value \"force\" it could result in memory wastage due to memmap size limitations \n"
+	"For example, if the memmap for a memory block requires 1 MiB, but the pageblock \n"
+	"size is 2 MiB, 1 MiB of hotplugged memory will be wasted. Note that there are \n"
+	"still cases where the feature cannot be enforced: for example, if the memmap is \n"
+	"smaller than a single page, or if the architecture does not support the forced \n"
+	"mode in all configurations. (y/n/force)");
 
 static inline bool mhp_memmap_on_memory(void)
 {
-	return memmap_on_memory;
+	return !!memmap_mode;
 }
 #else
 static inline bool mhp_memmap_on_memory(void)
@@ -1264,7 +1332,6 @@  static inline bool arch_supports_memmap_on_memory(unsigned long size)
 
 static bool mhp_supports_memmap_on_memory(unsigned long size)
 {
-
 	unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
 	unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
 	unsigned long remaining_size = size - vmemmap_size;
@@ -1295,10 +1362,23 @@  static bool mhp_supports_memmap_on_memory(unsigned long size)
 	 *       altmap as an alternative source of memory, and we do not exactly
 	 *       populate a single PMD.
 	 */
-	return mhp_memmap_on_memory() &&
-	       size == memory_block_size_bytes() &&
-	       IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)) &&
-	       arch_supports_memmap_on_memory(size);
+	if (!mhp_memmap_on_memory() || size != memory_block_size_bytes())
+		return false;
+
+	/*
+	 * Make sure the vmemmap allocation is fully contained
+	 * so that we always allocate vmemmap memory from altmap area.
+	 */
+	if (!IS_ALIGNED(vmemmap_size,  PAGE_SIZE))
+		return false;
+	 /*
+	  * Without page reservation remaining pages should be pageblock aligned.
+	  */
+	if (memmap_mode != MEMMAP_ON_MEMORY_FORCE &&
+	    !IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)))
+		return false;
+
+	return arch_supports_memmap_on_memory(size);
 }
 
 /*
@@ -1311,7 +1391,11 @@  int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 {
 	struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
 	enum memblock_flags memblock_flags = MEMBLOCK_NONE;
-	struct vmem_altmap mhp_altmap = {};
+	struct vmem_altmap mhp_altmap = {
+		.base_pfn =  PHYS_PFN(res->start),
+		.end_pfn  =  PHYS_PFN(res->end),
+		.reserve  = memory_block_align_base(resource_size(res)),
+	};
 	struct memory_group *group = NULL;
 	u64 start, size;
 	bool new_node = false;
@@ -1356,8 +1440,7 @@  int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 	 */
 	if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
 		if (mhp_supports_memmap_on_memory(size)) {
-			mhp_altmap.free = PHYS_PFN(size);
-			mhp_altmap.base_pfn = PHYS_PFN(start);
+			mhp_altmap.free = PHYS_PFN(size) - mhp_altmap.reserve;
 			params.altmap = &mhp_altmap;
 		}
 		/* fallback to not using altmap  */
@@ -1369,7 +1452,7 @@  int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 		goto error;
 
 	/* create memory block devices after memory was added */
-	ret = create_memory_block_devices(start, size, mhp_altmap.alloc,
+	ret = create_memory_block_devices(start, size, mhp_altmap.alloc + mhp_altmap.reserve,
 					  group);
 	if (ret) {
 		arch_remove_memory(start, size, NULL);