
mm/vmalloc: allocate small pages for area->pages

Message ID dd04f516643fde4206c1fe93818526a768125c75.1638870169.git.xuyu@linux.alibaba.com
State New
Series mm/vmalloc: allocate small pages for area->pages

Commit Message

Xu Yu Dec. 7, 2021, 9:46 a.m. UTC
The area->pages array stores the struct pages allocated for vmalloc
mappings. The allocated memory can be hugepage-backed if the arch has
HAVE_ARCH_HUGE_VMALLOC set, while area->pages itself does not have to
be hugepage-backed.

Suppose that we want to vmalloc 1026M of memory; then area->pages is
2052K in size, which is larger than PMD_SIZE when the page size is 4K.
Currently, 4096K will be allocated for area->pages, of which 2044K is
wasted.

This introduces __vmalloc_node_no_huge, and makes area->pages backed by
small pages, because I think allocating hugepages for area->pages is
unnecessary and vulnerable to abuse.

Signed-off-by: Xu Yu <xuyu@linux.alibaba.com>
---
 include/linux/vmalloc.h |  2 ++
 mm/vmalloc.c            | 15 ++++++++++++---
 2 files changed, 14 insertions(+), 3 deletions(-)
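
For reference, the arithmetic behind the numbers above, as a quick
sketch (assuming 4K pages, 8-byte struct page pointers, and a 2M PMD;
not part of the patch):

	unsigned long size       = 1026UL << 20;	/* 1026M requested */
	unsigned long nr_pages   = size >> PAGE_SHIFT;	/* 262656 small pages */
	unsigned long array_size = nr_pages * sizeof(struct page *);	/* 2052K */
	unsigned long huge_size  = ALIGN(array_size, PMD_SIZE);	/* rounds up to 4096K */
	unsigned long waste      = huge_size - array_size;	/* 2044K */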

Comments

Nicholas Piggin Dec. 9, 2021, 8:23 a.m. UTC | #1
Excerpts from Xu Yu's message of December 7, 2021 7:46 pm:
> The area->pages array stores the struct pages allocated for vmalloc
> mappings. The allocated memory can be hugepage-backed if the arch has
> HAVE_ARCH_HUGE_VMALLOC set, while area->pages itself does not have to
> be hugepage-backed.
> 
> Suppose that we want to vmalloc 1026M of memory; then area->pages is
> 2052K in size, which is larger than PMD_SIZE when the page size is 4K.
> Currently, 4096K will be allocated for area->pages, of which 2044K is
> wasted.
> 
> This introduces __vmalloc_node_no_huge, and makes area->pages backed by
> small pages, because I think allocating hugepages for area->pages is
> unnecessary and vulnerable to abuse.

Any vmalloc allocation will be subject to internal fragmentation like
this. What makes this one special? Is there a way to improve it for
all with some heuristic?

There would be an argument for a speed-optimised vmalloc vs a
space-optimised one. An accounting structure like this doesn't matter
much for speed. A vfs hash table does. Is it worth doing though? How
much do you gain in practice?

Thanks,
Nick

> 
> Signed-off-by: Xu Yu <xuyu@linux.alibaba.com>
> ---
>  include/linux/vmalloc.h |  2 ++
>  mm/vmalloc.c            | 15 ++++++++++++---
>  2 files changed, 14 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index 6e022cc712e6..e93f39eb46a5 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -150,6 +150,8 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
>  			const void *caller) __alloc_size(1);
>  void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
>  		int node, const void *caller) __alloc_size(1);
> +void *__vmalloc_node_no_huge(unsigned long size, unsigned long align,
> +		gfp_t gfp_mask, int node, const void *caller) __alloc_size(1);
>  void *vmalloc_no_huge(unsigned long size) __alloc_size(1);
>  
>  extern void vfree(const void *addr);
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index d2a00ad4e1dd..0bdbb96d3e3f 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -2925,17 +2925,18 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>  	unsigned long size = get_vm_area_size(area);
>  	unsigned long array_size;
>  	unsigned int nr_small_pages = size >> PAGE_SHIFT;
> +	unsigned int max_small_pages = ALIGN(size, 1UL << page_shift) >> PAGE_SHIFT;
>  	unsigned int page_order;
>  
> -	array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
> +	array_size = (unsigned long)max_small_pages * sizeof(struct page *);
>  	gfp_mask |= __GFP_NOWARN;
>  	if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
>  		gfp_mask |= __GFP_HIGHMEM;
>  
>  	/* Please note that the recursion is strictly bounded. */
>  	if (array_size > PAGE_SIZE) {
> -		area->pages = __vmalloc_node(array_size, 1, nested_gfp, node,
> -					area->caller);
> +		area->pages = __vmalloc_node_no_huge(array_size, 1, nested_gfp,
> +						     node, area->caller);
>  	} else {
>  		area->pages = kmalloc_node(array_size, nested_gfp, node);
>  	}
> @@ -3114,6 +3115,14 @@ void *__vmalloc_node(unsigned long size, unsigned long align,
>  	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
>  				gfp_mask, PAGE_KERNEL, 0, node, caller);
>  }
> +
> +void *__vmalloc_node_no_huge(unsigned long size, unsigned long align,
> +			     gfp_t gfp_mask, int node, const void *caller)
> +{
> +	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
> +				gfp_mask, PAGE_KERNEL, VM_NO_HUGE_VMAP, node, caller);
> +}
> +
>  /*
>   * This is only for performance analysis of vmalloc and stress purpose.
>   * It is required by vmalloc test module, therefore do not use it other
> -- 
> 2.20.1.2432.ga663e714
> 
>
Xu Yu Dec. 9, 2021, 9:27 a.m. UTC | #2
On 12/9/21 4:23 PM, Nicholas Piggin wrote:
> Excerpts from Xu Yu's message of December 7, 2021 7:46 pm:
>> The area->pages array stores the struct pages allocated for vmalloc
>> mappings. The allocated memory can be hugepage-backed if the arch has
>> HAVE_ARCH_HUGE_VMALLOC set, while area->pages itself does not have to
>> be hugepage-backed.
>>
>> Suppose that we want to vmalloc 1026M of memory; then area->pages is
>> 2052K in size, which is larger than PMD_SIZE when the page size is 4K.
>> Currently, 4096K will be allocated for area->pages, of which 2044K is
>> wasted.
>>
>> This introduces __vmalloc_node_no_huge, and makes area->pages backed by
>> small pages, because I think allocating hugepages for area->pages is
>> unnecessary and vulnerable to abuse.
> 
> Any vmalloc allocation will be subject to internal fragmentation like
> this. What makes this one special? Is there a way to improve it for
> all with some heuristic?

As described in the commit log, I think vmalloc memory (*data*) can be
hugepage-backed, while hugepage backing for area->pages (*meta*) is
unnecessary.

There could be heuristics, just like the THP settings (always, madvise,
never). But such heuristics are mainly for data allocations, and I'm
not sure it's worth bringing such logic in.

> 
> There would be an argument for a speed-optimised vmalloc vs a
> space-optimised one. An accounting structure like this doesn't matter
> much for speed. A vfs hash table does. Is it worth doing though? How

To be honest, I wrote the patch when studying your patchset. No real
issue.

> much do you gain in practice?

Therefore, no actual gain in practice.


Perhaps I should have added an RFC tag to the patch. However, I saw
that Andrew Morton has added this patch to the -mm tree.

I wonder if we need to reconsider this patch.

> 
> Thanks,
> Nick
> 
>>
>> Signed-off-by: Xu Yu <xuyu@linux.alibaba.com>
>> ---
>>   include/linux/vmalloc.h |  2 ++
>>   mm/vmalloc.c            | 15 ++++++++++++---
>>   2 files changed, 14 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
>> index 6e022cc712e6..e93f39eb46a5 100644
>> --- a/include/linux/vmalloc.h
>> +++ b/include/linux/vmalloc.h
>> @@ -150,6 +150,8 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
>>   			const void *caller) __alloc_size(1);
>>   void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
>>   		int node, const void *caller) __alloc_size(1);
>> +void *__vmalloc_node_no_huge(unsigned long size, unsigned long align,
>> +		gfp_t gfp_mask, int node, const void *caller) __alloc_size(1);
>>   void *vmalloc_no_huge(unsigned long size) __alloc_size(1);
>>   
>>   extern void vfree(const void *addr);
>> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
>> index d2a00ad4e1dd..0bdbb96d3e3f 100644
>> --- a/mm/vmalloc.c
>> +++ b/mm/vmalloc.c
>> @@ -2925,17 +2925,18 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>>   	unsigned long size = get_vm_area_size(area);
>>   	unsigned long array_size;
>>   	unsigned int nr_small_pages = size >> PAGE_SHIFT;
>> +	unsigned int max_small_pages = ALIGN(size, 1UL << page_shift) >> PAGE_SHIFT;
>>   	unsigned int page_order;
>>   
>> -	array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
>> +	array_size = (unsigned long)max_small_pages * sizeof(struct page *);
>>   	gfp_mask |= __GFP_NOWARN;
>>   	if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
>>   		gfp_mask |= __GFP_HIGHMEM;
>>   
>>   	/* Please note that the recursion is strictly bounded. */
>>   	if (array_size > PAGE_SIZE) {
>> -		area->pages = __vmalloc_node(array_size, 1, nested_gfp, node,
>> -					area->caller);
>> +		area->pages = __vmalloc_node_no_huge(array_size, 1, nested_gfp,
>> +						     node, area->caller);
>>   	} else {
>>   		area->pages = kmalloc_node(array_size, nested_gfp, node);
>>   	}
>> @@ -3114,6 +3115,14 @@ void *__vmalloc_node(unsigned long size, unsigned long align,
>>   	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
>>   				gfp_mask, PAGE_KERNEL, 0, node, caller);
>>   }
>> +
>> +void *__vmalloc_node_no_huge(unsigned long size, unsigned long align,
>> +			     gfp_t gfp_mask, int node, const void *caller)
>> +{
>> +	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
>> +				gfp_mask, PAGE_KERNEL, VM_NO_HUGE_VMAP, node, caller);
>> +}
>> +
>>   /*
>>    * This is only for performance analysis of vmalloc and stress purpose.
>>    * It is required by vmalloc test module, therefore do not use it other
>> -- 
>> 2.20.1.2432.ga663e714
>>
>>
Nicholas Piggin Dec. 9, 2021, 10:59 a.m. UTC | #3
Excerpts from Yu Xu's message of December 9, 2021 7:27 pm:
> On 12/9/21 4:23 PM, Nicholas Piggin wrote:
>> Excerpts from Xu Yu's message of December 7, 2021 7:46 pm:
>>> The area->pages array stores the struct pages allocated for vmalloc
>>> mappings. The allocated memory can be hugepage-backed if the arch has
>>> HAVE_ARCH_HUGE_VMALLOC set, while area->pages itself does not have to
>>> be hugepage-backed.
>>>
>>> Suppose that we want to vmalloc 1026M of memory; then area->pages is
>>> 2052K in size, which is larger than PMD_SIZE when the page size is 4K.
>>> Currently, 4096K will be allocated for area->pages, of which 2044K is
>>> wasted.
>>>
>>> This introduces __vmalloc_node_no_huge, and makes area->pages backed by
>>> small pages, because I think allocating hugepages for area->pages is
>>> unnecessary and vulnerable to abuse.
>> 
>> Any vmalloc allocation will be subject to internal fragmentation like
>> this. What makes this one special? Is there a way to improve it for
>> all with some heuristic?
> 
> As described in the commit log, I think vmalloc memory (*data*) can be
> hugepage-backed, while hugepage backing for area->pages (*meta*) is
> unnecessary.

Right. I accept that some vmalloc allocations aren't performance
critical for accesses, that some allocations will have more
fragmentation waste from huge vmalloc, and that area->pages fits both
categories under some circumstances.

But 1) other vmalloc allocations might too. And 2) sure, you can waste
50% of the 0.2% space overhead that area->pages requires, so it's
another 0.2% in the absolute worst case, when you're vmallocing just
over a gig of memory.

The question is whether this case actually matters that much, and
whether we would be better served looking at actual wastage numbers
across all allocs to see whether it's worth worrying about, or whether
a general heuristic could be used.
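
For reference, a rough sketch of those percentages (same assumptions
as the commit-log arithmetic: 4K pages, 8-byte pointers; the numbers
are not from the thread itself):

	double overhead = 8.0 / 4096;			/* ~0.2%: one pointer per 4K page */
	double waste    = 2044.0 / (1026 << 10);	/* 2044K / 1026M, another ~0.2% */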

> There could be heuristics, just like the THP settings (always, madvise,
> never). But such heuristics are mainly for data allocations, and I'm
> not sure it's worth bringing such logic in.
> 
>> 
>> There would be an argument for a speed-optimised vmalloc vs a
>> space-optimised one. An accounting structure like this doesn't matter
>> much for speed. A vfs hash table does. Is it worth doing though? How
> 
> To be honest, I wrote the patch when studying your patchset. No real
> issue.
> 
>> much do you gain in practice?
> 
> Therefore, no actual gain in practice.
> 
> 
> Perhaps I should have added an RFC tag to the patch. However, I saw
> that Andrew Morton has added this patch to the -mm tree.
> 
> I wonder if we need to reconsider this patch.

I like your thinking and that you're looking to optimise memory usage.

I'm mainly concerned that the more API options we provide, the harder it 
makes things for callers and the more chance there is that they will get
it wrong. Heuristics aren't a magic fix either (they can go wrong too),
so getting an idea of how much waste we could avoid would be a good
start.
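
One shape such a general heuristic could take, as a rough sketch (not
proposed in this thread; vmalloc_want_huge() and the 1/8 threshold are
hypothetical):

	static bool vmalloc_want_huge(unsigned long size)
	{
		unsigned long waste = ALIGN(size, PMD_SIZE) - size;

		/*
		 * Take huge pages only when rounding up wastes less
		 * than 1/8 of the requested size.
		 */
		return waste < (size >> 3);
	}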

Thanks,
Nick

Patch

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 6e022cc712e6..e93f39eb46a5 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -150,6 +150,8 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
 			const void *caller) __alloc_size(1);
 void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
 		int node, const void *caller) __alloc_size(1);
+void *__vmalloc_node_no_huge(unsigned long size, unsigned long align,
+		gfp_t gfp_mask, int node, const void *caller) __alloc_size(1);
 void *vmalloc_no_huge(unsigned long size) __alloc_size(1);
 
 extern void vfree(const void *addr);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d2a00ad4e1dd..0bdbb96d3e3f 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2925,17 +2925,18 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	unsigned long size = get_vm_area_size(area);
 	unsigned long array_size;
 	unsigned int nr_small_pages = size >> PAGE_SHIFT;
+	unsigned int max_small_pages = ALIGN(size, 1UL << page_shift) >> PAGE_SHIFT;
 	unsigned int page_order;
 
-	array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
+	array_size = (unsigned long)max_small_pages * sizeof(struct page *);
 	gfp_mask |= __GFP_NOWARN;
 	if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
 		gfp_mask |= __GFP_HIGHMEM;
 
 	/* Please note that the recursion is strictly bounded. */
 	if (array_size > PAGE_SIZE) {
-		area->pages = __vmalloc_node(array_size, 1, nested_gfp, node,
-					area->caller);
+		area->pages = __vmalloc_node_no_huge(array_size, 1, nested_gfp,
+						     node, area->caller);
 	} else {
 		area->pages = kmalloc_node(array_size, nested_gfp, node);
 	}
@@ -3114,6 +3115,14 @@ void *__vmalloc_node(unsigned long size, unsigned long align,
 	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
 				gfp_mask, PAGE_KERNEL, 0, node, caller);
 }
+
+void *__vmalloc_node_no_huge(unsigned long size, unsigned long align,
+			     gfp_t gfp_mask, int node, const void *caller)
+{
+	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
+				gfp_mask, PAGE_KERNEL, VM_NO_HUGE_VMAP, node, caller);
+}
+
 /*
  * This is only for performance analysis of vmalloc and stress purpose.
  * It is required by vmalloc test module, therefore do not use it other
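
A minimal usage sketch of the new helper (the caller, alloc_meta(),
is hypothetical, for illustration only): the no-huge variant keeps a
metadata array just over PMD_SIZE backed by small pages, avoiding the
rounding waste described in the commit log.

	static int alloc_meta(struct page ***out)
	{
		unsigned long array_size = 2052UL << 10;	/* 2052K, just over a 2M PMD */
		struct page **pages;

		pages = __vmalloc_node_no_huge(array_size, 1,
					       GFP_KERNEL | __GFP_ZERO,
					       NUMA_NO_NODE,
					       __builtin_return_address(0));
		if (!pages)
			return -ENOMEM;

		*out = pages;	/* caller releases with vfree() */
		return 0;
	}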