
[v4,6/8] hugetlb: batch PMD split for bulk vmemmap dedup

Message ID 20230918230202.254631-7-mike.kravetz@oracle.com (mailing list archive)
State New
Series Batch hugetlb vmemmap modification operations

Commit Message

Mike Kravetz Sept. 18, 2023, 11:01 p.m. UTC
From: Joao Martins <joao.m.martins@oracle.com>

In an effort to minimize the number of TLB flushes, batch all PMD splits
belonging to a range of pages in order to perform only one (global) TLB
flush.

Add a flags field to the walker and pass whether it's a bulk allocation
or just a single page, to decide how to remap. The first value
(VMEMMAP_SPLIT_NO_TLB_FLUSH) designates a request not to do the TLB
flush when we split the PMD.

Rebased and updated by Mike Kravetz

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 mm/hugetlb_vmemmap.c | 79 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 75 insertions(+), 4 deletions(-)
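
In short, hugetlb_vmemmap_optimize_folios() now splits all the PMDs first,
flushes the TLB once, and only then remaps. A condensed sketch of the
resulting flow (taken from the hunk at the end of this patch; the existing
error handling inside the second loop is not shown):

	/* Pass 1: split the vmemmap PMDs, deferring the TLB flush.
	 * hugetlb_vmemmap_split() walks with VMEMMAP_SPLIT_NO_TLB_FLUSH set
	 * and remap_pte == NULL, so nothing is remapped yet.
	 */
	list_for_each_entry(folio, folio_list, lru)
		hugetlb_vmemmap_split(h, &folio->page);

	/* One global flush instead of one flush per split PMD. */
	flush_tlb_all();

	/* Pass 2: remap and free the vmemmap pages as before. */
	list_for_each_entry(folio, folio_list, lru) {
		int ret = __hugetlb_vmemmap_optimize(h, &folio->page,
						     &vmemmap_pages);
		...
	}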

Comments

Muchun Song Sept. 19, 2023, 6:27 a.m. UTC | #1
On 2023/9/19 07:01, Mike Kravetz wrote:
> From: Joao Martins <joao.m.martins@oracle.com>
>
> In an effort to minimize amount of TLB flushes, batch all PMD splits
> belonging to a range of pages in order to perform only 1 (global) TLB
> flush.
>
> Add a flags field to the walker and pass whether it's a bulk allocation
> or just a single page to decide to remap. First value
> (VMEMMAP_SPLIT_NO_TLB_FLUSH) designates the request to not do the TLB
> flush when we split the PMD.
>
> Rebased and updated by Mike Kravetz
>
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> ---
>   mm/hugetlb_vmemmap.c | 79 +++++++++++++++++++++++++++++++++++++++++---
>   1 file changed, 75 insertions(+), 4 deletions(-)
>
> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> index 147ed15bcae4..e8bc2f7567db 100644
> --- a/mm/hugetlb_vmemmap.c
> +++ b/mm/hugetlb_vmemmap.c
> @@ -27,6 +27,7 @@
>    * @reuse_addr:		the virtual address of the @reuse_page page.
>    * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
>    *			or is mapped from.
> + * @flags:		used to modify behavior in bulk operations
>    */
>   struct vmemmap_remap_walk {
>   	void			(*remap_pte)(pte_t *pte, unsigned long addr,
> @@ -35,9 +36,11 @@ struct vmemmap_remap_walk {
>   	struct page		*reuse_page;
>   	unsigned long		reuse_addr;
>   	struct list_head	*vmemmap_pages;
> +#define VMEMMAP_SPLIT_NO_TLB_FLUSH	BIT(0)

Please add a brief comment following this macro to explain what the
behavior is.

> +	unsigned long		flags;
>   };
>   
> -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
> +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool flush)
>   {
>   	pmd_t __pmd;
>   	int i;
> @@ -80,7 +83,8 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
>   		/* Make pte visible before pmd. See comment in pmd_install(). */
>   		smp_wmb();
>   		pmd_populate_kernel(&init_mm, pmd, pgtable);
> -		flush_tlb_kernel_range(start, start + PMD_SIZE);
> +		if (flush)
> +			flush_tlb_kernel_range(start, start + PMD_SIZE);
>   	} else {
>   		pte_free_kernel(&init_mm, pgtable);
>   	}
> @@ -127,11 +131,20 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
>   	do {
>   		int ret;
>   
> -		ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
> +		ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK,
> +				walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH);

!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH)?

Thanks.

>   		if (ret)
>   			return ret;
>   
>   		next = pmd_addr_end(addr, end);
> +
> +		/*
> +		 * We are only splitting, not remapping the hugetlb vmemmap
> +		 * pages.
> +		 */
> +		if (!walk->remap_pte)
> +			continue;
> +
>   		vmemmap_pte_range(pmd, addr, next, walk);
>   	} while (pmd++, addr = next, addr != end);
>   
> @@ -198,7 +211,8 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
>   			return ret;
>   	} while (pgd++, addr = next, addr != end);
>   
> -	flush_tlb_kernel_range(start, end);
> +	if (walk->remap_pte)
> +		flush_tlb_kernel_range(start, end);
>   
>   	return 0;
>   }
> @@ -300,6 +314,36 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
>   	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
>   }
>   
> +/**
> + * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end)
> + *                      backing PMDs of the directmap into PTEs
> + * @start:     start address of the vmemmap virtual address range that we want
> + *             to remap.
> + * @end:       end address of the vmemmap virtual address range that we want to
> + *             remap.
> + * @reuse:     reuse address.
> + *
> + * Return: %0 on success, negative error code otherwise.
> + */
> +static int vmemmap_remap_split(unsigned long start, unsigned long end,
> +				unsigned long reuse)
> +{
> +	int ret;
> +	struct vmemmap_remap_walk walk = {
> +		.remap_pte	= NULL,
> +		.flags		= VMEMMAP_SPLIT_NO_TLB_FLUSH,
> +	};
> +
> +	/* See the comment in the vmemmap_remap_free(). */
> +	BUG_ON(start - reuse != PAGE_SIZE);
> +
> +	mmap_read_lock(&init_mm);
> +	ret = vmemmap_remap_range(reuse, end, &walk);
> +	mmap_read_unlock(&init_mm);
> +
> +	return ret;
> +}
> +
>   /**
>    * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
>    *			to the page which @reuse is mapped to, then free vmemmap
> @@ -323,6 +367,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
>   		.remap_pte	= vmemmap_remap_pte,
>   		.reuse_addr	= reuse,
>   		.vmemmap_pages	= vmemmap_pages,
> +		.flags		= 0,
>   	};
>   	int nid = page_to_nid((struct page *)reuse);
>   	gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
> @@ -371,6 +416,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
>   			.remap_pte	= vmemmap_restore_pte,
>   			.reuse_addr	= reuse,
>   			.vmemmap_pages	= vmemmap_pages,
> +			.flags		= 0,
>   		};
>   
>   		vmemmap_remap_range(reuse, end, &walk);
> @@ -422,6 +468,7 @@ static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
>   		.remap_pte	= vmemmap_restore_pte,
>   		.reuse_addr	= reuse,
>   		.vmemmap_pages	= &vmemmap_pages,
> +		.flags		= 0,
>   	};
>   
>   	/* See the comment in the vmemmap_remap_free(). */
> @@ -630,11 +677,35 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
>   	free_vmemmap_page_list(&vmemmap_pages);
>   }
>   
> +static void hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
> +{
> +	unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
> +	unsigned long vmemmap_reuse;
> +
> +	if (!vmemmap_should_optimize(h, head))
> +		return;
> +
> +	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
> +	vmemmap_reuse	= vmemmap_start;
> +	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;
> +
> +	/*
> +	 * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
> +	 * @vmemmap_end]
> +	 */
> +	vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
> +}
> +
>   void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
>   {
>   	struct folio *folio;
>   	LIST_HEAD(vmemmap_pages);
>   
> +	list_for_each_entry(folio, folio_list, lru)
> +		hugetlb_vmemmap_split(h, &folio->page);
> +
> +	flush_tlb_all();
> +
>   	list_for_each_entry(folio, folio_list, lru) {
>   		int ret = __hugetlb_vmemmap_optimize(h, &folio->page,
>   								&vmemmap_pages);
Muchun Song Sept. 19, 2023, 6:42 a.m. UTC | #2
On 2023/9/19 07:01, Mike Kravetz wrote:
> From: Joao Martins <joao.m.martins@oracle.com>
>
> In an effort to minimize amount of TLB flushes, batch all PMD splits
> belonging to a range of pages in order to perform only 1 (global) TLB
> flush.
>
> Add a flags field to the walker and pass whether it's a bulk allocation
> or just a single page to decide to remap. First value
> (VMEMMAP_SPLIT_NO_TLB_FLUSH) designates the request to not do the TLB
> flush when we split the PMD.
>
> Rebased and updated by Mike Kravetz
>
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> ---
>   mm/hugetlb_vmemmap.c | 79 +++++++++++++++++++++++++++++++++++++++++---
>   1 file changed, 75 insertions(+), 4 deletions(-)
>
> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> index 147ed15bcae4..e8bc2f7567db 100644
> --- a/mm/hugetlb_vmemmap.c
> +++ b/mm/hugetlb_vmemmap.c
> @@ -27,6 +27,7 @@
>    * @reuse_addr:		the virtual address of the @reuse_page page.
>    * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
>    *			or is mapped from.
> + * @flags:		used to modify behavior in bulk operations

Better to describe it as "used to modify behavior in vmemmap page table 
walking operations"

>    */
>   struct vmemmap_remap_walk {
>   	void			(*remap_pte)(pte_t *pte, unsigned long addr,
> @@ -35,9 +36,11 @@ struct vmemmap_remap_walk {
>   	struct page		*reuse_page;
>   	unsigned long		reuse_addr;
>   	struct list_head	*vmemmap_pages;
> +#define VMEMMAP_SPLIT_NO_TLB_FLUSH	BIT(0)
> +	unsigned long		flags;
>   };
>   
> -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
> +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool flush)
>   {
>   	pmd_t __pmd;
>   	int i;
> @@ -80,7 +83,8 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
>   		/* Make pte visible before pmd. See comment in pmd_install(). */
>   		smp_wmb();
>   		pmd_populate_kernel(&init_mm, pmd, pgtable);
> -		flush_tlb_kernel_range(start, start + PMD_SIZE);
> +		if (flush)
> +			flush_tlb_kernel_range(start, start + PMD_SIZE);
>   	} else {
>   		pte_free_kernel(&init_mm, pgtable);
>   	}
> @@ -127,11 +131,20 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
>   	do {
>   		int ret;
>   
> -		ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
> +		ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK,
> +				walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH);
>   		if (ret)
>   			return ret;
>   
>   		next = pmd_addr_end(addr, end);
> +
> +		/*
> +		 * We are only splitting, not remapping the hugetlb vmemmap
> +		 * pages.
> +		 */
> +		if (!walk->remap_pte)
> +			continue;
> +
>   		vmemmap_pte_range(pmd, addr, next, walk);
>   	} while (pmd++, addr = next, addr != end);
>   
> @@ -198,7 +211,8 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
>   			return ret;
>   	} while (pgd++, addr = next, addr != end);
>   
> -	flush_tlb_kernel_range(start, end);
> +	if (walk->remap_pte)
> +		flush_tlb_kernel_range(start, end);
>   
>   	return 0;
>   }
> @@ -300,6 +314,36 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
>   	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
>   }
>   
> +/**
> + * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end)
> + *                      backing PMDs of the directmap into PTEs
> + * @start:     start address of the vmemmap virtual address range that we want
> + *             to remap.
> + * @end:       end address of the vmemmap virtual address range that we want to
> + *             remap.
> + * @reuse:     reuse address.
> + *
> + * Return: %0 on success, negative error code otherwise.
> + */
> +static int vmemmap_remap_split(unsigned long start, unsigned long end,
> +				unsigned long reuse)
> +{
> +	int ret;
> +	struct vmemmap_remap_walk walk = {
> +		.remap_pte	= NULL,
> +		.flags		= VMEMMAP_SPLIT_NO_TLB_FLUSH,
> +	};
> +
> +	/* See the comment in the vmemmap_remap_free(). */
> +	BUG_ON(start - reuse != PAGE_SIZE);
> +
> +	mmap_read_lock(&init_mm);
> +	ret = vmemmap_remap_range(reuse, end, &walk);
> +	mmap_read_unlock(&init_mm);
> +
> +	return ret;
> +}
> +
>   /**
>    * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
>    *			to the page which @reuse is mapped to, then free vmemmap
> @@ -323,6 +367,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
>   		.remap_pte	= vmemmap_remap_pte,
>   		.reuse_addr	= reuse,
>   		.vmemmap_pages	= vmemmap_pages,
> +		.flags		= 0,
>   	};
>   	int nid = page_to_nid((struct page *)reuse);
>   	gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
> @@ -371,6 +416,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
>   			.remap_pte	= vmemmap_restore_pte,
>   			.reuse_addr	= reuse,
>   			.vmemmap_pages	= vmemmap_pages,
> +			.flags		= 0,
>   		};
>   
>   		vmemmap_remap_range(reuse, end, &walk);
> @@ -422,6 +468,7 @@ static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
>   		.remap_pte	= vmemmap_restore_pte,
>   		.reuse_addr	= reuse,
>   		.vmemmap_pages	= &vmemmap_pages,
> +		.flags		= 0,
>   	};
>   
>   	/* See the comment in the vmemmap_remap_free(). */
> @@ -630,11 +677,35 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
>   	free_vmemmap_page_list(&vmemmap_pages);
>   }
>   
> +static void hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
> +{
> +	unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
> +	unsigned long vmemmap_reuse;
> +
> +	if (!vmemmap_should_optimize(h, head))
> +		return;
> +
> +	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
> +	vmemmap_reuse	= vmemmap_start;
> +	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;
> +
> +	/*
> +	 * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
> +	 * @vmemmap_end]
> +	 */
> +	vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
> +}
> +
>   void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
>   {
>   	struct folio *folio;
>   	LIST_HEAD(vmemmap_pages);
>   
> +	list_for_each_entry(folio, folio_list, lru)
> +		hugetlb_vmemmap_split(h, &folio->page);
> +
> +	flush_tlb_all();
> +
>   	list_for_each_entry(folio, folio_list, lru) {
>   		int ret = __hugetlb_vmemmap_optimize(h, &folio->page,
>   								&vmemmap_pages);

This is unlikely to fail since the page table allocation has been
moved above (note that the head vmemmap page allocation is not
mandatory). So we should handle the error case in the splitting
operation above.

Thanks.
Joao Martins Sept. 19, 2023, 8:18 a.m. UTC | #3
On 19/09/2023 07:27, Muchun Song wrote:
> On 2023/9/19 07:01, Mike Kravetz wrote:
>> From: Joao Martins <joao.m.martins@oracle.com>
>>
>> In an effort to minimize amount of TLB flushes, batch all PMD splits
>> belonging to a range of pages in order to perform only 1 (global) TLB
>> flush.
>>
>> Add a flags field to the walker and pass whether it's a bulk allocation
>> or just a single page to decide to remap. First value
>> (VMEMMAP_SPLIT_NO_TLB_FLUSH) designates the request to not do the TLB
>> flush when we split the PMD.
>>
>> Rebased and updated by Mike Kravetz
>>
>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
>> ---
>>   mm/hugetlb_vmemmap.c | 79 +++++++++++++++++++++++++++++++++++++++++---
>>   1 file changed, 75 insertions(+), 4 deletions(-)
>>
>> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
>> index 147ed15bcae4..e8bc2f7567db 100644
>> --- a/mm/hugetlb_vmemmap.c
>> +++ b/mm/hugetlb_vmemmap.c
>> @@ -27,6 +27,7 @@
>>    * @reuse_addr:        the virtual address of the @reuse_page page.
>>    * @vmemmap_pages:    the list head of the vmemmap pages that can be freed
>>    *            or is mapped from.
>> + * @flags:        used to modify behavior in bulk operations
>>    */
>>   struct vmemmap_remap_walk {
>>       void            (*remap_pte)(pte_t *pte, unsigned long addr,
>> @@ -35,9 +36,11 @@ struct vmemmap_remap_walk {
>>       struct page        *reuse_page;
>>       unsigned long        reuse_addr;
>>       struct list_head    *vmemmap_pages;
>> +#define VMEMMAP_SPLIT_NO_TLB_FLUSH    BIT(0)
> 
> Please add a brief comment following this macro to explain what's the
> behavior.
> 

/* Skip the TLB flush when we split the PMD */

And will also do it in the next patch with:

/* Skip the TLB flush when we remap the PTE */

>> +    unsigned long        flags;
>>   };
>>   -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
>> +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool flush)
>>   {
>>       pmd_t __pmd;
>>       int i;
>> @@ -80,7 +83,8 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long
>> start)
>>           /* Make pte visible before pmd. See comment in pmd_install(). */
>>           smp_wmb();
>>           pmd_populate_kernel(&init_mm, pmd, pgtable);
>> -        flush_tlb_kernel_range(start, start + PMD_SIZE);
>> +        if (flush)
>> +            flush_tlb_kernel_range(start, start + PMD_SIZE);
>>       } else {
>>           pte_free_kernel(&init_mm, pgtable);
>>       }
>> @@ -127,11 +131,20 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long
>> addr,
>>       do {
>>           int ret;
>>   -        ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
>> +        ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK,
>> +                walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH);
> 
> !(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH)?
> 
Yeah -- Gah, I must be very distracted.

Thanks
Joao Martins Sept. 19, 2023, 8:26 a.m. UTC | #4
On 19/09/2023 07:42, Muchun Song wrote:
> On 2023/9/19 07:01, Mike Kravetz wrote:
>> From: Joao Martins <joao.m.martins@oracle.com>
>>
>> In an effort to minimize amount of TLB flushes, batch all PMD splits
>> belonging to a range of pages in order to perform only 1 (global) TLB
>> flush.
>>
>> Add a flags field to the walker and pass whether it's a bulk allocation
>> or just a single page to decide to remap. First value
>> (VMEMMAP_SPLIT_NO_TLB_FLUSH) designates the request to not do the TLB
>> flush when we split the PMD.
>>
>> Rebased and updated by Mike Kravetz
>>
>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
>> ---
>>   mm/hugetlb_vmemmap.c | 79 +++++++++++++++++++++++++++++++++++++++++---
>>   1 file changed, 75 insertions(+), 4 deletions(-)
>>
>> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
>> index 147ed15bcae4..e8bc2f7567db 100644
>> --- a/mm/hugetlb_vmemmap.c
>> +++ b/mm/hugetlb_vmemmap.c
>> @@ -27,6 +27,7 @@
>>    * @reuse_addr:        the virtual address of the @reuse_page page.
>>    * @vmemmap_pages:    the list head of the vmemmap pages that can be freed
>>    *            or is mapped from.
>> + * @flags:        used to modify behavior in bulk operations
> 
> Better to describe it as "used to modify behavior in vmemmap page table walking
> operations"
> 
OK

>>   void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head
>> *folio_list)
>>   {
>>       struct folio *folio;
>>       LIST_HEAD(vmemmap_pages);
>>   +    list_for_each_entry(folio, folio_list, lru)
>> +        hugetlb_vmemmap_split(h, &folio->page);
>> +
>> +    flush_tlb_all();
>> +
>>       list_for_each_entry(folio, folio_list, lru) {
>>           int ret = __hugetlb_vmemmap_optimize(h, &folio->page,
>>                                   &vmemmap_pages);
> 
> This is unlikely to be failed since the page table allocation
> is moved to the above 

> (Note that the head vmemmap page allocation
> is not mandatory). 

Good point that I almost forgot

> So we should handle the error case in the above
> splitting operation.

But back to the previous discussion in v2... the thinking was that /some/ PMDs
got split, and say could allow some PTE remapping to occur and free some pages
back (each page allows 6 more splits worst case). Then the next
__hugetlb_vmemmap_optimize() will have to split PMD pages again for those
hugepages that failed the batch PMD split (as we only defer the PTE remap tlb
flush in this stage).

Unless this isn't something worth handling

	Joao
Muchun Song Sept. 19, 2023, 8:41 a.m. UTC | #5
> On Sep 19, 2023, at 16:26, Joao Martins <joao.m.martins@oracle.com> wrote:
> 
> On 19/09/2023 07:42, Muchun Song wrote:
>> On 2023/9/19 07:01, Mike Kravetz wrote:
>>> From: Joao Martins <joao.m.martins@oracle.com>
>>> 
>>> In an effort to minimize amount of TLB flushes, batch all PMD splits
>>> belonging to a range of pages in order to perform only 1 (global) TLB
>>> flush.
>>> 
>>> Add a flags field to the walker and pass whether it's a bulk allocation
>>> or just a single page to decide to remap. First value
>>> (VMEMMAP_SPLIT_NO_TLB_FLUSH) designates the request to not do the TLB
>>> flush when we split the PMD.
>>> 
>>> Rebased and updated by Mike Kravetz
>>> 
>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>>> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
>>> ---
>>>  mm/hugetlb_vmemmap.c | 79 +++++++++++++++++++++++++++++++++++++++++---
>>>  1 file changed, 75 insertions(+), 4 deletions(-)
>>> 
>>> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
>>> index 147ed15bcae4..e8bc2f7567db 100644
>>> --- a/mm/hugetlb_vmemmap.c
>>> +++ b/mm/hugetlb_vmemmap.c
>>> @@ -27,6 +27,7 @@
>>>   * @reuse_addr:        the virtual address of the @reuse_page page.
>>>   * @vmemmap_pages:    the list head of the vmemmap pages that can be freed
>>>   *            or is mapped from.
>>> + * @flags:        used to modify behavior in bulk operations
>> 
>> Better to describe it as "used to modify behavior in vmemmap page table walking
>> operations"
>> 
> OK
> 
>>>  void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head
>>> *folio_list)
>>>  {
>>>      struct folio *folio;
>>>      LIST_HEAD(vmemmap_pages);
>>>  +    list_for_each_entry(folio, folio_list, lru)
>>> +        hugetlb_vmemmap_split(h, &folio->page);
>>> +
>>> +    flush_tlb_all();
>>> +
>>>      list_for_each_entry(folio, folio_list, lru) {
>>>          int ret = __hugetlb_vmemmap_optimize(h, &folio->page,
>>>                                  &vmemmap_pages);
>> 
>> This is unlikely to be failed since the page table allocation
>> is moved to the above 
> 
>> (Note that the head vmemmap page allocation
>> is not mandatory). 
> 
> Good point that I almost forgot
> 
>> So we should handle the error case in the above
>> splitting operation.
> 
> But back to the previous discussion in v2... the thinking was that /some/ PMDs
> got split, and say could allow some PTE remapping to occur and free some pages
> back (each page allows 6 more splits worst case). Then the next
> __hugetlb_vmemmap_optimize() will have to split PMD pages again for those
> hugepages that failed the batch PMD split (as we only defer the PTE remap tlb
> flush in this stage).

Oh, yes. Maybe we could break the above traversal as early as possible
once we enter an ENOMEM?

> 
> Unless this isn't something worth handling
> 
> Joao
Joao Martins Sept. 19, 2023, 8:55 a.m. UTC | #6
On 19/09/2023 09:41, Muchun Song wrote:
>> On Sep 19, 2023, at 16:26, Joao Martins <joao.m.martins@oracle.com> wrote:
>> On 19/09/2023 07:42, Muchun Song wrote:
>>> On 2023/9/19 07:01, Mike Kravetz wrote:
>>>> From: Joao Martins <joao.m.martins@oracle.com>
>>>>
>>>> In an effort to minimize amount of TLB flushes, batch all PMD splits
>>>> belonging to a range of pages in order to perform only 1 (global) TLB
>>>> flush.
>>>>
>>>> Add a flags field to the walker and pass whether it's a bulk allocation
>>>> or just a single page to decide to remap. First value
>>>> (VMEMMAP_SPLIT_NO_TLB_FLUSH) designates the request to not do the TLB
>>>> flush when we split the PMD.
>>>>
>>>> Rebased and updated by Mike Kravetz
>>>>
>>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>>>> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
>>>> ---
>>>>  mm/hugetlb_vmemmap.c | 79 +++++++++++++++++++++++++++++++++++++++++---
>>>>  1 file changed, 75 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
>>>> index 147ed15bcae4..e8bc2f7567db 100644
>>>> --- a/mm/hugetlb_vmemmap.c
>>>> +++ b/mm/hugetlb_vmemmap.c
>>>> @@ -27,6 +27,7 @@
>>>>   * @reuse_addr:        the virtual address of the @reuse_page page.
>>>>   * @vmemmap_pages:    the list head of the vmemmap pages that can be freed
>>>>   *            or is mapped from.
>>>> + * @flags:        used to modify behavior in bulk operations
>>>
>>> Better to describe it as "used to modify behavior in vmemmap page table walking
>>> operations"
>>>
>> OK
>>
>>>>  void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head
>>>> *folio_list)
>>>>  {
>>>>      struct folio *folio;
>>>>      LIST_HEAD(vmemmap_pages);
>>>>  +    list_for_each_entry(folio, folio_list, lru)
>>>> +        hugetlb_vmemmap_split(h, &folio->page);
>>>> +
>>>> +    flush_tlb_all();
>>>> +
>>>>      list_for_each_entry(folio, folio_list, lru) {
>>>>          int ret = __hugetlb_vmemmap_optimize(h, &folio->page,
>>>>                                  &vmemmap_pages);
>>>
>>> This is unlikely to be failed since the page table allocation
>>> is moved to the above 
>>
>>> (Note that the head vmemmap page allocation
>>> is not mandatory). 
>>
>> Good point that I almost forgot
>>
>>> So we should handle the error case in the above
>>> splitting operation.
>>
>> But back to the previous discussion in v2... the thinking was that /some/ PMDs
>> got split, and say could allow some PTE remapping to occur and free some pages
>> back (each page allows 6 more splits worst case). Then the next
>> __hugetlb_vmemmap_optimize() will have to split PMD pages again for those
>> hugepages that failed the batch PMD split (as we only defer the PTE remap tlb
>> flush in this stage).
> 
> Oh, yes. Maybe we could break the above traversal as early as possible
> once we enter an ENOMEM?
> 

Sounds good -- no point in continuing to try to split if we are failing with OOM.

Perhaps a comment in both of these clauses (the early break on split and the OOM
handling in batch optimize) could help make this clear.

>>
>> Unless this isn't something worth handling
>>
>> Joao
> 
> 
>
Muchun Song Sept. 19, 2023, 8:57 a.m. UTC | #7
> On Sep 19, 2023, at 16:55, Joao Martins <joao.m.martins@oracle.com> wrote:
> 
> On 19/09/2023 09:41, Muchun Song wrote:
>>> On Sep 19, 2023, at 16:26, Joao Martins <joao.m.martins@oracle.com> wrote:
>>> On 19/09/2023 07:42, Muchun Song wrote:
>>>> On 2023/9/19 07:01, Mike Kravetz wrote:
>>>>> From: Joao Martins <joao.m.martins@oracle.com>
>>>>> 
>>>>> In an effort to minimize amount of TLB flushes, batch all PMD splits
>>>>> belonging to a range of pages in order to perform only 1 (global) TLB
>>>>> flush.
>>>>> 
>>>>> Add a flags field to the walker and pass whether it's a bulk allocation
>>>>> or just a single page to decide to remap. First value
>>>>> (VMEMMAP_SPLIT_NO_TLB_FLUSH) designates the request to not do the TLB
>>>>> flush when we split the PMD.
>>>>> 
>>>>> Rebased and updated by Mike Kravetz
>>>>> 
>>>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>>>>> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
>>>>> ---
>>>>> mm/hugetlb_vmemmap.c | 79 +++++++++++++++++++++++++++++++++++++++++---
>>>>> 1 file changed, 75 insertions(+), 4 deletions(-)
>>>>> 
>>>>> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
>>>>> index 147ed15bcae4..e8bc2f7567db 100644
>>>>> --- a/mm/hugetlb_vmemmap.c
>>>>> +++ b/mm/hugetlb_vmemmap.c
>>>>> @@ -27,6 +27,7 @@
>>>>>  * @reuse_addr:        the virtual address of the @reuse_page page.
>>>>>  * @vmemmap_pages:    the list head of the vmemmap pages that can be freed
>>>>>  *            or is mapped from.
>>>>> + * @flags:        used to modify behavior in bulk operations
>>>> 
>>>> Better to describe it as "used to modify behavior in vmemmap page table walking
>>>> operations"
>>>> 
>>> OK
>>> 
>>>>> void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head
>>>>> *folio_list)
>>>>> {
>>>>>     struct folio *folio;
>>>>>     LIST_HEAD(vmemmap_pages);
>>>>> +    list_for_each_entry(folio, folio_list, lru)
>>>>> +        hugetlb_vmemmap_split(h, &folio->page);
>>>>> +
>>>>> +    flush_tlb_all();
>>>>> +
>>>>>     list_for_each_entry(folio, folio_list, lru) {
>>>>>         int ret = __hugetlb_vmemmap_optimize(h, &folio->page,
>>>>>                                 &vmemmap_pages);
>>>> 
>>>> This is unlikely to be failed since the page table allocation
>>>> is moved to the above 
>>> 
>>>> (Note that the head vmemmap page allocation
>>>> is not mandatory). 
>>> 
>>> Good point that I almost forgot
>>> 
>>>> So we should handle the error case in the above
>>>> splitting operation.
>>> 
>>> But back to the previous discussion in v2... the thinking was that /some/ PMDs
>>> got split, and say could allow some PTE remapping to occur and free some pages
>>> back (each page allows 6 more splits worst case). Then the next
>>> __hugetlb_vmemmap_optimize() will have to split PMD pages again for those
>>> hugepages that failed the batch PMD split (as we only defer the PTE remap tlb
>>> flush in this stage).
>> 
>> Oh, yes. Maybe we could break the above traversal as early as possible
>> once we enter an ENOMEM?
>> 
> 
> Sounds good -- no point in keep trying to split if we are failing with OOM.
> 
> Perhaps a comment in both of these clauses (the early break on split and the OOM
> handling in batch optimize) could help make this clear.

Make sense.

Thanks.

> 
>>> 
>>> Unless this isn't something worth handling
>>> 
>>> Joao
Joao Martins Sept. 19, 2023, 3:09 p.m. UTC | #8
On 19/09/2023 09:57, Muchun Song wrote:
>> On Sep 19, 2023, at 16:55, Joao Martins <joao.m.martins@oracle.com> wrote:
>> On 19/09/2023 09:41, Muchun Song wrote:
>>>> On Sep 19, 2023, at 16:26, Joao Martins <joao.m.martins@oracle.com> wrote:
>>>> On 19/09/2023 07:42, Muchun Song wrote:
>>>>> On 2023/9/19 07:01, Mike Kravetz wrote:
>>>>>>     list_for_each_entry(folio, folio_list, lru) {
>>>>>>         int ret = __hugetlb_vmemmap_optimize(h, &folio->page,
>>>>>>                                 &vmemmap_pages);
>>>>>
>>>>> This is unlikely to be failed since the page table allocation
>>>>> is moved to the above 
>>>>
>>>>> (Note that the head vmemmap page allocation
>>>>> is not mandatory). 
>>>>
>>>> Good point that I almost forgot
>>>>
>>>>> So we should handle the error case in the above
>>>>> splitting operation.
>>>>
>>>> But back to the previous discussion in v2... the thinking was that /some/ PMDs
>>>> got split, and say could allow some PTE remapping to occur and free some pages
>>>> back (each page allows 6 more splits worst case). Then the next
>>>> __hugetlb_vmemmap_optimize() will have to split PMD pages again for those
>>>> hugepages that failed the batch PMD split (as we only defer the PTE remap tlb
>>>> flush in this stage).
>>>
>>> Oh, yes. Maybe we could break the above traversal as early as possible
>>> once we enter an ENOMEM?
>>>
>>
>> Sounds good -- no point in keep trying to split if we are failing with OOM.
>>
>> Perhaps a comment in both of these clauses (the early break on split and the OOM
>> handling in batch optimize) could help make this clear.
> 
> Make sense.

These are the changes I have for this patch based on the discussion so far.
For the next patch, the changes are at the end:

diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index e8bc2f7567db..d9c6f2cf698c 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -27,7 +27,8 @@
  * @reuse_addr:                the virtual address of the @reuse_page page.
  * @vmemmap_pages:     the list head of the vmemmap pages that can be freed
  *                     or is mapped from.
- * @flags:             used to modify behavior in bulk operations
+ * @flags:             used to modify behavior in vmemmap page table walking
+ *                     operations.
  */
 struct vmemmap_remap_walk {
        void                    (*remap_pte)(pte_t *pte, unsigned long addr,
@@ -36,6 +37,8 @@ struct vmemmap_remap_walk {
        struct page             *reuse_page;
        unsigned long           reuse_addr;
        struct list_head        *vmemmap_pages;
+
+/* Skip the TLB flush when we split the PMD */
 #define VMEMMAP_SPLIT_NO_TLB_FLUSH     BIT(0)
        unsigned long           flags;
 };
@@ -132,7 +135,7 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
                int ret;

                ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK,
-                               walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH);
+                               !(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH));
                if (ret)
                        return ret;

@@ -677,13 +680,13 @@ void hugetlb_vmemmap_optimize(const struct hstate *h,
struct page *head)
        free_vmemmap_page_list(&vmemmap_pages);
 }

-static void hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
+static int hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
 {
        unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
        unsigned long vmemmap_reuse;

        if (!vmemmap_should_optimize(h, head))
-               return;
+               return 0;

        vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
        vmemmap_reuse   = vmemmap_start;
@@ -693,7 +696,7 @@ static void hugetlb_vmemmap_split(const struct hstate *h,
struct page *head)
         * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
         * @vmemmap_end]
         */
-       vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
+       return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
 }

 void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head
*folio_list)
@@ -701,8 +704,18 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h,
struct list_head *folio_l
        struct folio *folio;
        LIST_HEAD(vmemmap_pages);

-       list_for_each_entry(folio, folio_list, lru)
-               hugetlb_vmemmap_split(h, &folio->page);
+       list_for_each_entry(folio, folio_list, lru) {
+               int ret = hugetlb_vmemmap_split(h, &folio->page);
+
+               /*
+                * Spliting the PMD requires allocating a page, thus lets fail
+                * early once we encounter the first OOM. No point in retrying
+                * as it can be dynamically done on remap with the memory
+                * we get back from the vmemmap deduplication.
+                */
+               if (ret == -ENOMEM)
+                       break;
+       }

        flush_tlb_all();

For patch 7, I have only added commentary derived from the earlier discussion
above:

diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index d9c6f2cf698c..f6a1020a4b6a 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -40,6 +40,8 @@ struct vmemmap_remap_walk {

/* Skip the TLB flush when we split the PMD */
#define VMEMMAP_SPLIT_NO_TLB_FLUSH     BIT(0)
+/* Skip the TLB flush when we remap the PTE */
#define VMEMMAP_REMAP_NO_TLB_FLUSH     BIT(1)
        unsigned long           flags;
 };

@@ -721,19 +739,28 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h,
struct list_head *folio_l

        list_for_each_entry(folio, folio_list, lru) {
                int ret = __hugetlb_vmemmap_optimize(h, &folio->page,
                                               &vmemmap_pages,
                                               VMEMMAP_REMAP_NO_TLB_FLUSH);

                /*
                 * Pages to be freed may have been accumulated.  If we
                 * encounter an ENOMEM,  free what we have and try again.
+                * This can occur in the case that both spliting fails
+                * halfway and head page allocation also failed. In this
+                * case __hugetlb_vmemmap_optimize() would free memory
+                * allowing more vmemmap remaps to occur.
                 */
                if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
Muchun Song Sept. 20, 2023, 2:47 a.m. UTC | #9
> On Sep 19, 2023, at 23:09, Joao Martins <joao.m.martins@oracle.com> wrote:
> 
> On 19/09/2023 09:57, Muchun Song wrote:
>>> On Sep 19, 2023, at 16:55, Joao Martins <joao.m.martins@oracle.com> wrote:
>>> On 19/09/2023 09:41, Muchun Song wrote:
>>>>> On Sep 19, 2023, at 16:26, Joao Martins <joao.m.martins@oracle.com> wrote:
>>>>> On 19/09/2023 07:42, Muchun Song wrote:
>>>>>> On 2023/9/19 07:01, Mike Kravetz wrote:
>>>>>>>    list_for_each_entry(folio, folio_list, lru) {
>>>>>>>        int ret = __hugetlb_vmemmap_optimize(h, &folio->page,
>>>>>>>                                &vmemmap_pages);
>>>>>> 
>>>>>> This is unlikely to be failed since the page table allocation
>>>>>> is moved to the above 
>>>>> 
>>>>>> (Note that the head vmemmap page allocation
>>>>>> is not mandatory). 
>>>>> 
>>>>> Good point that I almost forgot
>>>>> 
>>>>>> So we should handle the error case in the above
>>>>>> splitting operation.
>>>>> 
>>>>> But back to the previous discussion in v2... the thinking was that /some/ PMDs
>>>>> got split, and say could allow some PTE remapping to occur and free some pages
>>>>> back (each page allows 6 more splits worst case). Then the next
>>>>> __hugetlb_vmemmap_optimize() will have to split PMD pages again for those
>>>>> hugepages that failed the batch PMD split (as we only defer the PTE remap tlb
>>>>> flush in this stage).
>>>> 
>>>> Oh, yes. Maybe we could break the above traversal as early as possible
>>>> once we enter an ENOMEM?
>>>> 
>>> 
>>> Sounds good -- no point in keep trying to split if we are failing with OOM.
>>> 
>>> Perhaps a comment in both of these clauses (the early break on split and the OOM
>>> handling in batch optimize) could help make this clear.
>> 
>> Make sense.
> 
> These are the changes I have so far for this patch based on the discussion so
> far. For next one it's at the end:

Code looks good to me. One nit below.

> 
> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> index e8bc2f7567db..d9c6f2cf698c 100644
> --- a/mm/hugetlb_vmemmap.c
> +++ b/mm/hugetlb_vmemmap.c
> @@ -27,7 +27,8 @@
>  * @reuse_addr:                the virtual address of the @reuse_page page.
>  * @vmemmap_pages:     the list head of the vmemmap pages that can be freed
>  *                     or is mapped from.
> - * @flags:             used to modify behavior in bulk operations
> + * @flags:             used to modify behavior in vmemmap page table walking
> + *                     operations.
>  */
> struct vmemmap_remap_walk {
>        void                    (*remap_pte)(pte_t *pte, unsigned long addr,
> @@ -36,6 +37,8 @@ struct vmemmap_remap_walk {
>        struct page             *reuse_page;
>        unsigned long           reuse_addr;
>        struct list_head        *vmemmap_pages;
> +
> +/* Skip the TLB flush when we split the PMD */
> #define VMEMMAP_SPLIT_NO_TLB_FLUSH     BIT(0)
>        unsigned long           flags;
> };
> @@ -132,7 +135,7 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
>                int ret;
> 
>                ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK,
> -                               walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH);
> +                               !(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH));
>                if (ret)
>                        return ret;
> 
> @@ -677,13 +680,13 @@ void hugetlb_vmemmap_optimize(const struct hstate *h,
> struct page *head)
>        free_vmemmap_page_list(&vmemmap_pages);
> }
> 
> -static void hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
> +static int hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
> {
>        unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
>        unsigned long vmemmap_reuse;
> 
>        if (!vmemmap_should_optimize(h, head))
> -               return;
> +               return 0;
> 
>        vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
>        vmemmap_reuse   = vmemmap_start;
> @@ -693,7 +696,7 @@ static void hugetlb_vmemmap_split(const struct hstate *h,
> struct page *head)
>         * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
>         * @vmemmap_end]
>         */
> -       vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
> +       return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
> }
> 
> void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head
> *folio_list)
> @@ -701,8 +704,18 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h,
> struct list_head *folio_l
>        struct folio *folio;
>        LIST_HEAD(vmemmap_pages);
> 
> -       list_for_each_entry(folio, folio_list, lru)
> -               hugetlb_vmemmap_split(h, &folio->page);
> +       list_for_each_entry(folio, folio_list, lru) {
> +               int ret = hugetlb_vmemmap_split(h, &folio->page);
> +
> +               /*
> +                * Spliting the PMD requires allocating a page, thus lets fail
                      ^^^^                                 ^^^
                    Splitting                           page table page

I'd like to specify the functionality of the allocated page.

> +                * early once we encounter the first OOM. No point in retrying
> +                * as it can be dynamically done on remap with the memory
> +                * we get back from the vmemmap deduplication.
> +                */
> +               if (ret == -ENOMEM)
> +                       break;
> +       }
> 
>        flush_tlb_all();
> 
> For patch 7, I only have commentary added derived from this earlier discussion
> above:
> 
> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> index d9c6f2cf698c..f6a1020a4b6a 100644
> --- a/mm/hugetlb_vmemmap.c
> +++ b/mm/hugetlb_vmemmap.c
> @@ -40,6 +40,8 @@ struct vmemmap_remap_walk {
> 
> /* Skip the TLB flush when we split the PMD */
> #define VMEMMAP_SPLIT_NO_TLB_FLUSH     BIT(0)
> +/* Skip the TLB flush when we remap the PTE */
> #define VMEMMAP_REMAP_NO_TLB_FLUSH     BIT(1)
>        unsigned long           flags;
> };
> 
> @@ -721,19 +739,28 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h,
> struct list_head *folio_l
> 
>        list_for_each_entry(folio, folio_list, lru) {
>                int ret = __hugetlb_vmemmap_optimize(h, &folio->page,
>                                               &vmemmap_pages,
>                                               VMEMMAP_REMAP_NO_TLB_FLUSH);
> 
>                /*
>                 * Pages to be freed may have been accumulated.  If we
>                 * encounter an ENOMEM,  free what we have and try again.
> +                * This can occur in the case that both spliting fails
                                                            ^^^
                                                         splitting

> +                * halfway and head page allocation also failed. In this
                                 ^^^^^^^
                            head vmemmap page

Otherwise:

Reviewed-by: Muchun Song <songmuchun@bytedance.com>

Thanks.

> +                * case __hugetlb_vmemmap_optimize() would free memory
> +                * allowing more vmemmap remaps to occur.
>                 */
>                if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
>
Joao Martins Sept. 20, 2023, 10:39 a.m. UTC | #10
On 20/09/2023 03:47, Muchun Song wrote:
>> On Sep 19, 2023, at 23:09, Joao Martins <joao.m.martins@oracle.com> wrote:
>> On 19/09/2023 09:57, Muchun Song wrote:
>>>> On Sep 19, 2023, at 16:55, Joao Martins <joao.m.martins@oracle.com> wrote:
>>>> On 19/09/2023 09:41, Muchun Song wrote:
>>>>>> On Sep 19, 2023, at 16:26, Joao Martins <joao.m.martins@oracle.com> wrote:
>>>>>> On 19/09/2023 07:42, Muchun Song wrote:
>>>>>>> On 2023/9/19 07:01, Mike Kravetz wrote:
>>>>>>>>    list_for_each_entry(folio, folio_list, lru) {
>>>>>>>>        int ret = __hugetlb_vmemmap_optimize(h, &folio->page,
>>>>>>>>                                &vmemmap_pages);
>>>>>>>
>>>>>>> This is unlikely to be failed since the page table allocation
>>>>>>> is moved to the above 
>>>>>>
>>>>>>> (Note that the head vmemmap page allocation
>>>>>>> is not mandatory). 
>>>>>>
>>>>>> Good point that I almost forgot
>>>>>>
>>>>>>> So we should handle the error case in the above
>>>>>>> splitting operation.
>>>>>>
>>>>>> But back to the previous discussion in v2... the thinking was that /some/ PMDs
>>>>>> got split, and say could allow some PTE remapping to occur and free some pages
>>>>>> back (each page allows 6 more splits worst case). Then the next
>>>>>> __hugetlb_vmemmap_optimize() will have to split PMD pages again for those
>>>>>> hugepages that failed the batch PMD split (as we only defer the PTE remap tlb
>>>>>> flush in this stage).
>>>>>
>>>>> Oh, yes. Maybe we could break the above traversal as early as possible
>>>>> once we enter an ENOMEM?
>>>>>
>>>>
>>>> Sounds good -- no point in keep trying to split if we are failing with OOM.
>>>>
>>>> Perhaps a comment in both of these clauses (the early break on split and the OOM
>>>> handling in batch optimize) could help make this clear.
>>>
>>> Make sense.
>>
>> These are the changes I have so far for this patch based on the discussion so
>> far. For next one it's at the end:
> 
> Code looks good to me. One nit below.
> 
Thanks

>>
>> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
>> index e8bc2f7567db..d9c6f2cf698c 100644
>> --- a/mm/hugetlb_vmemmap.c
>> +++ b/mm/hugetlb_vmemmap.c
>> @@ -27,7 +27,8 @@
>>  * @reuse_addr:                the virtual address of the @reuse_page page.
>>  * @vmemmap_pages:     the list head of the vmemmap pages that can be freed
>>  *                     or is mapped from.
>> - * @flags:             used to modify behavior in bulk operations
>> + * @flags:             used to modify behavior in vmemmap page table walking
>> + *                     operations.
>>  */
>> struct vmemmap_remap_walk {
>>        void                    (*remap_pte)(pte_t *pte, unsigned long addr,
>> @@ -36,6 +37,8 @@ struct vmemmap_remap_walk {
>>        struct page             *reuse_page;
>>        unsigned long           reuse_addr;
>>        struct list_head        *vmemmap_pages;
>> +
>> +/* Skip the TLB flush when we split the PMD */
>> #define VMEMMAP_SPLIT_NO_TLB_FLUSH     BIT(0)
>>        unsigned long           flags;
>> };
>> @@ -132,7 +135,7 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
>>                int ret;
>>
>>                ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK,
>> -                               walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH);
>> +                               !(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH));
>>                if (ret)
>>                        return ret;
>>
>> @@ -677,13 +680,13 @@ void hugetlb_vmemmap_optimize(const struct hstate *h,
>> struct page *head)
>>        free_vmemmap_page_list(&vmemmap_pages);
>> }
>>
>> -static void hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
>> +static int hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
>> {
>>        unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
>>        unsigned long vmemmap_reuse;
>>
>>        if (!vmemmap_should_optimize(h, head))
>> -               return;
>> +               return 0;
>>
>>        vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
>>        vmemmap_reuse   = vmemmap_start;
>> @@ -693,7 +696,7 @@ static void hugetlb_vmemmap_split(const struct hstate *h,
>> struct page *head)
>>         * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
>>         * @vmemmap_end]
>>         */
>> -       vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
>> +       return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
>> }
>>
>> void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head
>> *folio_list)
>> @@ -701,8 +704,18 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h,
>> struct list_head *folio_l
>>        struct folio *folio;
>>        LIST_HEAD(vmemmap_pages);
>>
>> -       list_for_each_entry(folio, folio_list, lru)
>> -               hugetlb_vmemmap_split(h, &folio->page);
>> +       list_for_each_entry(folio, folio_list, lru) {
>> +               int ret = hugetlb_vmemmap_split(h, &folio->page);
>> +
>> +               /*
>> +                * Spliting the PMD requires allocating a page, thus lets fail
>                       ^^^^                                 ^^^
>                     Splitting                           page table page
> 
> I'd like to specify the functionality of the allocated page.
> 
OK

>> +                * early once we encounter the first OOM. No point in retrying
>> +                * as it can be dynamically done on remap with the memory
>> +                * we get back from the vmemmap deduplication.
>> +                */
>> +               if (ret == -ENOMEM)
>> +                       break;
>> +       }
>>
>>        flush_tlb_all();
>>
>> For patch 7, I only have commentary added derived from this earlier discussion
>> above:
>>
>> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
>> index d9c6f2cf698c..f6a1020a4b6a 100644
>> --- a/mm/hugetlb_vmemmap.c
>> +++ b/mm/hugetlb_vmemmap.c
>> @@ -40,6 +40,8 @@ struct vmemmap_remap_walk {
>>
>> /* Skip the TLB flush when we split the PMD */
>> #define VMEMMAP_SPLIT_NO_TLB_FLUSH     BIT(0)
>> +/* Skip the TLB flush when we remap the PTE */
>> #define VMEMMAP_REMAP_NO_TLB_FLUSH     BIT(1)
>>        unsigned long           flags;
>> };
>>
>> @@ -721,19 +739,28 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h,
>> struct list_head *folio_l
>>
>>        list_for_each_entry(folio, folio_list, lru) {
>>                int ret = __hugetlb_vmemmap_optimize(h, &folio->page,
>>                                               &vmemmap_pages,
>>                                               VMEMMAP_REMAP_NO_TLB_FLUSH);
>>
>>                /*
>>                 * Pages to be freed may have been accumulated.  If we
>>                 * encounter an ENOMEM,  free what we have and try again.
>> +                * This can occur in the case that both spliting fails
>                                                             ^^^
>                                                          splitting
> 

ok

>> +                * halfway and head page allocation also failed. In this
>                                  ^^^^^^^
>                             head vmemmap page
> 
ok

> Otherwise:
> 
> Reviewed-by: Muchun Song <songmuchun@bytedance.com>
> 

Thanks, I assume that's for both patches?

> Thanks.
> 
>> +                * case __hugetlb_vmemmap_optimize() would free memory
>> +                * allowing more vmemmap remaps to occur.
>>                 */
>>                if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
>>
> 
>
Muchun Song Sept. 21, 2023, 1:42 a.m. UTC | #11
> On Sep 20, 2023, at 18:39, Joao Martins <joao.m.martins@oracle.com> wrote:
> 
> On 20/09/2023 03:47, Muchun Song wrote:
>>> On Sep 19, 2023, at 23:09, Joao Martins <joao.m.martins@oracle.com> wrote:
>>> On 19/09/2023 09:57, Muchun Song wrote:
>>>>> On Sep 19, 2023, at 16:55, Joao Martins <joao.m.martins@oracle.com> wrote:
>>>>> On 19/09/2023 09:41, Muchun Song wrote:
>>>>>>> On Sep 19, 2023, at 16:26, Joao Martins <joao.m.martins@oracle.com> wrote:
>>>>>>> On 19/09/2023 07:42, Muchun Song wrote:
>>>>>>>> On 2023/9/19 07:01, Mike Kravetz wrote:
>>>>>>>>>   list_for_each_entry(folio, folio_list, lru) {
>>>>>>>>>       int ret = __hugetlb_vmemmap_optimize(h, &folio->page,
>>>>>>>>>                               &vmemmap_pages);
>>>>>>>> 
>>>>>>>> This is unlikely to be failed since the page table allocation
>>>>>>>> is moved to the above 
>>>>>>> 
>>>>>>>> (Note that the head vmemmap page allocation
>>>>>>>> is not mandatory). 
>>>>>>> 
>>>>>>> Good point that I almost forgot
>>>>>>> 
>>>>>>>> So we should handle the error case in the above
>>>>>>>> splitting operation.
>>>>>>> 
>>>>>>> But back to the previous discussion in v2... the thinking was that /some/ PMDs
>>>>>>> got split, and say could allow some PTE remapping to occur and free some pages
>>>>>>> back (each page allows 6 more splits worst case). Then the next
>>>>>>> __hugetlb_vmemmap_optimize() will have to split PMD pages again for those
>>>>>>> hugepages that failed the batch PMD split (as we only defer the PTE remap tlb
>>>>>>> flush in this stage).
>>>>>> 
>>>>>> Oh, yes. Maybe we could break the above traversal as early as possible
>>>>>> once we enter an ENOMEM?
>>>>>> 
>>>>> 
>>>>> Sounds good -- no point in keep trying to split if we are failing with OOM.
>>>>> 
>>>>> Perhaps a comment in both of these clauses (the early break on split and the OOM
>>>>> handling in batch optimize) could help make this clear.
>>>> 
>>>> Make sense.
>>> 
>>> These are the changes I have so far for this patch based on the discussion so
>>> far. For next one it's at the end:
>> 
>> Code looks good to me. One nit below.
>> 
> Thanks
> 
>>> 
>>> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
>>> index e8bc2f7567db..d9c6f2cf698c 100644
>>> --- a/mm/hugetlb_vmemmap.c
>>> +++ b/mm/hugetlb_vmemmap.c
>>> @@ -27,7 +27,8 @@
>>> * @reuse_addr:                the virtual address of the @reuse_page page.
>>> * @vmemmap_pages:     the list head of the vmemmap pages that can be freed
>>> *                     or is mapped from.
>>> - * @flags:             used to modify behavior in bulk operations
>>> + * @flags:             used to modify behavior in vmemmap page table walking
>>> + *                     operations.
>>> */
>>> struct vmemmap_remap_walk {
>>>       void                    (*remap_pte)(pte_t *pte, unsigned long addr,
>>> @@ -36,6 +37,8 @@ struct vmemmap_remap_walk {
>>>       struct page             *reuse_page;
>>>       unsigned long           reuse_addr;
>>>       struct list_head        *vmemmap_pages;
>>> +
>>> +/* Skip the TLB flush when we split the PMD */
>>> #define VMEMMAP_SPLIT_NO_TLB_FLUSH     BIT(0)
>>>       unsigned long           flags;
>>> };
>>> @@ -132,7 +135,7 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
>>>               int ret;
>>> 
>>>               ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK,
>>> -                               walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH);
>>> +                               !(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH));
>>>               if (ret)
>>>                       return ret;
>>> 
>>> @@ -677,13 +680,13 @@ void hugetlb_vmemmap_optimize(const struct hstate *h,
>>> struct page *head)
>>>       free_vmemmap_page_list(&vmemmap_pages);
>>> }
>>> 
>>> -static void hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
>>> +static int hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
>>> {
>>>       unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
>>>       unsigned long vmemmap_reuse;
>>> 
>>>       if (!vmemmap_should_optimize(h, head))
>>> -               return;
>>> +               return 0;
>>> 
>>>       vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
>>>       vmemmap_reuse   = vmemmap_start;
>>> @@ -693,7 +696,7 @@ static void hugetlb_vmemmap_split(const struct hstate *h,
>>> struct page *head)
>>>        * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
>>>        * @vmemmap_end]
>>>        */
>>> -       vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
>>> +       return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
>>> }
>>> 
>>> void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head
>>> *folio_list)
>>> @@ -701,8 +704,18 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h,
>>> struct list_head *folio_l
>>>       struct folio *folio;
>>>       LIST_HEAD(vmemmap_pages);
>>> 
>>> -       list_for_each_entry(folio, folio_list, lru)
>>> -               hugetlb_vmemmap_split(h, &folio->page);
>>> +       list_for_each_entry(folio, folio_list, lru) {
>>> +               int ret = hugetlb_vmemmap_split(h, &folio->page);
>>> +
>>> +               /*
>>> +                * Spliting the PMD requires allocating a page, thus lets fail
>>                      ^^^^                                 ^^^
>>                    Splitting                           page table page
>> 
>> I'd like to specify the functionality of the allocated page.
>> 
> OK
> 
>>> +                * early once we encounter the first OOM. No point in retrying
>>> +                * as it can be dynamically done on remap with the memory
>>> +                * we get back from the vmemmap deduplication.
>>> +                */
>>> +               if (ret == -ENOMEM)
>>> +                       break;
>>> +       }
>>> 
>>>       flush_tlb_all();
>>> 
>>> For patch 7, I only have commentary added derived from this earlier discussion
>>> above:
>>> 
>>> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
>>> index d9c6f2cf698c..f6a1020a4b6a 100644
>>> --- a/mm/hugetlb_vmemmap.c
>>> +++ b/mm/hugetlb_vmemmap.c
>>> @@ -40,6 +40,8 @@ struct vmemmap_remap_walk {
>>> 
>>> /* Skip the TLB flush when we split the PMD */
>>> #define VMEMMAP_SPLIT_NO_TLB_FLUSH     BIT(0)
>>> +/* Skip the TLB flush when we remap the PTE */
>>> #define VMEMMAP_REMAP_NO_TLB_FLUSH     BIT(1)
>>>       unsigned long           flags;
>>> };
>>> 
>>> @@ -721,19 +739,28 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h,
>>> struct list_head *folio_l
>>> 
>>>       list_for_each_entry(folio, folio_list, lru) {
>>>               int ret = __hugetlb_vmemmap_optimize(h, &folio->page,
>>>                                              &vmemmap_pages,
>>>                                              VMEMMAP_REMAP_NO_TLB_FLUSH);
>>> 
>>>               /*
>>>                * Pages to be freed may have been accumulated.  If we
>>>                * encounter an ENOMEM,  free what we have and try again.
>>> +                * This can occur in the case that both spliting fails
>>                                                            ^^^
>>                                                         splitting
>> 
> 
> ok
> 
>>> +                * halfway and head page allocation also failed. In this
>>                                 ^^^^^^^
>>                            head vmemmap page
>> 
> ok
> 
>> Otherwise:
>> 
>> Reviewed-by: Muchun Song <songmuchun@bytedance.com>
>> 
> 
> Thanks, I assume that's for both patches?

Yes. Thanks.

> 
>> Thanks.
>> 
>>> +                * case __hugetlb_vmemmap_optimize() would free memory
>>> +                * allowing more vmemmap remaps to occur.
>>>                */
>>>               if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {

Patch

diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 147ed15bcae4..e8bc2f7567db 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -27,6 +27,7 @@ 
  * @reuse_addr:		the virtual address of the @reuse_page page.
  * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
  *			or is mapped from.
+ * @flags:		used to modify behavior in bulk operations
  */
 struct vmemmap_remap_walk {
 	void			(*remap_pte)(pte_t *pte, unsigned long addr,
@@ -35,9 +36,11 @@  struct vmemmap_remap_walk {
 	struct page		*reuse_page;
 	unsigned long		reuse_addr;
 	struct list_head	*vmemmap_pages;
+#define VMEMMAP_SPLIT_NO_TLB_FLUSH	BIT(0)
+	unsigned long		flags;
 };
 
-static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
+static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool flush)
 {
 	pmd_t __pmd;
 	int i;
@@ -80,7 +83,8 @@  static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
 		/* Make pte visible before pmd. See comment in pmd_install(). */
 		smp_wmb();
 		pmd_populate_kernel(&init_mm, pmd, pgtable);
-		flush_tlb_kernel_range(start, start + PMD_SIZE);
+		if (flush)
+			flush_tlb_kernel_range(start, start + PMD_SIZE);
 	} else {
 		pte_free_kernel(&init_mm, pgtable);
 	}
@@ -127,11 +131,20 @@  static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
 	do {
 		int ret;
 
-		ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
+		ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK,
+				walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH);
 		if (ret)
 			return ret;
 
 		next = pmd_addr_end(addr, end);
+
+		/*
+		 * We are only splitting, not remapping the hugetlb vmemmap
+		 * pages.
+		 */
+		if (!walk->remap_pte)
+			continue;
+
 		vmemmap_pte_range(pmd, addr, next, walk);
 	} while (pmd++, addr = next, addr != end);
 
@@ -198,7 +211,8 @@  static int vmemmap_remap_range(unsigned long start, unsigned long end,
 			return ret;
 	} while (pgd++, addr = next, addr != end);
 
-	flush_tlb_kernel_range(start, end);
+	if (walk->remap_pte)
+		flush_tlb_kernel_range(start, end);
 
 	return 0;
 }
@@ -300,6 +314,36 @@  static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
 	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
 }
 
+/**
+ * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end)
+ *                      backing PMDs of the directmap into PTEs
+ * @start:     start address of the vmemmap virtual address range that we want
+ *             to remap.
+ * @end:       end address of the vmemmap virtual address range that we want to
+ *             remap.
+ * @reuse:     reuse address.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+static int vmemmap_remap_split(unsigned long start, unsigned long end,
+				unsigned long reuse)
+{
+	int ret;
+	struct vmemmap_remap_walk walk = {
+		.remap_pte	= NULL,
+		.flags		= VMEMMAP_SPLIT_NO_TLB_FLUSH,
+	};
+
+	/* See the comment in the vmemmap_remap_free(). */
+	BUG_ON(start - reuse != PAGE_SIZE);
+
+	mmap_read_lock(&init_mm);
+	ret = vmemmap_remap_range(reuse, end, &walk);
+	mmap_read_unlock(&init_mm);
+
+	return ret;
+}
+
 /**
  * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
  *			to the page which @reuse is mapped to, then free vmemmap
@@ -323,6 +367,7 @@  static int vmemmap_remap_free(unsigned long start, unsigned long end,
 		.remap_pte	= vmemmap_remap_pte,
 		.reuse_addr	= reuse,
 		.vmemmap_pages	= vmemmap_pages,
+		.flags		= 0,
 	};
 	int nid = page_to_nid((struct page *)reuse);
 	gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
@@ -371,6 +416,7 @@  static int vmemmap_remap_free(unsigned long start, unsigned long end,
 			.remap_pte	= vmemmap_restore_pte,
 			.reuse_addr	= reuse,
 			.vmemmap_pages	= vmemmap_pages,
+			.flags		= 0,
 		};
 
 		vmemmap_remap_range(reuse, end, &walk);
@@ -422,6 +468,7 @@  static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
 		.remap_pte	= vmemmap_restore_pte,
 		.reuse_addr	= reuse,
 		.vmemmap_pages	= &vmemmap_pages,
+		.flags		= 0,
 	};
 
 	/* See the comment in the vmemmap_remap_free(). */
@@ -630,11 +677,35 @@  void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
 	free_vmemmap_page_list(&vmemmap_pages);
 }
 
+static void hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
+{
+	unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
+	unsigned long vmemmap_reuse;
+
+	if (!vmemmap_should_optimize(h, head))
+		return;
+
+	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
+	vmemmap_reuse	= vmemmap_start;
+	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;
+
+	/*
+	 * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
+	 * @vmemmap_end]
+	 */
+	vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
+}
+
 void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
 {
 	struct folio *folio;
 	LIST_HEAD(vmemmap_pages);
 
+	list_for_each_entry(folio, folio_list, lru)
+		hugetlb_vmemmap_split(h, &folio->page);
+
+	flush_tlb_all();
+
 	list_for_each_entry(folio, folio_list, lru) {
 		int ret = __hugetlb_vmemmap_optimize(h, &folio->page,
 								&vmemmap_pages);