
[v2,09/11] hugetlb: batch PMD split for bulk vmemmap dedup

Message ID 20230905214412.89152-10-mike.kravetz@oracle.com (mailing list archive)
State New
Series Batch hugetlb vmemmap modification operations

Commit Message

Mike Kravetz Sept. 5, 2023, 9:44 p.m. UTC
From: Joao Martins <joao.m.martins@oracle.com>

In an effort to minimize amount of TLB flushes, batch all PMD splits
belonging to a range of pages in order to perform only 1 (global) TLB
flush.

Rebased and updated by Mike Kravetz

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 mm/hugetlb_vmemmap.c | 72 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 68 insertions(+), 4 deletions(-)

Comments

Muchun Song Sept. 6, 2023, 8:24 a.m. UTC | #1
On 2023/9/6 05:44, Mike Kravetz wrote:
> From: Joao Martins <joao.m.martins@oracle.com>
>
> In an effort to minimize amount of TLB flushes, batch all PMD splits
> belonging to a range of pages in order to perform only 1 (global) TLB
> flush.
>
> Rebased and updated by Mike Kravetz
>
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> ---
>   mm/hugetlb_vmemmap.c | 72 +++++++++++++++++++++++++++++++++++++++++---
>   1 file changed, 68 insertions(+), 4 deletions(-)
>
> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> index a715712df831..d956551699bc 100644
> --- a/mm/hugetlb_vmemmap.c
> +++ b/mm/hugetlb_vmemmap.c
> @@ -37,7 +37,7 @@ struct vmemmap_remap_walk {
>   	struct list_head	*vmemmap_pages;
>   };
>   
> -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
> +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool flush)
>   {
>   	pmd_t __pmd;
>   	int i;
> @@ -80,7 +80,8 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
>   		/* Make pte visible before pmd. See comment in pmd_install(). */
>   		smp_wmb();
>   		pmd_populate_kernel(&init_mm, pmd, pgtable);
> -		flush_tlb_kernel_range(start, start + PMD_SIZE);
> +		if (flush)
> +			flush_tlb_kernel_range(start, start + PMD_SIZE);
>   	} else {
>   		pte_free_kernel(&init_mm, pgtable);
>   	}
> @@ -127,11 +128,20 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
>   	do {
>   		int ret;
>   
> -		ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
> +		ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK,
> +				walk->remap_pte != NULL);

It is better to only make @walk->remap_pte indicate whether we should go
to the last page table level. I suggest reusing VMEMMAP_NO_TLB_FLUSH
to indicate whether we should flush the TLB at pmd level. It'll be more 
clear.
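
i.e. something along these lines (rough sketch only, not in this v2 patch;
it assumes struct vmemmap_remap_walk carries a flags field and reuses the
VMEMMAP_NO_TLB_FLUSH name from the earlier revision of this series):

	/* in vmemmap_pmd_range()'s do { } while () loop */
	ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK,
				     !(walk->flags & VMEMMAP_NO_TLB_FLUSH));
	if (ret)
		return ret;

	next = pmd_addr_end(addr, end);

	/* @walk->remap_pte only decides whether to descend to the PTE level */
	if (!walk->remap_pte)
		continue;

	vmemmap_pte_range(pmd, addr, next, walk);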

>   		if (ret)
>   			return ret;
>   
>   		next = pmd_addr_end(addr, end);
> +
> +		/*
> +		 * We are only splitting, not remapping the hugetlb vmemmap
> +		 * pages.
> +		 */
> +		if (!walk->remap_pte)
> +			continue;
> +
>   		vmemmap_pte_range(pmd, addr, next, walk);
>   	} while (pmd++, addr = next, addr != end);
>   
> @@ -198,7 +208,8 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
>   			return ret;
>   	} while (pgd++, addr = next, addr != end);
>   
> -	flush_tlb_kernel_range(start, end);
> +	if (walk->remap_pte)
> +		flush_tlb_kernel_range(start, end);
>   
>   	return 0;
>   }
> @@ -297,6 +308,35 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
>   	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
>   }
>   
> +/**
> + * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end)
> + *                      backing PMDs of the directmap into PTEs
> + * @start:     start address of the vmemmap virtual address range that we want
> + *             to remap.
> + * @end:       end address of the vmemmap virtual address range that we want to
> + *             remap.
> + * @reuse:     reuse address.
> + *
> + * Return: %0 on success, negative error code otherwise.
> + */
> +static int vmemmap_remap_split(unsigned long start, unsigned long end,
> +				unsigned long reuse)
> +{
> +	int ret;
> +	struct vmemmap_remap_walk walk = {
> +		.remap_pte	= NULL,
> +	};
> +
> +	/* See the comment in the vmemmap_remap_free(). */
> +	BUG_ON(start - reuse != PAGE_SIZE);
> +
> +	mmap_read_lock(&init_mm);
> +	ret = vmemmap_remap_range(reuse, end, &walk);
> +	mmap_read_unlock(&init_mm);
> +
> +	return ret;
> +}
> +
>   /**
>    * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
>    *			to the page which @reuse is mapped to, then free vmemmap
> @@ -602,11 +642,35 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
>   	free_vmemmap_page_list(&vmemmap_pages);
>   }
>   
> +static void hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
> +{
> +	unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
> +	unsigned long vmemmap_reuse;
> +
> +	if (!vmemmap_should_optimize(h, head))
> +		return;
> +
> +	vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
> +	vmemmap_reuse   = vmemmap_start;
> +	vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;
> +
> +	/*
> +	 * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
> +	 * @vmemmap_end]
> +	 */
> +	vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
> +}
> +
>   void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
>   {
>   	struct folio *folio;
>   	LIST_HEAD(vmemmap_pages);
>   
> +	list_for_each_entry(folio, folio_list, lru)
> +		hugetlb_vmemmap_split(h, &folio->page);

Maybe it is reasonable to add a return value to hugetlb_vmemmap_split()
to indicate whether it has done successfully, if it fails, it must be
OOM, in which case, there is no sense to continue to split the page table
and optimize the vmemmap pages subsequently, right?

Thanks.

> +
> +	flush_tlb_all();
> +
>   	list_for_each_entry(folio, folio_list, lru)
>   		__hugetlb_vmemmap_optimize(h, &folio->page, &vmemmap_pages);
>
Muchun Song Sept. 6, 2023, 9:11 a.m. UTC | #2
On Wed, Sep 6, 2023 at 4:25 PM Muchun Song <muchun.song@linux.dev> wrote:
>
>
>
> On 2023/9/6 05:44, Mike Kravetz wrote:
> > From: Joao Martins <joao.m.martins@oracle.com>
> >
> > In an effort to minimize amount of TLB flushes, batch all PMD splits
> > belonging to a range of pages in order to perform only 1 (global) TLB
> > flush.
> >
> > Rebased and updated by Mike Kravetz
> >
> > Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> > Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> > ---
> >   mm/hugetlb_vmemmap.c | 72 +++++++++++++++++++++++++++++++++++++++++---
> >   1 file changed, 68 insertions(+), 4 deletions(-)
> >
> > diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> > index a715712df831..d956551699bc 100644
> > --- a/mm/hugetlb_vmemmap.c
> > +++ b/mm/hugetlb_vmemmap.c
> > @@ -37,7 +37,7 @@ struct vmemmap_remap_walk {
> >       struct list_head        *vmemmap_pages;
> >   };
> >
> > -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
> > +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool flush)
> >   {
> >       pmd_t __pmd;
> >       int i;
> > @@ -80,7 +80,8 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
> >               /* Make pte visible before pmd. See comment in pmd_install(). */
> >               smp_wmb();
> >               pmd_populate_kernel(&init_mm, pmd, pgtable);
> > -             flush_tlb_kernel_range(start, start + PMD_SIZE);
> > +             if (flush)
> > +                     flush_tlb_kernel_range(start, start + PMD_SIZE);
> >       } else {
> >               pte_free_kernel(&init_mm, pgtable);
> >       }
> > @@ -127,11 +128,20 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
> >       do {
> >               int ret;
> >
> > -             ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
> > +             ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK,
> > +                             walk->remap_pte != NULL);
>
> It is better to only make @walk->remap_pte indicate whether we should go
> to the last page table level. I suggest reusing VMEMMAP_NO_TLB_FLUSH
> to indicate whether we should flush the TLB at pmd level. It'll be more
> clear.
>
> >               if (ret)
> >                       return ret;
> >
> >               next = pmd_addr_end(addr, end);
> > +
> > +             /*
> > +              * We are only splitting, not remapping the hugetlb vmemmap
> > +              * pages.
> > +              */
> > +             if (!walk->remap_pte)
> > +                     continue;
> > +
> >               vmemmap_pte_range(pmd, addr, next, walk);
> >       } while (pmd++, addr = next, addr != end);
> >
> > @@ -198,7 +208,8 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
> >                       return ret;
> >       } while (pgd++, addr = next, addr != end);
> >
> > -     flush_tlb_kernel_range(start, end);
> > +     if (walk->remap_pte)
> > +             flush_tlb_kernel_range(start, end);
> >
> >       return 0;
> >   }
> > @@ -297,6 +308,35 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
> >       set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
> >   }
> >
> > +/**
> > + * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end)
> > + *                      backing PMDs of the directmap into PTEs
> > + * @start:     start address of the vmemmap virtual address range that we want
> > + *             to remap.
> > + * @end:       end address of the vmemmap virtual address range that we want to
> > + *             remap.
> > + * @reuse:     reuse address.
> > + *
> > + * Return: %0 on success, negative error code otherwise.
> > + */
> > +static int vmemmap_remap_split(unsigned long start, unsigned long end,
> > +                             unsigned long reuse)
> > +{
> > +     int ret;
> > +     struct vmemmap_remap_walk walk = {
> > +             .remap_pte      = NULL,
> > +     };
> > +
> > +     /* See the comment in the vmemmap_remap_free(). */
> > +     BUG_ON(start - reuse != PAGE_SIZE);
> > +
> > +     mmap_read_lock(&init_mm);
> > +     ret = vmemmap_remap_range(reuse, end, &walk);
> > +     mmap_read_unlock(&init_mm);
> > +
> > +     return ret;
> > +}
> > +
> >   /**
> >    * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
> >    *                  to the page which @reuse is mapped to, then free vmemmap
> > @@ -602,11 +642,35 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
> >       free_vmemmap_page_list(&vmemmap_pages);
> >   }
> >
> > +static void hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
> > +{
> > +     unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
> > +     unsigned long vmemmap_reuse;
> > +
> > +     if (!vmemmap_should_optimize(h, head))
> > +             return;
> > +
> > +     vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
> > +     vmemmap_reuse   = vmemmap_start;
> > +     vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;
> > +
> > +     /*
> > +      * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
> > +      * @vmemmap_end]
> > +      */
> > +     vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
> > +}
> > +
> >   void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
> >   {
> >       struct folio *folio;
> >       LIST_HEAD(vmemmap_pages);
> >
> > +     list_for_each_entry(folio, folio_list, lru)
> > +             hugetlb_vmemmap_split(h, &folio->page);
>
> Maybe it is reasonable to add a return value to hugetlb_vmemmap_split()
> to indicate whether it has done successfully, if it fails, it must be
> OOM, in which case, there is no sense to continue to split the page table
> and optimize the vmemmap pages subsequently, right?

Sorry, it is reasonable to continue to optimize the vmemmap pages
subsequently since it should succeed because those vmemmap pages
have been split successfully previously.

Seems we should continue to optimize vmemmap once hugetlb_vmemmap_split()
fails, then we will have more memory to continue to split. But it will
make hugetlb_vmemmap_optimize_folios() a little complex. I'd like to
hear you guys' opinions here.

Thanks.

>
> Thanks.
>
> > +
> > +     flush_tlb_all();
> > +
> >       list_for_each_entry(folio, folio_list, lru)
> >               __hugetlb_vmemmap_optimize(h, &folio->page, &vmemmap_pages);
> >
>
Joao Martins Sept. 6, 2023, 9:13 a.m. UTC | #3
On 06/09/2023 09:24, Muchun Song wrote:
> On 2023/9/6 05:44, Mike Kravetz wrote:
>> From: Joao Martins <joao.m.martins@oracle.com>
>>
>> In an effort to minimize amount of TLB flushes, batch all PMD splits
>> belonging to a range of pages in order to perform only 1 (global) TLB
>> flush.
>>
>> Rebased and updated by Mike Kravetz
>>
>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
>> ---
>>   mm/hugetlb_vmemmap.c | 72 +++++++++++++++++++++++++++++++++++++++++---
>>   1 file changed, 68 insertions(+), 4 deletions(-)
>>
>> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
>> index a715712df831..d956551699bc 100644
>> --- a/mm/hugetlb_vmemmap.c
>> +++ b/mm/hugetlb_vmemmap.c
>> @@ -37,7 +37,7 @@ struct vmemmap_remap_walk {
>>       struct list_head    *vmemmap_pages;
>>   };
>>   -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
>> +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool flush)
>>   {
>>       pmd_t __pmd;
>>       int i;
>> @@ -80,7 +80,8 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long
>> start)
>>           /* Make pte visible before pmd. See comment in pmd_install(). */
>>           smp_wmb();
>>           pmd_populate_kernel(&init_mm, pmd, pgtable);
>> -        flush_tlb_kernel_range(start, start + PMD_SIZE);
>> +        if (flush)
>> +            flush_tlb_kernel_range(start, start + PMD_SIZE);
>>       } else {
>>           pte_free_kernel(&init_mm, pgtable);
>>       }
>> @@ -127,11 +128,20 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long
>> addr,
>>       do {
>>           int ret;
>>   -        ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
>> +        ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK,
>> +                walk->remap_pte != NULL);
> 
> It is better to only make @walk->remap_pte indicate whether we should go
> to the last page table level. I suggest reusing VMEMMAP_NO_TLB_FLUSH
> to indicate whether we should flush the TLB at pmd level. It'll be more clear.
> 
Part of the reason I did this was to differentiate an explicit split()
from a split() occurring during a remap of a page. So we would batch the flush on
split, while flushing on each PMD on a remap. But OK, maybe this doesn't matter
much if we end up returning earlier down below as you suggest.

>>           if (ret)
>>               return ret;
>>             next = pmd_addr_end(addr, end);
>> +
>> +        /*
>> +         * We are only splitting, not remapping the hugetlb vmemmap
>> +         * pages.
>> +         */
>> +        if (!walk->remap_pte)
>> +            continue;
>> +
>>           vmemmap_pte_range(pmd, addr, next, walk);
>>       } while (pmd++, addr = next, addr != end);
>>   @@ -198,7 +208,8 @@ static int vmemmap_remap_range(unsigned long start,
>> unsigned long end,
>>               return ret;
>>       } while (pgd++, addr = next, addr != end);
>>   -    flush_tlb_kernel_range(start, end);
>> +    if (walk->remap_pte)
>> +        flush_tlb_kernel_range(start, end);
>>         return 0;
>>   }
>> @@ -297,6 +308,35 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long
>> addr,
>>       set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
>>   }
>>   +/**
>> + * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end)
>> + *                      backing PMDs of the directmap into PTEs
>> + * @start:     start address of the vmemmap virtual address range that we want
>> + *             to remap.
>> + * @end:       end address of the vmemmap virtual address range that we want to
>> + *             remap.
>> + * @reuse:     reuse address.
>> + *
>> + * Return: %0 on success, negative error code otherwise.
>> + */
>> +static int vmemmap_remap_split(unsigned long start, unsigned long end,
>> +                unsigned long reuse)
>> +{
>> +    int ret;
>> +    struct vmemmap_remap_walk walk = {
>> +        .remap_pte    = NULL,
>> +    };
>> +
>> +    /* See the comment in the vmemmap_remap_free(). */
>> +    BUG_ON(start - reuse != PAGE_SIZE);
>> +
>> +    mmap_read_lock(&init_mm);
>> +    ret = vmemmap_remap_range(reuse, end, &walk);
>> +    mmap_read_unlock(&init_mm);
>> +
>> +    return ret;
>> +}
>> +
>>   /**
>>    * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
>>    *            to the page which @reuse is mapped to, then free vmemmap
>> @@ -602,11 +642,35 @@ void hugetlb_vmemmap_optimize(const struct hstate *h,
>> struct page *head)
>>       free_vmemmap_page_list(&vmemmap_pages);
>>   }
>>   +static void hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
>> +{
>> +    unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
>> +    unsigned long vmemmap_reuse;
>> +
>> +    if (!vmemmap_should_optimize(h, head))
>> +        return;
>> +
>> +    vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
>> +    vmemmap_reuse   = vmemmap_start;
>> +    vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;
>> +
>> +    /*
>> +     * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
>> +     * @vmemmap_end]
>> +     */
>> +    vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
>> +}
>> +
>>   void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head
>> *folio_list)
>>   {
>>       struct folio *folio;
>>       LIST_HEAD(vmemmap_pages);
>>   +    list_for_each_entry(folio, folio_list, lru)
>> +        hugetlb_vmemmap_split(h, &folio->page);
> 
> Maybe it is reasonable to add a return value to hugetlb_vmemmap_split()
> to indicate whether it has done successfully, if it fails, it must be
> OOM, in which case, there is no sense to continue to split the page table
> and optimize the vmemmap pages subsequently, right?
> 
I suppose that makes sense. hugetlb_vmemmap_split() already returns the error;
it's just a matter of testing it and breaking out of the loop into flush_tlb_all()
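
i.e. something like (rough sketch, assuming hugetlb_vmemmap_split() is
changed to propagate the error from vmemmap_remap_split()):

	list_for_each_entry(folio, folio_list, lru) {
		int ret = hugetlb_vmemmap_split(h, &folio->page);

		/*
		 * A failure here is an allocation failure for the split
		 * page tables; no point in trying to split the rest, so
		 * stop and fall through to the global flush and the
		 * optimize pass.
		 */
		if (ret)
			break;
	}

	flush_tlb_all();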

> Thanks.
> 
>> +
>> +    flush_tlb_all();
>> +
>>       list_for_each_entry(folio, folio_list, lru)
>>           __hugetlb_vmemmap_optimize(h, &folio->page, &vmemmap_pages);
>>   
>
Joao Martins Sept. 6, 2023, 9:26 a.m. UTC | #4
On 06/09/2023 10:11, Muchun Song wrote:
> On Wed, Sep 6, 2023 at 4:25 PM Muchun Song <muchun.song@linux.dev> wrote:
>>
>>
>>
>> On 2023/9/6 05:44, Mike Kravetz wrote:
>>> From: Joao Martins <joao.m.martins@oracle.com>
>>>
>>> In an effort to minimize amount of TLB flushes, batch all PMD splits
>>> belonging to a range of pages in order to perform only 1 (global) TLB
>>> flush.
>>>
>>> Rebased and updated by Mike Kravetz
>>>
>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>>> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
>>> ---
>>>   mm/hugetlb_vmemmap.c | 72 +++++++++++++++++++++++++++++++++++++++++---
>>>   1 file changed, 68 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
>>> index a715712df831..d956551699bc 100644
>>> --- a/mm/hugetlb_vmemmap.c
>>> +++ b/mm/hugetlb_vmemmap.c
>>> @@ -37,7 +37,7 @@ struct vmemmap_remap_walk {
>>>       struct list_head        *vmemmap_pages;
>>>   };
>>>
>>> -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
>>> +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool flush)
>>>   {
>>>       pmd_t __pmd;
>>>       int i;
>>> @@ -80,7 +80,8 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
>>>               /* Make pte visible before pmd. See comment in pmd_install(). */
>>>               smp_wmb();
>>>               pmd_populate_kernel(&init_mm, pmd, pgtable);
>>> -             flush_tlb_kernel_range(start, start + PMD_SIZE);
>>> +             if (flush)
>>> +                     flush_tlb_kernel_range(start, start + PMD_SIZE);
>>>       } else {
>>>               pte_free_kernel(&init_mm, pgtable);
>>>       }
>>> @@ -127,11 +128,20 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
>>>       do {
>>>               int ret;
>>>
>>> -             ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
>>> +             ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK,
>>> +                             walk->remap_pte != NULL);
>>
>> It is better to only make @walk->remap_pte indicate whether we should go
>> to the last page table level. I suggest reusing VMEMMAP_NO_TLB_FLUSH
>> to indicate whether we should flush the TLB at pmd level. It'll be more
>> clear.
>>
>>>               if (ret)
>>>                       return ret;
>>>
>>>               next = pmd_addr_end(addr, end);
>>> +
>>> +             /*
>>> +              * We are only splitting, not remapping the hugetlb vmemmap
>>> +              * pages.
>>> +              */
>>> +             if (!walk->remap_pte)
>>> +                     continue;
>>> +
>>>               vmemmap_pte_range(pmd, addr, next, walk);
>>>       } while (pmd++, addr = next, addr != end);
>>>
>>> @@ -198,7 +208,8 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
>>>                       return ret;
>>>       } while (pgd++, addr = next, addr != end);
>>>
>>> -     flush_tlb_kernel_range(start, end);
>>> +     if (walk->remap_pte)
>>> +             flush_tlb_kernel_range(start, end);
>>>
>>>       return 0;
>>>   }
>>> @@ -297,6 +308,35 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
>>>       set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
>>>   }
>>>
>>> +/**
>>> + * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end)
>>> + *                      backing PMDs of the directmap into PTEs
>>> + * @start:     start address of the vmemmap virtual address range that we want
>>> + *             to remap.
>>> + * @end:       end address of the vmemmap virtual address range that we want to
>>> + *             remap.
>>> + * @reuse:     reuse address.
>>> + *
>>> + * Return: %0 on success, negative error code otherwise.
>>> + */
>>> +static int vmemmap_remap_split(unsigned long start, unsigned long end,
>>> +                             unsigned long reuse)
>>> +{
>>> +     int ret;
>>> +     struct vmemmap_remap_walk walk = {
>>> +             .remap_pte      = NULL,
>>> +     };
>>> +
>>> +     /* See the comment in the vmemmap_remap_free(). */
>>> +     BUG_ON(start - reuse != PAGE_SIZE);
>>> +
>>> +     mmap_read_lock(&init_mm);
>>> +     ret = vmemmap_remap_range(reuse, end, &walk);
>>> +     mmap_read_unlock(&init_mm);
>>> +
>>> +     return ret;
>>> +}
>>> +
>>>   /**
>>>    * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
>>>    *                  to the page which @reuse is mapped to, then free vmemmap
>>> @@ -602,11 +642,35 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
>>>       free_vmemmap_page_list(&vmemmap_pages);
>>>   }
>>>
>>> +static void hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
>>> +{
>>> +     unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
>>> +     unsigned long vmemmap_reuse;
>>> +
>>> +     if (!vmemmap_should_optimize(h, head))
>>> +             return;
>>> +
>>> +     vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
>>> +     vmemmap_reuse   = vmemmap_start;
>>> +     vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;
>>> +
>>> +     /*
>>> +      * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
>>> +      * @vmemmap_end]
>>> +      */
>>> +     vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
>>> +}
>>> +
>>>   void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
>>>   {
>>>       struct folio *folio;
>>>       LIST_HEAD(vmemmap_pages);
>>>
>>> +     list_for_each_entry(folio, folio_list, lru)
>>> +             hugetlb_vmemmap_split(h, &folio->page);
>>
>> Maybe it is reasonable to add a return value to hugetlb_vmemmap_split()
>> to indicate whether it has done successfully, if it fails, it must be
>> OOM, in which case, there is no sense to continue to split the page table
>> and optimize the vmemmap pages subsequently, right?
> 
> Sorry, it is reasonable to continue to optimize the vmemmap pages
> subsequently since it should succeed because those vmemmap pages
> have been split successfully previously.
> 
> Seems we should continue to optimize vmemmap once hugetlb_vmemmap_split()
> fails, then we will have more memory to continue to split. 

Good point

> But it will
> make hugetlb_vmemmap_optimize_folios() a little complex. I'd like to
> hear you guys' opinions here.
> 
I think it won't add that much complexity if we don't optimize too much of the
slowpath (when we are out of memory). In the batch freeing patch we could
additionally test the return value of __hugetlb_vmemmap_optimize() for ENOMEM
and free the currently stored vmemmap_pages (if any), and keep iterating the
optimize loop. Should be simple enough and make this a bit more resilient to
that scenario. But we would need to keep the earlier check you commented above
(where we use @remap_pte to defer PMD flush).
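
For example (rough sketch on top of the batch freeing patch, assuming
__hugetlb_vmemmap_optimize() is changed to return an error code; not part
of this series as posted):

	list_for_each_entry(folio, folio_list, lru) {
		int ret = __hugetlb_vmemmap_optimize(h, &folio->page,
						     &vmemmap_pages);

		/*
		 * If the remap ran out of memory, hand back the vmemmap
		 * pages gathered so far and retry this folio with the
		 * freed-up memory.
		 */
		if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
			flush_tlb_all();
			free_vmemmap_page_list(&vmemmap_pages);
			INIT_LIST_HEAD(&vmemmap_pages);
			__hugetlb_vmemmap_optimize(h, &folio->page,
						   &vmemmap_pages);
		}
	}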

> Thanks.
> 
>>
>> Thanks.
>>
>>> +
>>> +     flush_tlb_all();
>>> +
>>>       list_for_each_entry(folio, folio_list, lru)
>>>               __hugetlb_vmemmap_optimize(h, &folio->page, &vmemmap_pages);
>>>
>>
Muchun Song Sept. 6, 2023, 9:32 a.m. UTC | #5
> On Sep 6, 2023, at 17:26, Joao Martins <joao.m.martins@oracle.com> wrote:
> 
> 
> 
> On 06/09/2023 10:11, Muchun Song wrote:
>> On Wed, Sep 6, 2023 at 4:25 PM Muchun Song <muchun.song@linux.dev> wrote:
>>> 
>>> 
>>> 
>>> On 2023/9/6 05:44, Mike Kravetz wrote:
>>>> From: Joao Martins <joao.m.martins@oracle.com>
>>>> 
>>>> In an effort to minimize amount of TLB flushes, batch all PMD splits
>>>> belonging to a range of pages in order to perform only 1 (global) TLB
>>>> flush.
>>>> 
>>>> Rebased and updated by Mike Kravetz
>>>> 
>>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>>>> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
>>>> ---
>>>>  mm/hugetlb_vmemmap.c | 72 +++++++++++++++++++++++++++++++++++++++++---
>>>>  1 file changed, 68 insertions(+), 4 deletions(-)
>>>> 
>>>> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
>>>> index a715712df831..d956551699bc 100644
>>>> --- a/mm/hugetlb_vmemmap.c
>>>> +++ b/mm/hugetlb_vmemmap.c
>>>> @@ -37,7 +37,7 @@ struct vmemmap_remap_walk {
>>>>      struct list_head        *vmemmap_pages;
>>>>  };
>>>> 
>>>> -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
>>>> +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool flush)
>>>>  {
>>>>      pmd_t __pmd;
>>>>      int i;
>>>> @@ -80,7 +80,8 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
>>>>              /* Make pte visible before pmd. See comment in pmd_install(). */
>>>>              smp_wmb();
>>>>              pmd_populate_kernel(&init_mm, pmd, pgtable);
>>>> -             flush_tlb_kernel_range(start, start + PMD_SIZE);
>>>> +             if (flush)
>>>> +                     flush_tlb_kernel_range(start, start + PMD_SIZE);
>>>>      } else {
>>>>              pte_free_kernel(&init_mm, pgtable);
>>>>      }
>>>> @@ -127,11 +128,20 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
>>>>      do {
>>>>              int ret;
>>>> 
>>>> -             ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
>>>> +             ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK,
>>>> +                             walk->remap_pte != NULL);
>>> 
>>> It is better to only make @walk->remap_pte indicate whether we should go
>>> to the last page table level. I suggest reusing VMEMMAP_NO_TLB_FLUSH
>>> to indicate whether we should flush the TLB at pmd level. It'll be more
>>> clear.
>>> 
>>>>              if (ret)
>>>>                      return ret;
>>>> 
>>>>              next = pmd_addr_end(addr, end);
>>>> +
>>>> +             /*
>>>> +              * We are only splitting, not remapping the hugetlb vmemmap
>>>> +              * pages.
>>>> +              */
>>>> +             if (!walk->remap_pte)
>>>> +                     continue;
>>>> +
>>>>              vmemmap_pte_range(pmd, addr, next, walk);
>>>>      } while (pmd++, addr = next, addr != end);
>>>> 
>>>> @@ -198,7 +208,8 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
>>>>                      return ret;
>>>>      } while (pgd++, addr = next, addr != end);
>>>> 
>>>> -     flush_tlb_kernel_range(start, end);
>>>> +     if (walk->remap_pte)
>>>> +             flush_tlb_kernel_range(start, end);
>>>> 
>>>>      return 0;
>>>>  }
>>>> @@ -297,6 +308,35 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
>>>>      set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
>>>>  }
>>>> 
>>>> +/**
>>>> + * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end)
>>>> + *                      backing PMDs of the directmap into PTEs
>>>> + * @start:     start address of the vmemmap virtual address range that we want
>>>> + *             to remap.
>>>> + * @end:       end address of the vmemmap virtual address range that we want to
>>>> + *             remap.
>>>> + * @reuse:     reuse address.
>>>> + *
>>>> + * Return: %0 on success, negative error code otherwise.
>>>> + */
>>>> +static int vmemmap_remap_split(unsigned long start, unsigned long end,
>>>> +                             unsigned long reuse)
>>>> +{
>>>> +     int ret;
>>>> +     struct vmemmap_remap_walk walk = {
>>>> +             .remap_pte      = NULL,
>>>> +     };
>>>> +
>>>> +     /* See the comment in the vmemmap_remap_free(). */
>>>> +     BUG_ON(start - reuse != PAGE_SIZE);
>>>> +
>>>> +     mmap_read_lock(&init_mm);
>>>> +     ret = vmemmap_remap_range(reuse, end, &walk);
>>>> +     mmap_read_unlock(&init_mm);
>>>> +
>>>> +     return ret;
>>>> +}
>>>> +
>>>>  /**
>>>>   * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
>>>>   *                  to the page which @reuse is mapped to, then free vmemmap
>>>> @@ -602,11 +642,35 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
>>>>      free_vmemmap_page_list(&vmemmap_pages);
>>>>  }
>>>> 
>>>> +static void hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
>>>> +{
>>>> +     unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
>>>> +     unsigned long vmemmap_reuse;
>>>> +
>>>> +     if (!vmemmap_should_optimize(h, head))
>>>> +             return;
>>>> +
>>>> +     vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
>>>> +     vmemmap_reuse   = vmemmap_start;
>>>> +     vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;
>>>> +
>>>> +     /*
>>>> +      * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
>>>> +      * @vmemmap_end]
>>>> +      */
>>>> +     vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
>>>> +}
>>>> +
>>>>  void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
>>>>  {
>>>>      struct folio *folio;
>>>>      LIST_HEAD(vmemmap_pages);
>>>> 
>>>> +     list_for_each_entry(folio, folio_list, lru)
>>>> +             hugetlb_vmemmap_split(h, &folio->page);
>>> 
>>> Maybe it is reasonable to add a return value to hugetlb_vmemmap_split()
>>> to indicate whether it has done successfully, if it fails, it must be
>>> OOM, in which case, there is no sense to continue to split the page table
>>> and optimize the vmemmap pages subsequently, right?
>> 
>> Sorry, it is reasonable to continue to optimize the vmemmap pages
>> subsequently since it should succeed because those vmemmap pages
>> have been split successfully previously.
>> 
>> Seems we should continue to optimize vmemmap once hugetlb_vmemmap_split()
>> fails, then we will have more memory to continue to split. 
> 
> Good point
> 
>> But it will
>> make hugetlb_vmemmap_optimize_folios() a little complex. I'd like to
>> hear you guys' opinions here.
>> 
> I think it won't add that much complexity if we don't optimize too much of the
> slowpath (when we are out of memory). In the batch freeing patch we could
> additionally test the return value of __hugetlb_vmemmap_optimize() for ENOMEM
> and free the currently stored vmemmap_pages (if any), and keep iterating the
> optimize loop. Should be simple enough and make this a bit more resilient to
> that scenario.

Yep, we could try this.

> But we would need to keep the earlier check you commented above
> (where we use @remap_pte to defer PMD flush).

I think 2 flags will be suitable for you, one is VMEMMAP_REMAP_NO_TLB_FLUSH,
another is VMEMMAP_SPLIT_NO_TLB_FLUSH.
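
Roughly (hypothetical sketch only; the exact names and placement are up to
the next revision):

	#define VMEMMAP_SPLIT_NO_TLB_FLUSH	BIT(0)	/* defer the PMD-level flush when splitting */
	#define VMEMMAP_REMAP_NO_TLB_FLUSH	BIT(1)	/* defer the range flush after remapping */

	struct vmemmap_remap_walk {
		void			(*remap_pte)(pte_t *pte, unsigned long addr,
						     struct vmemmap_remap_walk *walk);
		/* ... existing fields ... */
		struct list_head	*vmemmap_pages;
		unsigned long		flags;
	};

	/* in vmemmap_pmd_range() */
	ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK,
				     !(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH));

	/* in vmemmap_remap_range() */
	if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
		flush_tlb_kernel_range(start, end);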

Thanks.

> 
>> Thanks.
>> 
>>> 
>>> Thanks.
>>> 
>>>> +
>>>> +     flush_tlb_all();
>>>> +
>>>>      list_for_each_entry(folio, folio_list, lru)
>>>>              __hugetlb_vmemmap_optimize(h, &folio->page, &vmemmap_pages);
Joao Martins Sept. 6, 2023, 9:44 a.m. UTC | #6
On 06/09/2023 10:32, Muchun Song wrote:
>> On Sep 6, 2023, at 17:26, Joao Martins <joao.m.martins@oracle.com> wrote:
>> On 06/09/2023 10:11, Muchun Song wrote:
>>> On Wed, Sep 6, 2023 at 4:25 PM Muchun Song <muchun.song@linux.dev> wrote:
>>>> On 2023/9/6 05:44, Mike Kravetz wrote:
>>>>> From: Joao Martins <joao.m.martins@oracle.com>
>>>>>
>>>>> In an effort to minimize amount of TLB flushes, batch all PMD splits
>>>>> belonging to a range of pages in order to perform only 1 (global) TLB
>>>>> flush.
>>>>>
>>>>> Rebased and updated by Mike Kravetz
>>>>>
>>>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>>>>> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
>>>>> ---
>>>>>  mm/hugetlb_vmemmap.c | 72 +++++++++++++++++++++++++++++++++++++++++---
>>>>>  1 file changed, 68 insertions(+), 4 deletions(-)
>>>>>
>>>>> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
>>>>> index a715712df831..d956551699bc 100644
>>>>> --- a/mm/hugetlb_vmemmap.c
>>>>> +++ b/mm/hugetlb_vmemmap.c
>>>>> @@ -37,7 +37,7 @@ struct vmemmap_remap_walk {
>>>>>      struct list_head        *vmemmap_pages;
>>>>>  };
>>>>>
>>>>> -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
>>>>> +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool flush)
>>>>>  {
>>>>>      pmd_t __pmd;
>>>>>      int i;
>>>>> @@ -80,7 +80,8 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
>>>>>              /* Make pte visible before pmd. See comment in pmd_install(). */
>>>>>              smp_wmb();
>>>>>              pmd_populate_kernel(&init_mm, pmd, pgtable);
>>>>> -             flush_tlb_kernel_range(start, start + PMD_SIZE);
>>>>> +             if (flush)
>>>>> +                     flush_tlb_kernel_range(start, start + PMD_SIZE);
>>>>>      } else {
>>>>>              pte_free_kernel(&init_mm, pgtable);
>>>>>      }
>>>>> @@ -127,11 +128,20 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
>>>>>      do {
>>>>>              int ret;
>>>>>
>>>>> -             ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
>>>>> +             ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK,
>>>>> +                             walk->remap_pte != NULL);
>>>>
>>>> It is better to only make @walk->remap_pte indicate whether we should go
>>>> to the last page table level. I suggest reusing VMEMMAP_NO_TLB_FLUSH
>>>> to indicate whether we should flush the TLB at pmd level. It'll be more
>>>> clear.
>>>>
>>>>>              if (ret)
>>>>>                      return ret;
>>>>>
>>>>>              next = pmd_addr_end(addr, end);
>>>>> +
>>>>> +             /*
>>>>> +              * We are only splitting, not remapping the hugetlb vmemmap
>>>>> +              * pages.
>>>>> +              */
>>>>> +             if (!walk->remap_pte)
>>>>> +                     continue;
>>>>> +
>>>>>              vmemmap_pte_range(pmd, addr, next, walk);
>>>>>      } while (pmd++, addr = next, addr != end);
>>>>>
>>>>> @@ -198,7 +208,8 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
>>>>>                      return ret;
>>>>>      } while (pgd++, addr = next, addr != end);
>>>>>
>>>>> -     flush_tlb_kernel_range(start, end);
>>>>> +     if (walk->remap_pte)
>>>>> +             flush_tlb_kernel_range(start, end);
>>>>>
>>>>>      return 0;
>>>>>  }
>>>>> @@ -297,6 +308,35 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
>>>>>      set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
>>>>>  }
>>>>>
>>>>> +/**
>>>>> + * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end)
>>>>> + *                      backing PMDs of the directmap into PTEs
>>>>> + * @start:     start address of the vmemmap virtual address range that we want
>>>>> + *             to remap.
>>>>> + * @end:       end address of the vmemmap virtual address range that we want to
>>>>> + *             remap.
>>>>> + * @reuse:     reuse address.
>>>>> + *
>>>>> + * Return: %0 on success, negative error code otherwise.
>>>>> + */
>>>>> +static int vmemmap_remap_split(unsigned long start, unsigned long end,
>>>>> +                             unsigned long reuse)
>>>>> +{
>>>>> +     int ret;
>>>>> +     struct vmemmap_remap_walk walk = {
>>>>> +             .remap_pte      = NULL,
>>>>> +     };
>>>>> +
>>>>> +     /* See the comment in the vmemmap_remap_free(). */
>>>>> +     BUG_ON(start - reuse != PAGE_SIZE);
>>>>> +
>>>>> +     mmap_read_lock(&init_mm);
>>>>> +     ret = vmemmap_remap_range(reuse, end, &walk);
>>>>> +     mmap_read_unlock(&init_mm);
>>>>> +
>>>>> +     return ret;
>>>>> +}
>>>>> +
>>>>>  /**
>>>>>   * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
>>>>>   *                  to the page which @reuse is mapped to, then free vmemmap
>>>>> @@ -602,11 +642,35 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
>>>>>      free_vmemmap_page_list(&vmemmap_pages);
>>>>>  }
>>>>>
>>>>> +static void hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
>>>>> +{
>>>>> +     unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
>>>>> +     unsigned long vmemmap_reuse;
>>>>> +
>>>>> +     if (!vmemmap_should_optimize(h, head))
>>>>> +             return;
>>>>> +
>>>>> +     vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
>>>>> +     vmemmap_reuse   = vmemmap_start;
>>>>> +     vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;
>>>>> +
>>>>> +     /*
>>>>> +      * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
>>>>> +      * @vmemmap_end]
>>>>> +      */
>>>>> +     vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
>>>>> +}
>>>>> +
>>>>>  void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
>>>>>  {
>>>>>      struct folio *folio;
>>>>>      LIST_HEAD(vmemmap_pages);
>>>>>
>>>>> +     list_for_each_entry(folio, folio_list, lru)
>>>>> +             hugetlb_vmemmap_split(h, &folio->page);
>>>>
>>>> Maybe it is reasonable to add a return value to hugetlb_vmemmap_split()
>>>> to indicate whether it has done successfully, if it fails, it must be
>>>> OOM, in which case, there is no sense to continue to split the page table
>>>> and optimize the vmemmap pages subsequently, right?
>>>
>>> Sorry, it is reasonable to continue to optimize the vmemmap pages
>>> subsequently since it should succeed because those vmemmap pages
>>> have been split successfully previously.
>>>
>>> Seems we should continue to optimize vmemmap once hugetlb_vmemmap_split()
>>> fails, then we will have more memory to continue to split. 
>>
>> Good point
>>
>>> But it will
>>> make hugetlb_vmemmap_optimize_folios() a little complex. I'd like to
>>> hear you guys' opinions here.
>>>
>> I think it won't add that much complexity if we don't optimize too much of the
>> slowpath (when we are out of memory). In the batch freeing patch we could
>> additionally test the return value of __hugetlb_vmemmap_optimize() for ENOMEM
>> and free the currently stored vmemmap_pages (if any), and keep iterating the
>> optimize loop. Should be simple enough and make this a bit more resilient to
>> that scenario.
> 
> Yep, we could try this.
> 
>> But we would need to keep the earlier check you commented above
>> (where we use @remap_pte to defer PMD flush).
> 
> I think 2 flags will be suitable for you, one is VMEMMAP_REMAP_NO_TLB_FLUSH,
> another is VMEMMAP_SPLIT_NO_TLB_FLUSH.

This means going back to the v1. I thought we agreed to consolidate/simplify
into one flag, and use @remap_pte to differentiate between split and remap.
Muchun Song Sept. 6, 2023, 11:34 a.m. UTC | #7
> On Sep 6, 2023, at 17:44, Joao Martins <joao.m.martins@oracle.com> wrote:
> 
> On 06/09/2023 10:32, Muchun Song wrote:
>>> On Sep 6, 2023, at 17:26, Joao Martins <joao.m.martins@oracle.com> wrote:
>>> On 06/09/2023 10:11, Muchun Song wrote:
>>>> On Wed, Sep 6, 2023 at 4:25 PM Muchun Song <muchun.song@linux.dev> wrote:
>>>>> On 2023/9/6 05:44, Mike Kravetz wrote:
>>>>>> From: Joao Martins <joao.m.martins@oracle.com>
>>>>>> 
>>>>>> In an effort to minimize amount of TLB flushes, batch all PMD splits
>>>>>> belonging to a range of pages in order to perform only 1 (global) TLB
>>>>>> flush.
>>>>>> 
>>>>>> Rebased and updated by Mike Kravetz
>>>>>> 
>>>>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>>>>>> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
>>>>>> ---
>>>>>> mm/hugetlb_vmemmap.c | 72 +++++++++++++++++++++++++++++++++++++++++---
>>>>>> 1 file changed, 68 insertions(+), 4 deletions(-)
>>>>>> 
>>>>>> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
>>>>>> index a715712df831..d956551699bc 100644
>>>>>> --- a/mm/hugetlb_vmemmap.c
>>>>>> +++ b/mm/hugetlb_vmemmap.c
>>>>>> @@ -37,7 +37,7 @@ struct vmemmap_remap_walk {
>>>>>>     struct list_head        *vmemmap_pages;
>>>>>> };
>>>>>> 
>>>>>> -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
>>>>>> +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool flush)
>>>>>> {
>>>>>>     pmd_t __pmd;
>>>>>>     int i;
>>>>>> @@ -80,7 +80,8 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
>>>>>>             /* Make pte visible before pmd. See comment in pmd_install(). */
>>>>>>             smp_wmb();
>>>>>>             pmd_populate_kernel(&init_mm, pmd, pgtable);
>>>>>> -             flush_tlb_kernel_range(start, start + PMD_SIZE);
>>>>>> +             if (flush)
>>>>>> +                     flush_tlb_kernel_range(start, start + PMD_SIZE);
>>>>>>     } else {
>>>>>>             pte_free_kernel(&init_mm, pgtable);
>>>>>>     }
>>>>>> @@ -127,11 +128,20 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
>>>>>>     do {
>>>>>>             int ret;
>>>>>> 
>>>>>> -             ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
>>>>>> +             ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK,
>>>>>> +                             walk->remap_pte != NULL);
>>>>> 
>>>>> It is better to only make @walk->remap_pte indicate whether we should go
>>>>> to the last page table level. I suggest reusing VMEMMAP_NO_TLB_FLUSH
>>>>> to indicate whether we should flush the TLB at pmd level. It'll be more
>>>>> clear.
>>>>> 
>>>>>>             if (ret)
>>>>>>                     return ret;
>>>>>> 
>>>>>>             next = pmd_addr_end(addr, end);
>>>>>> +
>>>>>> +             /*
>>>>>> +              * We are only splitting, not remapping the hugetlb vmemmap
>>>>>> +              * pages.
>>>>>> +              */
>>>>>> +             if (!walk->remap_pte)
>>>>>> +                     continue;
>>>>>> +
>>>>>>             vmemmap_pte_range(pmd, addr, next, walk);
>>>>>>     } while (pmd++, addr = next, addr != end);
>>>>>> 
>>>>>> @@ -198,7 +208,8 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
>>>>>>                     return ret;
>>>>>>     } while (pgd++, addr = next, addr != end);
>>>>>> 
>>>>>> -     flush_tlb_kernel_range(start, end);
>>>>>> +     if (walk->remap_pte)
>>>>>> +             flush_tlb_kernel_range(start, end);
>>>>>> 
>>>>>>     return 0;
>>>>>> }
>>>>>> @@ -297,6 +308,35 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
>>>>>>     set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
>>>>>> }
>>>>>> 
>>>>>> +/**
>>>>>> + * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end)
>>>>>> + *                      backing PMDs of the directmap into PTEs
>>>>>> + * @start:     start address of the vmemmap virtual address range that we want
>>>>>> + *             to remap.
>>>>>> + * @end:       end address of the vmemmap virtual address range that we want to
>>>>>> + *             remap.
>>>>>> + * @reuse:     reuse address.
>>>>>> + *
>>>>>> + * Return: %0 on success, negative error code otherwise.
>>>>>> + */
>>>>>> +static int vmemmap_remap_split(unsigned long start, unsigned long end,
>>>>>> +                             unsigned long reuse)
>>>>>> +{
>>>>>> +     int ret;
>>>>>> +     struct vmemmap_remap_walk walk = {
>>>>>> +             .remap_pte      = NULL,
>>>>>> +     };
>>>>>> +
>>>>>> +     /* See the comment in the vmemmap_remap_free(). */
>>>>>> +     BUG_ON(start - reuse != PAGE_SIZE);
>>>>>> +
>>>>>> +     mmap_read_lock(&init_mm);
>>>>>> +     ret = vmemmap_remap_range(reuse, end, &walk);
>>>>>> +     mmap_read_unlock(&init_mm);
>>>>>> +
>>>>>> +     return ret;
>>>>>> +}
>>>>>> +
>>>>>> /**
>>>>>>  * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
>>>>>>  *                  to the page which @reuse is mapped to, then free vmemmap
>>>>>> @@ -602,11 +642,35 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
>>>>>>     free_vmemmap_page_list(&vmemmap_pages);
>>>>>> }
>>>>>> 
>>>>>> +static void hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
>>>>>> +{
>>>>>> +     unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
>>>>>> +     unsigned long vmemmap_reuse;
>>>>>> +
>>>>>> +     if (!vmemmap_should_optimize(h, head))
>>>>>> +             return;
>>>>>> +
>>>>>> +     vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
>>>>>> +     vmemmap_reuse   = vmemmap_start;
>>>>>> +     vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;
>>>>>> +
>>>>>> +     /*
>>>>>> +      * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
>>>>>> +      * @vmemmap_end]
>>>>>> +      */
>>>>>> +     vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
>>>>>> +}
>>>>>> +
>>>>>> void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
>>>>>> {
>>>>>>     struct folio *folio;
>>>>>>     LIST_HEAD(vmemmap_pages);
>>>>>> 
>>>>>> +     list_for_each_entry(folio, folio_list, lru)
>>>>>> +             hugetlb_vmemmap_split(h, &folio->page);
>>>>> 
>>>>> Maybe it is reasonable to add a return value to hugetlb_vmemmap_split()
>>>>> to indicate whether it has done successfully, if it fails, it must be
>>>>> OOM, in which case, there is no sense to continue to split the page table
>>>>> and optimize the vmemmap pages subsequently, right?
>>>> 
>>>> Sorry, it is reasonable to continue to optimize the vmemmap pages
>>>> subsequently since it should succeed because those vmemmap pages
>>>> have been split successfully previously.
>>>> 
>>>> Seems we should continue to optimize vmemmap once hugetlb_vmemmap_split()
>>>> fails, then we will have more memory to continue to split. 
>>> 
>>> Good point
>>> 
>>>> But it will
>>>> make hugetlb_vmemmap_optimize_folios() a little complex. I'd like to
>>>> hear you guys' opinions here.
>>>> 
>>> I think it won't add that much complexity if we don't optimize too much of the
>>> slowpath (when we are out of memory). In the batch freeing patch we could
>>> additionally test the return value of __hugetlb_vmemmap_optimize() for ENOMEM
>>> and free the currently stored vmemmap_pages (if any), and keep iterating the
>>> optimize loop. Should be simple enough and make this a bit more resilient to
>>> that scenario.
>> 
>> Yep, we could try this.
>> 
>>> But we would need to keep the earlier check you commented above
>>> (where we use @remap_pte to defer PMD flush).
>> 
>> I think 2 flags will be suitable for you, one is VMEMMAP_REMAP_NO_TLB_FLUSH,
>> another is VMEMMAP_SPLIT_NO_TLB_FLUSH.
> 
> This means going back to the v1. I thought we agreed to consolidate/simplify
> into one flag, and use @remap_pte to differentiate between split and remap.

But it is a little different: we use @remap_pte to indicate whether we should go
to the last level (i.e. do the remap), while the flag is used to indicate whether
we should flush the TLB when splitting is necessary (note that a remap also needs
a split). It means splitting and skipping the TLB flush are not bound together.
Sorry, I just want to keep the semantics clear.

Thanks.

Patch

diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index a715712df831..d956551699bc 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -37,7 +37,7 @@  struct vmemmap_remap_walk {
 	struct list_head	*vmemmap_pages;
 };
 
-static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
+static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool flush)
 {
 	pmd_t __pmd;
 	int i;
@@ -80,7 +80,8 @@  static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
 		/* Make pte visible before pmd. See comment in pmd_install(). */
 		smp_wmb();
 		pmd_populate_kernel(&init_mm, pmd, pgtable);
-		flush_tlb_kernel_range(start, start + PMD_SIZE);
+		if (flush)
+			flush_tlb_kernel_range(start, start + PMD_SIZE);
 	} else {
 		pte_free_kernel(&init_mm, pgtable);
 	}
@@ -127,11 +128,20 @@  static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
 	do {
 		int ret;
 
-		ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
+		ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK,
+				walk->remap_pte != NULL);
 		if (ret)
 			return ret;
 
 		next = pmd_addr_end(addr, end);
+
+		/*
+		 * We are only splitting, not remapping the hugetlb vmemmap
+		 * pages.
+		 */
+		if (!walk->remap_pte)
+			continue;
+
 		vmemmap_pte_range(pmd, addr, next, walk);
 	} while (pmd++, addr = next, addr != end);
 
@@ -198,7 +208,8 @@  static int vmemmap_remap_range(unsigned long start, unsigned long end,
 			return ret;
 	} while (pgd++, addr = next, addr != end);
 
-	flush_tlb_kernel_range(start, end);
+	if (walk->remap_pte)
+		flush_tlb_kernel_range(start, end);
 
 	return 0;
 }
@@ -297,6 +308,35 @@  static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
 	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
 }
 
+/**
+ * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end)
+ *                      backing PMDs of the directmap into PTEs
+ * @start:     start address of the vmemmap virtual address range that we want
+ *             to remap.
+ * @end:       end address of the vmemmap virtual address range that we want to
+ *             remap.
+ * @reuse:     reuse address.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+static int vmemmap_remap_split(unsigned long start, unsigned long end,
+				unsigned long reuse)
+{
+	int ret;
+	struct vmemmap_remap_walk walk = {
+		.remap_pte	= NULL,
+	};
+
+	/* See the comment in the vmemmap_remap_free(). */
+	BUG_ON(start - reuse != PAGE_SIZE);
+
+	mmap_read_lock(&init_mm);
+	ret = vmemmap_remap_range(reuse, end, &walk);
+	mmap_read_unlock(&init_mm);
+
+	return ret;
+}
+
 /**
  * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
  *			to the page which @reuse is mapped to, then free vmemmap
@@ -602,11 +642,35 @@  void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
 	free_vmemmap_page_list(&vmemmap_pages);
 }
 
+static void hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
+{
+	unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
+	unsigned long vmemmap_reuse;
+
+	if (!vmemmap_should_optimize(h, head))
+		return;
+
+	vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
+	vmemmap_reuse   = vmemmap_start;
+	vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;
+
+	/*
+	 * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
+	 * @vmemmap_end]
+	 */
+	vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
+}
+
 void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
 {
 	struct folio *folio;
 	LIST_HEAD(vmemmap_pages);
 
+	list_for_each_entry(folio, folio_list, lru)
+		hugetlb_vmemmap_split(h, &folio->page);
+
+	flush_tlb_all();
+
 	list_for_each_entry(folio, folio_list, lru)
 		__hugetlb_vmemmap_optimize(h, &folio->page, &vmemmap_pages);