
[v13,05/12] mm: hugetlb: allocate the vmemmap pages associated with each HugeTLB page

Message ID 20210117151053.24600-6-songmuchun@bytedance.com (mailing list archive)
State New, archived
Series Free some vmemmap pages of HugeTLB page

Commit Message

Muchun Song Jan. 17, 2021, 3:10 p.m. UTC
When we free a HugeTLB page to the buddy allocator, we should allocate the
vmemmap pages associated with it. We can do that in __free_hugepage()
before freeing it to buddy.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 include/linux/mm.h   |  2 ++
 mm/hugetlb.c         |  2 ++
 mm/hugetlb_vmemmap.c | 15 ++++++++++
 mm/hugetlb_vmemmap.h |  5 ++++
 mm/sparse-vmemmap.c  | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 5 files changed, 100 insertions(+), 1 deletion(-)

Comments

David Rientjes Jan. 25, 2021, 12:05 a.m. UTC | #1
On Sun, 17 Jan 2021, Muchun Song wrote:

> diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
> index ce4be1fa93c2..3b146d5949f3 100644
> --- a/mm/sparse-vmemmap.c
> +++ b/mm/sparse-vmemmap.c
> @@ -29,6 +29,7 @@
>  #include <linux/sched.h>
>  #include <linux/pgtable.h>
>  #include <linux/bootmem_info.h>
> +#include <linux/delay.h>
>  
>  #include <asm/dma.h>
>  #include <asm/pgalloc.h>
> @@ -40,7 +41,8 @@
>   * @remap_pte:		called for each non-empty PTE (lowest-level) entry.
>   * @reuse_page:		the page which is reused for the tail vmemmap pages.
>   * @reuse_addr:		the virtual address of the @reuse_page page.
> - * @vmemmap_pages:	the list head of the vmemmap pages that can be freed.
> + * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
> + *			or is mapped from.
>   */
>  struct vmemmap_remap_walk {
>  	void (*remap_pte)(pte_t *pte, unsigned long addr,
> @@ -50,6 +52,10 @@ struct vmemmap_remap_walk {
>  	struct list_head *vmemmap_pages;
>  };
>  
> +/* The gfp mask of allocating vmemmap page */
> +#define GFP_VMEMMAP_PAGE		\
> +	(GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | __GFP_THISNODE)
> +

This is unnecessary, just use the gfp mask directly in allocator.
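
For illustration, a minimal sketch of the suggested change, open-coding the mask
at the single call site (not code from the posted series):

		page = alloc_pages_node(nid, GFP_KERNEL | __GFP_RETRY_MAYFAIL |
					     __GFP_NOWARN | __GFP_THISNODE, 0);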

>  static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
>  			      unsigned long end,
>  			      struct vmemmap_remap_walk *walk)
> @@ -228,6 +234,75 @@ void vmemmap_remap_free(unsigned long start, unsigned long end,
>  	free_vmemmap_page_list(&vmemmap_pages);
>  }
>  
> +static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
> +				struct vmemmap_remap_walk *walk)
> +{
> +	pgprot_t pgprot = PAGE_KERNEL;
> +	struct page *page;
> +	void *to;
> +
> +	BUG_ON(pte_page(*pte) != walk->reuse_page);
> +
> +	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
> +	list_del(&page->lru);
> +	to = page_to_virt(page);
> +	copy_page(to, (void *)walk->reuse_addr);
> +
> +	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
> +}
> +
> +static void alloc_vmemmap_page_list(struct list_head *list,
> +				    unsigned long start, unsigned long end)
> +{
> +	unsigned long addr;
> +
> +	for (addr = start; addr < end; addr += PAGE_SIZE) {
> +		struct page *page;
> +		int nid = page_to_nid((const void *)addr);
> +
> +retry:
> +		page = alloc_pages_node(nid, GFP_VMEMMAP_PAGE, 0);
> +		if (unlikely(!page)) {
> +			msleep(100);
> +			/*
> +			 * We should retry infinitely, because we cannot
> +			 * handle allocation failures. Once we allocate
> +			 * vmemmap pages successfully, then we can free
> +			 * a HugeTLB page.
> +			 */
> +			goto retry;

Ugh, I don't think this will work, there's no guarantee that we'll ever 
succeed and now we can't free a 2MB hugepage because we cannot allocate a 
4KB page.  We absolutely have to ensure we make forward progress here.

We're going to be freeing the hugetlb page after this succeeds, can we 
not use part of the hugetlb page that we're freeing for this memory 
instead?

> +		}
> +		list_add_tail(&page->lru, list);
> +	}
> +}
> +
> +/**
> + * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end)
> + *			 to the page which is from the @vmemmap_pages
> + *			 respectively.
> + * @start:	start address of the vmemmap virtual address range.
> + * @end:	end address of the vmemmap virtual address range.
> + * @reuse:	reuse address.
> + */
> +void vmemmap_remap_alloc(unsigned long start, unsigned long end,
> +			 unsigned long reuse)
> +{
> +	LIST_HEAD(vmemmap_pages);
> +	struct vmemmap_remap_walk walk = {
> +		.remap_pte	= vmemmap_restore_pte,
> +		.reuse_addr	= reuse,
> +		.vmemmap_pages	= &vmemmap_pages,
> +	};
> +
> +	might_sleep();
> +
> +	/* See the comment in the vmemmap_remap_free(). */
> +	BUG_ON(start - reuse != PAGE_SIZE);
> +
> +	alloc_vmemmap_page_list(&vmemmap_pages, start, end);
> +	vmemmap_remap_range(reuse, end, &walk);
> +}
> +
>  /*
>   * Allocate a block of memory to be used to back the virtual memory map
>   * or to back the page tables that are used to create the mapping.
> -- 
> 2.11.0
> 
>
Muchun Song Jan. 25, 2021, 6:40 a.m. UTC | #2
On Mon, Jan 25, 2021 at 8:05 AM David Rientjes <rientjes@google.com> wrote:
>
>
> On Sun, 17 Jan 2021, Muchun Song wrote:
>
> > diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
> > index ce4be1fa93c2..3b146d5949f3 100644
> > --- a/mm/sparse-vmemmap.c
> > +++ b/mm/sparse-vmemmap.c
> > @@ -29,6 +29,7 @@
> >  #include <linux/sched.h>
> >  #include <linux/pgtable.h>
> >  #include <linux/bootmem_info.h>
> > +#include <linux/delay.h>
> >
> >  #include <asm/dma.h>
> >  #include <asm/pgalloc.h>
> > @@ -40,7 +41,8 @@
> >   * @remap_pte:               called for each non-empty PTE (lowest-level) entry.
> >   * @reuse_page:              the page which is reused for the tail vmemmap pages.
> >   * @reuse_addr:              the virtual address of the @reuse_page page.
> > - * @vmemmap_pages:   the list head of the vmemmap pages that can be freed.
> > + * @vmemmap_pages:   the list head of the vmemmap pages that can be freed
> > + *                   or is mapped from.
> >   */
> >  struct vmemmap_remap_walk {
> >       void (*remap_pte)(pte_t *pte, unsigned long addr,
> > @@ -50,6 +52,10 @@ struct vmemmap_remap_walk {
> >       struct list_head *vmemmap_pages;
> >  };
> >
> > +/* The gfp mask of allocating vmemmap page */
> > +#define GFP_VMEMMAP_PAGE             \
> > +     (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | __GFP_THISNODE)
> > +
>
> This is unnecessary, just use the gfp mask directly in allocator.

Will do. Thanks.

>
> >  static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
> >                             unsigned long end,
> >                             struct vmemmap_remap_walk *walk)
> > @@ -228,6 +234,75 @@ void vmemmap_remap_free(unsigned long start, unsigned long end,
> >       free_vmemmap_page_list(&vmemmap_pages);
> >  }
> >
> > +static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
> > +                             struct vmemmap_remap_walk *walk)
> > +{
> > +     pgprot_t pgprot = PAGE_KERNEL;
> > +     struct page *page;
> > +     void *to;
> > +
> > +     BUG_ON(pte_page(*pte) != walk->reuse_page);
> > +
> > +     page = list_first_entry(walk->vmemmap_pages, struct page, lru);
> > +     list_del(&page->lru);
> > +     to = page_to_virt(page);
> > +     copy_page(to, (void *)walk->reuse_addr);
> > +
> > +     set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
> > +}
> > +
> > +static void alloc_vmemmap_page_list(struct list_head *list,
> > +                                 unsigned long start, unsigned long end)
> > +{
> > +     unsigned long addr;
> > +
> > +     for (addr = start; addr < end; addr += PAGE_SIZE) {
> > +             struct page *page;
> > +             int nid = page_to_nid((const void *)addr);
> > +
> > +retry:
> > +             page = alloc_pages_node(nid, GFP_VMEMMAP_PAGE, 0);
> > +             if (unlikely(!page)) {
> > +                     msleep(100);
> > +                     /*
> > +                      * We should retry infinitely, because we cannot
> > +                      * handle allocation failures. Once we allocate
> > +                      * vmemmap pages successfully, then we can free
> > +                      * a HugeTLB page.
> > +                      */
> > +                     goto retry;
>
> Ugh, I don't think this will work, there's no guarantee that we'll ever
> succeed and now we can't free a 2MB hugepage because we cannot allocate a
> 4KB page.  We absolutely have to ensure we make forward progress here.

This can trigger an OOM when there is no memory and kill someone to release
some memory. Right?

>
> We're going to be freeing the hugetlb page after this succeeeds, can we
> not use part of the hugetlb page that we're freeing for this memory
> instead?

It seems a good idea. We can try to allocate memory firstly, if successful,
just use the new page to remap (it can reduce memory fragmentation).
If not, we can use part of the hugetlb page to remap. What's your opinion
about this?
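
A rough sketch of that fallback idea (hypothetical helper, not part of the posted
series; head is the HugeTLB page being freed and nr_used counts how many of its
tail pages have already been handed out as vmemmap):

	static struct page *get_vmemmap_page(struct page *head, int nid,
					     unsigned int *nr_used)
	{
		struct page *page;

		/* Prefer a freshly allocated page so the huge page stays intact. */
		page = alloc_pages_node(nid, GFP_KERNEL | __GFP_RETRY_MAYFAIL |
					     __GFP_NOWARN | __GFP_THISNODE, 0);
		if (page)
			return page;

		/* Fall back to the next unused tail page of the huge page itself. */
		*nr_used += 1;
		return head + *nr_used;
	}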

>
> > +             }
> > +             list_add_tail(&page->lru, list);
> > +     }
> > +}
> > +
> > +/**
> > + * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end)
> > + *                    to the page which is from the @vmemmap_pages
> > + *                    respectively.
> > + * @start:   start address of the vmemmap virtual address range.
> > + * @end:     end address of the vmemmap virtual address range.
> > + * @reuse:   reuse address.
> > + */
> > +void vmemmap_remap_alloc(unsigned long start, unsigned long end,
> > +                      unsigned long reuse)
> > +{
> > +     LIST_HEAD(vmemmap_pages);
> > +     struct vmemmap_remap_walk walk = {
> > +             .remap_pte      = vmemmap_restore_pte,
> > +             .reuse_addr     = reuse,
> > +             .vmemmap_pages  = &vmemmap_pages,
> > +     };
> > +
> > +     might_sleep();
> > +
> > +     /* See the comment in the vmemmap_remap_free(). */
> > +     BUG_ON(start - reuse != PAGE_SIZE);
> > +
> > +     alloc_vmemmap_page_list(&vmemmap_pages, start, end);
> > +     vmemmap_remap_range(reuse, end, &walk);
> > +}
> > +
> >  /*
> >   * Allocate a block of memory to be used to back the virtual memory map
> >   * or to back the page tables that are used to create the mapping.
> > --
> > 2.11.0
> >
> >
Muchun Song Jan. 25, 2021, 7:41 a.m. UTC | #3
On Mon, Jan 25, 2021 at 2:40 PM Muchun Song <songmuchun@bytedance.com> wrote:
>
> On Mon, Jan 25, 2021 at 8:05 AM David Rientjes <rientjes@google.com> wrote:
> >
> >
> > On Sun, 17 Jan 2021, Muchun Song wrote:
> >
> > > diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
> > > index ce4be1fa93c2..3b146d5949f3 100644
> > > --- a/mm/sparse-vmemmap.c
> > > +++ b/mm/sparse-vmemmap.c
> > > @@ -29,6 +29,7 @@
> > >  #include <linux/sched.h>
> > >  #include <linux/pgtable.h>
> > >  #include <linux/bootmem_info.h>
> > > +#include <linux/delay.h>
> > >
> > >  #include <asm/dma.h>
> > >  #include <asm/pgalloc.h>
> > > @@ -40,7 +41,8 @@
> > >   * @remap_pte:               called for each non-empty PTE (lowest-level) entry.
> > >   * @reuse_page:              the page which is reused for the tail vmemmap pages.
> > >   * @reuse_addr:              the virtual address of the @reuse_page page.
> > > - * @vmemmap_pages:   the list head of the vmemmap pages that can be freed.
> > > + * @vmemmap_pages:   the list head of the vmemmap pages that can be freed
> > > + *                   or is mapped from.
> > >   */
> > >  struct vmemmap_remap_walk {
> > >       void (*remap_pte)(pte_t *pte, unsigned long addr,
> > > @@ -50,6 +52,10 @@ struct vmemmap_remap_walk {
> > >       struct list_head *vmemmap_pages;
> > >  };
> > >
> > > +/* The gfp mask of allocating vmemmap page */
> > > +#define GFP_VMEMMAP_PAGE             \
> > > +     (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | __GFP_THISNODE)
> > > +
> >
> > This is unnecessary, just use the gfp mask directly in allocator.
>
> Will do. Thanks.
>
> >
> > >  static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
> > >                             unsigned long end,
> > >                             struct vmemmap_remap_walk *walk)
> > > @@ -228,6 +234,75 @@ void vmemmap_remap_free(unsigned long start, unsigned long end,
> > >       free_vmemmap_page_list(&vmemmap_pages);
> > >  }
> > >
> > > +static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
> > > +                             struct vmemmap_remap_walk *walk)
> > > +{
> > > +     pgprot_t pgprot = PAGE_KERNEL;
> > > +     struct page *page;
> > > +     void *to;
> > > +
> > > +     BUG_ON(pte_page(*pte) != walk->reuse_page);
> > > +
> > > +     page = list_first_entry(walk->vmemmap_pages, struct page, lru);
> > > +     list_del(&page->lru);
> > > +     to = page_to_virt(page);
> > > +     copy_page(to, (void *)walk->reuse_addr);
> > > +
> > > +     set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
> > > +}
> > > +
> > > +static void alloc_vmemmap_page_list(struct list_head *list,
> > > +                                 unsigned long start, unsigned long end)
> > > +{
> > > +     unsigned long addr;
> > > +
> > > +     for (addr = start; addr < end; addr += PAGE_SIZE) {
> > > +             struct page *page;
> > > +             int nid = page_to_nid((const void *)addr);
> > > +
> > > +retry:
> > > +             page = alloc_pages_node(nid, GFP_VMEMMAP_PAGE, 0);
> > > +             if (unlikely(!page)) {
> > > +                     msleep(100);
> > > +                     /*
> > > +                      * We should retry infinitely, because we cannot
> > > +                      * handle allocation failures. Once we allocate
> > > +                      * vmemmap pages successfully, then we can free
> > > +                      * a HugeTLB page.
> > > +                      */
> > > +                     goto retry;
> >
> > Ugh, I don't think this will work, there's no guarantee that we'll ever
> > succeed and now we can't free a 2MB hugepage because we cannot allocate a
> > 4KB page.  We absolutely have to ensure we make forward progress here.
>
> This can trigger a OOM when there is no memory and kill someone to release
> some memory. Right?
>
> >
> > We're going to be freeing the hugetlb page after this succeeeds, can we
> > not use part of the hugetlb page that we're freeing for this memory
> > instead?
>
> It seems a good idea. We can try to allocate memory firstly, if successful,
> just use the new page to remap (it can reduce memory fragmentation).
> If not, we can use part of the hugetlb page to remap. What's your opinion
> about this?

What if the HugeTLB page is a gigantic page which is allocated from
CMA? In this case, we cannot use part of the hugetlb page to remap.
Right?

>
> >
> > > +             }
> > > +             list_add_tail(&page->lru, list);
> > > +     }
> > > +}
> > > +
> > > +/**
> > > + * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end)
> > > + *                    to the page which is from the @vmemmap_pages
> > > + *                    respectively.
> > > + * @start:   start address of the vmemmap virtual address range.
> > > + * @end:     end address of the vmemmap virtual address range.
> > > + * @reuse:   reuse address.
> > > + */
> > > +void vmemmap_remap_alloc(unsigned long start, unsigned long end,
> > > +                      unsigned long reuse)
> > > +{
> > > +     LIST_HEAD(vmemmap_pages);
> > > +     struct vmemmap_remap_walk walk = {
> > > +             .remap_pte      = vmemmap_restore_pte,
> > > +             .reuse_addr     = reuse,
> > > +             .vmemmap_pages  = &vmemmap_pages,
> > > +     };
> > > +
> > > +     might_sleep();
> > > +
> > > +     /* See the comment in the vmemmap_remap_free(). */
> > > +     BUG_ON(start - reuse != PAGE_SIZE);
> > > +
> > > +     alloc_vmemmap_page_list(&vmemmap_pages, start, end);
> > > +     vmemmap_remap_range(reuse, end, &walk);
> > > +}
> > > +
> > >  /*
> > >   * Allocate a block of memory to be used to back the virtual memory map
> > >   * or to back the page tables that are used to create the mapping.
> > > --
> > > 2.11.0
> > >
> > >
David Hildenbrand Jan. 25, 2021, 9:15 a.m. UTC | #4
On 25.01.21 08:41, Muchun Song wrote:
> On Mon, Jan 25, 2021 at 2:40 PM Muchun Song <songmuchun@bytedance.com> wrote:
>>
>> On Mon, Jan 25, 2021 at 8:05 AM David Rientjes <rientjes@google.com> wrote:
>>>
>>>
>>> On Sun, 17 Jan 2021, Muchun Song wrote:
>>>
>>>> diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
>>>> index ce4be1fa93c2..3b146d5949f3 100644
>>>> --- a/mm/sparse-vmemmap.c
>>>> +++ b/mm/sparse-vmemmap.c
>>>> @@ -29,6 +29,7 @@
>>>>  #include <linux/sched.h>
>>>>  #include <linux/pgtable.h>
>>>>  #include <linux/bootmem_info.h>
>>>> +#include <linux/delay.h>
>>>>
>>>>  #include <asm/dma.h>
>>>>  #include <asm/pgalloc.h>
>>>> @@ -40,7 +41,8 @@
>>>>   * @remap_pte:               called for each non-empty PTE (lowest-level) entry.
>>>>   * @reuse_page:              the page which is reused for the tail vmemmap pages.
>>>>   * @reuse_addr:              the virtual address of the @reuse_page page.
>>>> - * @vmemmap_pages:   the list head of the vmemmap pages that can be freed.
>>>> + * @vmemmap_pages:   the list head of the vmemmap pages that can be freed
>>>> + *                   or is mapped from.
>>>>   */
>>>>  struct vmemmap_remap_walk {
>>>>       void (*remap_pte)(pte_t *pte, unsigned long addr,
>>>> @@ -50,6 +52,10 @@ struct vmemmap_remap_walk {
>>>>       struct list_head *vmemmap_pages;
>>>>  };
>>>>
>>>> +/* The gfp mask of allocating vmemmap page */
>>>> +#define GFP_VMEMMAP_PAGE             \
>>>> +     (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | __GFP_THISNODE)
>>>> +
>>>
>>> This is unnecessary, just use the gfp mask directly in allocator.
>>
>> Will do. Thanks.
>>
>>>
>>>>  static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
>>>>                             unsigned long end,
>>>>                             struct vmemmap_remap_walk *walk)
>>>> @@ -228,6 +234,75 @@ void vmemmap_remap_free(unsigned long start, unsigned long end,
>>>>       free_vmemmap_page_list(&vmemmap_pages);
>>>>  }
>>>>
>>>> +static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
>>>> +                             struct vmemmap_remap_walk *walk)
>>>> +{
>>>> +     pgprot_t pgprot = PAGE_KERNEL;
>>>> +     struct page *page;
>>>> +     void *to;
>>>> +
>>>> +     BUG_ON(pte_page(*pte) != walk->reuse_page);
>>>> +
>>>> +     page = list_first_entry(walk->vmemmap_pages, struct page, lru);
>>>> +     list_del(&page->lru);
>>>> +     to = page_to_virt(page);
>>>> +     copy_page(to, (void *)walk->reuse_addr);
>>>> +
>>>> +     set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
>>>> +}
>>>> +
>>>> +static void alloc_vmemmap_page_list(struct list_head *list,
>>>> +                                 unsigned long start, unsigned long end)
>>>> +{
>>>> +     unsigned long addr;
>>>> +
>>>> +     for (addr = start; addr < end; addr += PAGE_SIZE) {
>>>> +             struct page *page;
>>>> +             int nid = page_to_nid((const void *)addr);
>>>> +
>>>> +retry:
>>>> +             page = alloc_pages_node(nid, GFP_VMEMMAP_PAGE, 0);
>>>> +             if (unlikely(!page)) {
>>>> +                     msleep(100);
>>>> +                     /*
>>>> +                      * We should retry infinitely, because we cannot
>>>> +                      * handle allocation failures. Once we allocate
>>>> +                      * vmemmap pages successfully, then we can free
>>>> +                      * a HugeTLB page.
>>>> +                      */
>>>> +                     goto retry;
>>>
>>> Ugh, I don't think this will work, there's no guarantee that we'll ever
>>> succeed and now we can't free a 2MB hugepage because we cannot allocate a
>>> 4KB page.  We absolutely have to ensure we make forward progress here.
>>
>> This can trigger a OOM when there is no memory and kill someone to release
>> some memory. Right?
>>
>>>
>>> We're going to be freeing the hugetlb page after this succeeeds, can we
>>> not use part of the hugetlb page that we're freeing for this memory
>>> instead?
>>
>> It seems a good idea. We can try to allocate memory firstly, if successful,
>> just use the new page to remap (it can reduce memory fragmentation).
>> If not, we can use part of the hugetlb page to remap. What's your opinion
>> about this?
> 
> If the HugeTLB page is a gigantic page which is allocated from
> CMA. In this case, we cannot use part of the hugetlb page to remap.
> Right?

Right; and I don't think the "reuse part of a huge page as vmemmap while
freeing, while that part itself might not have a proper vmemmap yet (or
might cover itself now)" is particularly straightforward. Maybe I'm
wrong :)

Also, watch out for huge pages on ZONE_MOVABLE, in that case you also
shouldn't allocate the vmemmap from there ...
Muchun Song Jan. 25, 2021, 9:34 a.m. UTC | #5
On Mon, Jan 25, 2021 at 5:15 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 25.01.21 08:41, Muchun Song wrote:
> > On Mon, Jan 25, 2021 at 2:40 PM Muchun Song <songmuchun@bytedance.com> wrote:
> >>
> >> On Mon, Jan 25, 2021 at 8:05 AM David Rientjes <rientjes@google.com> wrote:
> >>>
> >>>
> >>> On Sun, 17 Jan 2021, Muchun Song wrote:
> >>>
> >>>> diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
> >>>> index ce4be1fa93c2..3b146d5949f3 100644
> >>>> --- a/mm/sparse-vmemmap.c
> >>>> +++ b/mm/sparse-vmemmap.c
> >>>> @@ -29,6 +29,7 @@
> >>>>  #include <linux/sched.h>
> >>>>  #include <linux/pgtable.h>
> >>>>  #include <linux/bootmem_info.h>
> >>>> +#include <linux/delay.h>
> >>>>
> >>>>  #include <asm/dma.h>
> >>>>  #include <asm/pgalloc.h>
> >>>> @@ -40,7 +41,8 @@
> >>>>   * @remap_pte:               called for each non-empty PTE (lowest-level) entry.
> >>>>   * @reuse_page:              the page which is reused for the tail vmemmap pages.
> >>>>   * @reuse_addr:              the virtual address of the @reuse_page page.
> >>>> - * @vmemmap_pages:   the list head of the vmemmap pages that can be freed.
> >>>> + * @vmemmap_pages:   the list head of the vmemmap pages that can be freed
> >>>> + *                   or is mapped from.
> >>>>   */
> >>>>  struct vmemmap_remap_walk {
> >>>>       void (*remap_pte)(pte_t *pte, unsigned long addr,
> >>>> @@ -50,6 +52,10 @@ struct vmemmap_remap_walk {
> >>>>       struct list_head *vmemmap_pages;
> >>>>  };
> >>>>
> >>>> +/* The gfp mask of allocating vmemmap page */
> >>>> +#define GFP_VMEMMAP_PAGE             \
> >>>> +     (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | __GFP_THISNODE)
> >>>> +
> >>>
> >>> This is unnecessary, just use the gfp mask directly in allocator.
> >>
> >> Will do. Thanks.
> >>
> >>>
> >>>>  static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
> >>>>                             unsigned long end,
> >>>>                             struct vmemmap_remap_walk *walk)
> >>>> @@ -228,6 +234,75 @@ void vmemmap_remap_free(unsigned long start, unsigned long end,
> >>>>       free_vmemmap_page_list(&vmemmap_pages);
> >>>>  }
> >>>>
> >>>> +static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
> >>>> +                             struct vmemmap_remap_walk *walk)
> >>>> +{
> >>>> +     pgprot_t pgprot = PAGE_KERNEL;
> >>>> +     struct page *page;
> >>>> +     void *to;
> >>>> +
> >>>> +     BUG_ON(pte_page(*pte) != walk->reuse_page);
> >>>> +
> >>>> +     page = list_first_entry(walk->vmemmap_pages, struct page, lru);
> >>>> +     list_del(&page->lru);
> >>>> +     to = page_to_virt(page);
> >>>> +     copy_page(to, (void *)walk->reuse_addr);
> >>>> +
> >>>> +     set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
> >>>> +}
> >>>> +
> >>>> +static void alloc_vmemmap_page_list(struct list_head *list,
> >>>> +                                 unsigned long start, unsigned long end)
> >>>> +{
> >>>> +     unsigned long addr;
> >>>> +
> >>>> +     for (addr = start; addr < end; addr += PAGE_SIZE) {
> >>>> +             struct page *page;
> >>>> +             int nid = page_to_nid((const void *)addr);
> >>>> +
> >>>> +retry:
> >>>> +             page = alloc_pages_node(nid, GFP_VMEMMAP_PAGE, 0);
> >>>> +             if (unlikely(!page)) {
> >>>> +                     msleep(100);
> >>>> +                     /*
> >>>> +                      * We should retry infinitely, because we cannot
> >>>> +                      * handle allocation failures. Once we allocate
> >>>> +                      * vmemmap pages successfully, then we can free
> >>>> +                      * a HugeTLB page.
> >>>> +                      */
> >>>> +                     goto retry;
> >>>
> >>> Ugh, I don't think this will work, there's no guarantee that we'll ever
> >>> succeed and now we can't free a 2MB hugepage because we cannot allocate a
> >>> 4KB page.  We absolutely have to ensure we make forward progress here.
> >>
> >> This can trigger a OOM when there is no memory and kill someone to release
> >> some memory. Right?
> >>
> >>>
> >>> We're going to be freeing the hugetlb page after this succeeeds, can we
> >>> not use part of the hugetlb page that we're freeing for this memory
> >>> instead?
> >>
> >> It seems a good idea. We can try to allocate memory firstly, if successful,
> >> just use the new page to remap (it can reduce memory fragmentation).
> >> If not, we can use part of the hugetlb page to remap. What's your opinion
> >> about this?
> >
> > If the HugeTLB page is a gigantic page which is allocated from
> > CMA. In this case, we cannot use part of the hugetlb page to remap.
> > Right?
>
> Right; and I don't think the "reuse part of a huge page as vmemmap while
> freeing, while that part itself might not have a proper vmemmap yet (or
> might cover itself now)" is particularly straight forward. Maybe I'm
> wrong :)
>
> Also, watch out for huge pages on ZONE_MOVABLE, in that case you also
> shouldn't allocate the vmemmap from there ...

Yeah, you are right. So I tend to trigger the OOM killer to kill other processes and
reclaim some memory when the memory allocation fails.
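
For reference, the difference being discussed is mostly the gfp mask: the posted
mask includes __GFP_RETRY_MAYFAIL, which tells the page allocator not to invoke
the OOM killer. A sketch of the variant that lets the allocator OOM-kill on our
behalf (an assumption, not part of the series):

		page = alloc_pages_node(nid, GFP_KERNEL | __GFP_NOWARN |
					     __GFP_THISNODE, 0);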

>
> --
> Thanks,
>
> David / dhildenb
>
Mike Kravetz Jan. 25, 2021, 11:25 p.m. UTC | #6
On 1/25/21 1:34 AM, Muchun Song wrote:
> On Mon, Jan 25, 2021 at 5:15 PM David Hildenbrand <david@redhat.com> wrote:
>>
>> On 25.01.21 08:41, Muchun Song wrote:
>>> On Mon, Jan 25, 2021 at 2:40 PM Muchun Song <songmuchun@bytedance.com> wrote:
>>>>
>>>> On Mon, Jan 25, 2021 at 8:05 AM David Rientjes <rientjes@google.com> wrote:
>>>>>
>>>>>
>>>>> On Sun, 17 Jan 2021, Muchun Song wrote:
>>>>>
>>>>>> diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
>>>>>> index ce4be1fa93c2..3b146d5949f3 100644
>>>>>> --- a/mm/sparse-vmemmap.c
>>>>>> +++ b/mm/sparse-vmemmap.c
>>>>>> @@ -29,6 +29,7 @@
>>>>>>  #include <linux/sched.h>
>>>>>>  #include <linux/pgtable.h>
>>>>>>  #include <linux/bootmem_info.h>
>>>>>> +#include <linux/delay.h>
>>>>>>
>>>>>>  #include <asm/dma.h>
>>>>>>  #include <asm/pgalloc.h>
>>>>>> @@ -40,7 +41,8 @@
>>>>>>   * @remap_pte:               called for each non-empty PTE (lowest-level) entry.
>>>>>>   * @reuse_page:              the page which is reused for the tail vmemmap pages.
>>>>>>   * @reuse_addr:              the virtual address of the @reuse_page page.
>>>>>> - * @vmemmap_pages:   the list head of the vmemmap pages that can be freed.
>>>>>> + * @vmemmap_pages:   the list head of the vmemmap pages that can be freed
>>>>>> + *                   or is mapped from.
>>>>>>   */
>>>>>>  struct vmemmap_remap_walk {
>>>>>>       void (*remap_pte)(pte_t *pte, unsigned long addr,
>>>>>> @@ -50,6 +52,10 @@ struct vmemmap_remap_walk {
>>>>>>       struct list_head *vmemmap_pages;
>>>>>>  };
>>>>>>
>>>>>> +/* The gfp mask of allocating vmemmap page */
>>>>>> +#define GFP_VMEMMAP_PAGE             \
>>>>>> +     (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | __GFP_THISNODE)
>>>>>> +
>>>>>
>>>>> This is unnecessary, just use the gfp mask directly in allocator.
>>>>
>>>> Will do. Thanks.
>>>>
>>>>>
>>>>>>  static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
>>>>>>                             unsigned long end,
>>>>>>                             struct vmemmap_remap_walk *walk)
>>>>>> @@ -228,6 +234,75 @@ void vmemmap_remap_free(unsigned long start, unsigned long end,
>>>>>>       free_vmemmap_page_list(&vmemmap_pages);
>>>>>>  }
>>>>>>
>>>>>> +static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
>>>>>> +                             struct vmemmap_remap_walk *walk)
>>>>>> +{
>>>>>> +     pgprot_t pgprot = PAGE_KERNEL;
>>>>>> +     struct page *page;
>>>>>> +     void *to;
>>>>>> +
>>>>>> +     BUG_ON(pte_page(*pte) != walk->reuse_page);
>>>>>> +
>>>>>> +     page = list_first_entry(walk->vmemmap_pages, struct page, lru);
>>>>>> +     list_del(&page->lru);
>>>>>> +     to = page_to_virt(page);
>>>>>> +     copy_page(to, (void *)walk->reuse_addr);
>>>>>> +
>>>>>> +     set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
>>>>>> +}
>>>>>> +
>>>>>> +static void alloc_vmemmap_page_list(struct list_head *list,
>>>>>> +                                 unsigned long start, unsigned long end)
>>>>>> +{
>>>>>> +     unsigned long addr;
>>>>>> +
>>>>>> +     for (addr = start; addr < end; addr += PAGE_SIZE) {
>>>>>> +             struct page *page;
>>>>>> +             int nid = page_to_nid((const void *)addr);
>>>>>> +
>>>>>> +retry:
>>>>>> +             page = alloc_pages_node(nid, GFP_VMEMMAP_PAGE, 0);
>>>>>> +             if (unlikely(!page)) {
>>>>>> +                     msleep(100);
>>>>>> +                     /*
>>>>>> +                      * We should retry infinitely, because we cannot
>>>>>> +                      * handle allocation failures. Once we allocate
>>>>>> +                      * vmemmap pages successfully, then we can free
>>>>>> +                      * a HugeTLB page.
>>>>>> +                      */
>>>>>> +                     goto retry;
>>>>>
>>>>> Ugh, I don't think this will work, there's no guarantee that we'll ever
>>>>> succeed and now we can't free a 2MB hugepage because we cannot allocate a
>>>>> 4KB page.  We absolutely have to ensure we make forward progress here.
>>>>
>>>> This can trigger a OOM when there is no memory and kill someone to release
>>>> some memory. Right?
>>>>
>>>>>
>>>>> We're going to be freeing the hugetlb page after this succeeeds, can we
>>>>> not use part of the hugetlb page that we're freeing for this memory
>>>>> instead?
>>>>
>>>> It seems a good idea. We can try to allocate memory firstly, if successful,
>>>> just use the new page to remap (it can reduce memory fragmentation).
>>>> If not, we can use part of the hugetlb page to remap. What's your opinion
>>>> about this?
>>>
>>> If the HugeTLB page is a gigantic page which is allocated from
>>> CMA. In this case, we cannot use part of the hugetlb page to remap.
>>> Right?
>>
>> Right; and I don't think the "reuse part of a huge page as vmemmap while
>> freeing, while that part itself might not have a proper vmemmap yet (or
>> might cover itself now)" is particularly straight forward. Maybe I'm
>> wrong :)
>>
>> Also, watch out for huge pages on ZONE_MOVABLE, in that case you also
>> shouldn't allocate the vmemmap from there ...
> 
> Yeah, you are right. So I tend to trigger OOM to kill other processes to
> reclaim some memory when we allocate memory fails.

IIUC, even non-gigantic hugetlb pages can exist in CMA.  They can be migrated
out of CMA if needed (except free pages in the pool, but that is a separate
issue David H already noted in another thread).

When we first started discussing this patch set, one suggestion was to force
hugetlb pool pages to be allocated at boot time and never permit them to be
freed back to the buddy allocator.  A primary reason for the suggestion was
to avoid this issue of needing to allocate memory when freeing a hugetlb page
to buddy.  IMO, that would be an unreasonable restriction for many existing
hugetlb use cases.

A simple thought is that we simply fail the 'freeing hugetlb page to buddy'
operation if we cannot allocate the required vmemmap pages.  However, as David R
says, freeing hugetlb pages to buddy is a reasonable way to free up memory in
OOM situations.  Still, failing the operation 'might' be better than looping
forever trying to allocate the pages needed?  As mentioned in the previous
patch, it would be better to use GFP_ATOMIC to at least dip into reserves if
we can.
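
A sketch of the 'fail instead of looping' direction (hypothetical signature change;
unwinding the pages already allocated on error is omitted):

	static int alloc_vmemmap_page_list(struct list_head *list,
					   unsigned long start, unsigned long end)
	{
		unsigned long addr;

		for (addr = start; addr < end; addr += PAGE_SIZE) {
			int nid = page_to_nid((struct page *)addr);
			struct page *page;

			/* GFP_ATOMIC dips into reserves rather than retrying forever. */
			page = alloc_pages_node(nid, GFP_ATOMIC | __GFP_NOWARN |
						     __GFP_THISNODE, 0);
			if (!page)
				return -ENOMEM;	/* caller keeps the huge page in the pool */
			list_add_tail(&page->lru, list);
		}
		return 0;
	}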

I think using pages of the hugetlb for vmemmap to cover pages of the hugetlb
is the only way we can guarantee success of freeing a hugetlb page to buddy.
However, this should only be used when there is no other option and could
result in vmemmap pages residing in CMA or ZONE_MOVABLE.  I'm not sure how
much better this is than failing the free to buddy operation.

I don't have a solution.  Just wanted to share some thoughts.

BTW, just thought of something else.  Consider offlining a memory section that
contains a free hugetlb page.  The offline code will try to dissolve the hugetlb
page (free to buddy).  So, vmemmap pages will need to be allocated.  We will
try to allocate vmemmap pages on the same node as the hugetlb page.  But, if
this memory section is the last of the node all the pages will have been
isolated and no allocations will succeed.  Is that a possible scenario, or am
I just having too many negative thoughts?
Oscar Salvador Jan. 26, 2021, 7:48 a.m. UTC | #7
On Mon, Jan 25, 2021 at 03:25:35PM -0800, Mike Kravetz wrote:
> IIUC, even non-gigantic hugetlb pages can exist in CMA.  They can be migrated
> out of CMA if needed (except free pages in the pool, but that is a separate
> issue David H already noted in another thread).

Yeah, as discussed I am taking a look at that.
 
> When we first started discussing this patch set, one suggestion was to force
> hugetlb pool pages to be allocated at boot time and never permit them to be
> freed back to the buddy allocator.  A primary reason for the suggestion was
> to avoid this issue of needing to allocate memory when freeing a hugetlb page
> to buddy.  IMO, that would be an unreasonable restriction for many existing
> hugetlb use cases.

AFAIK it was suggested as a way to simplify things in the first go of this
patchset.
Please note that the first versions of this patchset were dealing with PMD
mapped vmemmap pages and overall it was quite convoluted for a first
version.
Since then, things have simplified quite a lot (e.g: we went from 22 patches to 12),
so I do not feel the need to force the pages to be allocated at boot time.

> A simple thought is that we simply fail the 'freeing hugetlb page to buddy'
> if we can not allocate the required vmemmap pages.  However, as David R says
> freeing hugetlb pages to buddy is a reasonable way to free up memory in oom
> situations.  However, failing the operation 'might' be better than looping
> forever trying to allocate the pages needed?  As mentioned in the previous
> patch, it would be better to use GFP_ATOMIC to at least dip into reserves if
> we can.

I also agree that GFP_ATOMIC might make some sense.
If the system is under memory pressure, I think it is best if we go the extra
mile in order to free up to 4096 pages or 512 pages.
Otherwise we might have a nice hugetlb page we might not need and a lack of
memory.

> I think using pages of the hugetlb for vmemmap to cover pages of the hugetlb
> is the only way we can guarantee success of freeing a hugetlb page to buddy.
> However, this should only only be used when there is no other option and could
> result in vmemmap pages residing in CMA or ZONE_MOVABLE.  I'm not sure how
> much better this is than failing the free to buddy operation.

And how would you tell when there is no other option?

> I don't have a solution.  Just wanted to share some thoughts.
> 
> BTW, just thought of something else.  Consider offlining a memory section that
> contains a free hugetlb page.  The offline code will try to disolve the hugetlb
> page (free to buddy).  So, vmemmap pages will need to be allocated.  We will
> try to allocate vmemap pages on the same node as the hugetlb page.  But, if
> this memory section is the last of the node all the pages will have been
> isolated and no allocations will succeed.  Is that a possible scenario, or am
> I just having too many negative thoughts?

IIUC, GFP_ATOMIC will reset ALLOC_CPUSET flags at some point and the nodemask will
be cleared, so I guess the system will try to allocate from another node.
But I am not sure about that one.

I would like to hear Michal's thoughts on this.
Oscar Salvador Jan. 26, 2021, 9:29 a.m. UTC | #8
On Sun, Jan 17, 2021 at 11:10:46PM +0800, Muchun Song wrote:
> When we free a HugeTLB page to the buddy allocator, we should allocate the
> vmemmap pages associated with it. We can do that in the __free_hugepage()
> before freeing it to buddy.
> 
> Signed-off-by: Muchun Song <songmuchun@bytedance.com>

This series has gained a certain degree of maturity and improvement, but it seems
to me that we have been stuck in this patch (and patch#4) for quite some time.

Would it be acceptable for a first implementation to not let hugetlb pages
be freed when this feature is in use?
This would simplify things for now, as we could get rid of patch#4 and patch#5.
We can always extend functionality once this has been merged, right?

Of course, this means that e.g: memory-hotplug (hot-remove) will not fully work
when this is in place, but well.

I would like to hear what others think, but in my opinion it would be a big step
to move on.
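
As a sketch of what that restriction could look like (hypothetical guard;
free_vmemmap_pages_per_hpage() is the helper the series already uses to tell
whether the optimization applies to an hstate):

	int dissolve_free_huge_page(struct page *page)
	{
		/*
		 * Hypothetical: if the vmemmap of this hstate is freed, refuse
		 * to dissolve the huge page instead of allocating vmemmap pages.
		 */
		if (free_vmemmap_pages_per_hpage(page_hstate(page)))
			return -EBUSY;

		/* ... existing dissolve path, freeing the page to buddy ... */
		return 0;
	}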



> ---
>  include/linux/mm.h   |  2 ++
>  mm/hugetlb.c         |  2 ++
>  mm/hugetlb_vmemmap.c | 15 ++++++++++
>  mm/hugetlb_vmemmap.h |  5 ++++
>  mm/sparse-vmemmap.c  | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>  5 files changed, 100 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index f928994ed273..16b55d13b0ab 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3007,6 +3007,8 @@ static inline void print_vma_addr(char *prefix, unsigned long rip)
>  
>  void vmemmap_remap_free(unsigned long start, unsigned long end,
>  			unsigned long reuse);
> +void vmemmap_remap_alloc(unsigned long start, unsigned long end,
> +			 unsigned long reuse);
>  
>  void *sparse_buffer_alloc(unsigned long size);
>  struct page * __populate_section_memmap(unsigned long pfn,
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index c165186ec2cf..d11c32fcdb38 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1326,6 +1326,8 @@ static void update_hpage_vmemmap_workfn(struct work_struct *work)
>  		page->mapping = NULL;
>  		h = page_hstate(page);
>  
> +		alloc_huge_page_vmemmap(h, page);
> +
>  		spin_lock(&hugetlb_lock);
>  		__free_hugepage(h, page);
>  		spin_unlock(&hugetlb_lock);
> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> index 19f1898aaede..6108ae80314f 100644
> --- a/mm/hugetlb_vmemmap.c
> +++ b/mm/hugetlb_vmemmap.c
> @@ -183,6 +183,21 @@ static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h)
>  	return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT;
>  }
>  
> +void alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
> +{
> +	unsigned long vmemmap_addr = (unsigned long)head;
> +	unsigned long vmemmap_end, vmemmap_reuse;
> +
> +	if (!free_vmemmap_pages_per_hpage(h))
> +		return;
> +
> +	vmemmap_addr += RESERVE_VMEMMAP_SIZE;
> +	vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h);
> +	vmemmap_reuse = vmemmap_addr - PAGE_SIZE;
> +
> +	vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse);
> +}
> +
>  void free_huge_page_vmemmap(struct hstate *h, struct page *head)
>  {
>  	unsigned long vmemmap_addr = (unsigned long)head;
> diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
> index 01f8637adbe0..b2c8d2f11d48 100644
> --- a/mm/hugetlb_vmemmap.h
> +++ b/mm/hugetlb_vmemmap.h
> @@ -11,6 +11,7 @@
>  #include <linux/hugetlb.h>
>  
>  #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
> +void alloc_huge_page_vmemmap(struct hstate *h, struct page *head);
>  void free_huge_page_vmemmap(struct hstate *h, struct page *head);
>  
>  /*
> @@ -25,6 +26,10 @@ static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
>  	return 0;
>  }
>  #else
> +static inline void alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
> +{
> +}
> +
>  static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head)
>  {
>  }
> diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
> index ce4be1fa93c2..3b146d5949f3 100644
> --- a/mm/sparse-vmemmap.c
> +++ b/mm/sparse-vmemmap.c
> @@ -29,6 +29,7 @@
>  #include <linux/sched.h>
>  #include <linux/pgtable.h>
>  #include <linux/bootmem_info.h>
> +#include <linux/delay.h>
>  
>  #include <asm/dma.h>
>  #include <asm/pgalloc.h>
> @@ -40,7 +41,8 @@
>   * @remap_pte:		called for each non-empty PTE (lowest-level) entry.
>   * @reuse_page:		the page which is reused for the tail vmemmap pages.
>   * @reuse_addr:		the virtual address of the @reuse_page page.
> - * @vmemmap_pages:	the list head of the vmemmap pages that can be freed.
> + * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
> + *			or is mapped from.
>   */
>  struct vmemmap_remap_walk {
>  	void (*remap_pte)(pte_t *pte, unsigned long addr,
> @@ -50,6 +52,10 @@ struct vmemmap_remap_walk {
>  	struct list_head *vmemmap_pages;
>  };
>  
> +/* The gfp mask of allocating vmemmap page */
> +#define GFP_VMEMMAP_PAGE		\
> +	(GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | __GFP_THISNODE)
> +
>  static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
>  			      unsigned long end,
>  			      struct vmemmap_remap_walk *walk)
> @@ -228,6 +234,75 @@ void vmemmap_remap_free(unsigned long start, unsigned long end,
>  	free_vmemmap_page_list(&vmemmap_pages);
>  }
>  
> +static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
> +				struct vmemmap_remap_walk *walk)
> +{
> +	pgprot_t pgprot = PAGE_KERNEL;
> +	struct page *page;
> +	void *to;
> +
> +	BUG_ON(pte_page(*pte) != walk->reuse_page);
> +
> +	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
> +	list_del(&page->lru);
> +	to = page_to_virt(page);
> +	copy_page(to, (void *)walk->reuse_addr);
> +
> +	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
> +}
> +
> +static void alloc_vmemmap_page_list(struct list_head *list,
> +				    unsigned long start, unsigned long end)
> +{
> +	unsigned long addr;
> +
> +	for (addr = start; addr < end; addr += PAGE_SIZE) {
> +		struct page *page;
> +		int nid = page_to_nid((const void *)addr);
> +
> +retry:
> +		page = alloc_pages_node(nid, GFP_VMEMMAP_PAGE, 0);
> +		if (unlikely(!page)) {
> +			msleep(100);
> +			/*
> +			 * We should retry infinitely, because we cannot
> +			 * handle allocation failures. Once we allocate
> +			 * vmemmap pages successfully, then we can free
> +			 * a HugeTLB page.
> +			 */
> +			goto retry;
> +		}
> +		list_add_tail(&page->lru, list);
> +	}
> +}
> +
> +/**
> + * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end)
> + *			 to the page which is from the @vmemmap_pages
> + *			 respectively.
> + * @start:	start address of the vmemmap virtual address range.
> + * @end:	end address of the vmemmap virtual address range.
> + * @reuse:	reuse address.
> + */
> +void vmemmap_remap_alloc(unsigned long start, unsigned long end,
> +			 unsigned long reuse)
> +{
> +	LIST_HEAD(vmemmap_pages);
> +	struct vmemmap_remap_walk walk = {
> +		.remap_pte	= vmemmap_restore_pte,
> +		.reuse_addr	= reuse,
> +		.vmemmap_pages	= &vmemmap_pages,
> +	};
> +
> +	might_sleep();
> +
> +	/* See the comment in the vmemmap_remap_free(). */
> +	BUG_ON(start - reuse != PAGE_SIZE);
> +
> +	alloc_vmemmap_page_list(&vmemmap_pages, start, end);
> +	vmemmap_remap_range(reuse, end, &walk);
> +}
> +
>  /*
>   * Allocate a block of memory to be used to back the virtual memory map
>   * or to back the page tables that are used to create the mapping.
> -- 
> 2.11.0
> 
>
David Hildenbrand Jan. 26, 2021, 9:36 a.m. UTC | #9
On 26.01.21 10:29, Oscar Salvador wrote:
> On Sun, Jan 17, 2021 at 11:10:46PM +0800, Muchun Song wrote:
>> When we free a HugeTLB page to the buddy allocator, we should allocate the
>> vmemmap pages associated with it. We can do that in the __free_hugepage()
>> before freeing it to buddy.
>>
>> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> 
> This series has grown a certain grade of madurity and improvment, but it seems
> to me that we have been stuck in this patch (and patch#4) for quite some time.
> 
> Would it be acceptable for a first implementation to not let hugetlb pages to
> be freed when this feature is in use?
> This would simplify things for now, as we could get rid of patch#4 and patch#5.
> We can always extend functionality once this has been merged, right?

I think either keep it completely simple (only free vmemmap of hugetlb
pages allocated early during boot - which is what's not sufficient for
some use cases) or implement the full thing properly (meaning, solve
most challenging issues to get the basics running).

I don't want to have some easy parts of complex features merged (e.g.,
breaking other stuff as you indicate below), and later finding out "it's
not that easy" again and being stuck with it forever.

> 
> Of course, this means that e.g: memory-hotplug (hot-remove) will not fully work
> when this in place, but well.

Can you elaborate? Are we're talking about having hugepages in
ZONE_MOVABLE that are not migratable (and/or dissolvable) anymore? Than
a clear NACK from my side.
Oscar Salvador Jan. 26, 2021, 2:58 p.m. UTC | #10
On Tue, Jan 26, 2021 at 10:36:21AM +0100, David Hildenbrand wrote:
> I think either keep it completely simple (only free vmemmap of hugetlb
> pages allocated early during boot - which is what's not sufficient for
> some use cases) or implement the full thing properly (meaning, solve
> most challenging issues to get the basics running).
> 
> I don't want to have some easy parts of complex features merged (e.g.,
> breaking other stuff as you indicate below), and later finding out "it's
> not that easy" again and being stuck with it forever.

Well, we could try to do an optimistic allocation, without tricky looping.
If that fails, refuse to shrink the pool at that moment.

The user could always try to shrink it later via /proc/sys/vm/nr_hugepages
interface.

But I am just thinking out loud..

> > Of course, this means that e.g: memory-hotplug (hot-remove) will not fully work
> > when this in place, but well.
> 
> Can you elaborate? Are we're talking about having hugepages in
> ZONE_MOVABLE that are not migratable (and/or dissolvable) anymore? Than
> a clear NACK from my side.

Pretty much, yeah.
David Hildenbrand Jan. 26, 2021, 3:10 p.m. UTC | #11
On 26.01.21 15:58, Oscar Salvador wrote:
> On Tue, Jan 26, 2021 at 10:36:21AM +0100, David Hildenbrand wrote:
>> I think either keep it completely simple (only free vmemmap of hugetlb
>> pages allocated early during boot - which is what's not sufficient for
>> some use cases) or implement the full thing properly (meaning, solve
>> most challenging issues to get the basics running).
>>
>> I don't want to have some easy parts of complex features merged (e.g.,
>> breaking other stuff as you indicate below), and later finding out "it's
>> not that easy" again and being stuck with it forever.
> 
> Well, we could try to do an optimistic allocation, without tricky loopings.
> If that fails, refuse to shrink the pool at that moment.
> 
> The user could always try to shrink it later via /proc/sys/vm/nr_hugepages
> interface.
> 
> But I am just thinking out loud..

The real issue seems to be discarding the vmemmap on any memory that has 
movability constraints - CMA and ZONE_MOVABLE; otherwise, as discussed, 
we can reuse parts of the thingy we're freeing for the vmemmap. Not that 
it would be ideal: that once-a-huge-page thing will never ever be a huge 
page again - but if it helps with OOM in corner cases, sure.

Possible simplification: don't perform the optimization for now with 
free huge pages residing on ZONE_MOVABLE or CMA. Certainly not perfect: 
what happens when migrating a huge page from ZONE_NORMAL to 
(ZONE_MOVABLE|CMA)?
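
A sketch of that simplification (hypothetical check at the top of
free_huge_page_vmemmap(); page_zonenum() and is_migrate_cma_page() are existing
helpers):

	/* Hypothetical: skip the optimization where restoring vmemmap may be impossible. */
	if (page_zonenum(head) == ZONE_MOVABLE || is_migrate_cma_page(head))
		return;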

> 
>>> Of course, this means that e.g: memory-hotplug (hot-remove) will not fully work
>>> when this in place, but well.
>>
>> Can you elaborate? Are we're talking about having hugepages in
>> ZONE_MOVABLE that are not migratable (and/or dissolvable) anymore? Than
>> a clear NACK from my side.
> 
> Pretty much, yeah.

Note that we most likely soon have to tackle migrating/dissolving (free) 
hugetlbfs pages from alloc_contig_range() context - e.g., for CMA 
allocations. That's certainly something to keep in mind regarding any 
approaches that already break offline_pages().
Oscar Salvador Jan. 26, 2021, 3:34 p.m. UTC | #12
On Tue, Jan 26, 2021 at 04:10:53PM +0100, David Hildenbrand wrote:
> The real issue seems to be discarding the vmemmap on any memory that has
> movability constraints - CMA and ZONE_MOVABLE; otherwise, as discussed, we
> can reuse parts of the thingy we're freeing for the vmemmap. Not that it
> would be ideal: that once-a-huge-page thing will never ever be a huge page
> again - but if it helps with OOM in corner cases, sure.

Yes, that is one way, but I am not sure how hard it would be to implement.
Plus the fact that, as you pointed out, once that memory is used for the vmemmap
array, we cannot use it again.
Actually, we would fragment the memory eventually?

> Possible simplification: don't perform the optimization for now with free
> huge pages residing on ZONE_MOVABLE or CMA. Certainly not perfect: what
> happens when migrating a huge page from ZONE_NORMAL to (ZONE_MOVABLE|CMA)?

But if we do not allow those pages to be in ZONE_MOVABLE or CMA, there is no
point in migrating them, right?

> > > > Of course, this means that e.g: memory-hotplug (hot-remove) will not fully work
> > > > when this in place, but well.
> > > 
> > > Can you elaborate? Are we're talking about having hugepages in
> > > ZONE_MOVABLE that are not migratable (and/or dissolvable) anymore? Than
> > > a clear NACK from my side.
> > 
> > Pretty much, yeah.
> 
> Note that we most likely soon have to tackle migrating/dissolving (free)
> hugetlbfs pages from alloc_contig_range() context - e.g., for CMA
> allocations. That's certainly something to keep in mind regarding any
> approaches that already break offline_pages().

Definitely. I already talked to Mike about that and I am going to have
a look into it pretty soon.
David Hildenbrand Jan. 26, 2021, 3:56 p.m. UTC | #13
On 26.01.21 16:34, Oscar Salvador wrote:
> On Tue, Jan 26, 2021 at 04:10:53PM +0100, David Hildenbrand wrote:
>> The real issue seems to be discarding the vmemmap on any memory that has
>> movability constraints - CMA and ZONE_MOVABLE; otherwise, as discussed, we
>> can reuse parts of the thingy we're freeing for the vmemmap. Not that it
>> would be ideal: that once-a-huge-page thing will never ever be a huge page
>> again - but if it helps with OOM in corner cases, sure.
> 
> Yes, that is one way, but I am not sure how hard would it be to implement.
> Plus the fact that as you pointed out, once that memory is used for vmemmap
> array, we cannot use it again.
> Actually, we would fragment the memory eventually?
> 
>> Possible simplification: don't perform the optimization for now with free
>> huge pages residing on ZONE_MOVABLE or CMA. Certainly not perfect: what
>> happens when migrating a huge page from ZONE_NORMAL to (ZONE_MOVABLE|CMA)?
> 
> But if we do not allow theose pages to be in ZONE_MOVABLE or CMA, there is no
> point in migrate them, right?

Well, memory unplug "could" still work and migrate them and 
alloc_contig_range() "could in the future" still want to migrate them 
(virtio-mem, gigantic pages, powernv memtrace). Especially, the latter 
two don't work with ZONE_MOVABLE/CMA. But, I mean, it would be fair 
enough to say "there are no guarantees for 
alloc_contig_range()/offline_pages() with ZONE_NORMAL, so we can break 
these use cases when a magic switch is flipped and make these pages 
non-migratable anymore".

I assume compaction doesn't care about huge pages either way, not sure 
about numa balancing etc.


However, note that there is a fundamental issue with any approach that 
allocates a significant amount of unmovable memory for user-space 
purposes (excluding CMA allocations for unmovable stuff, CMA is 
special): pairing it with ZONE_MOVABLE becomes very tricky as your user 
space might just end up eating all kernel memory, although the system 
still looks like there is plenty of free memory residing in 
ZONE_MOVABLE. I mentioned that in the context of secretmem in a reduced 
form as well.

We theoretically have that issue with dynamic allocation of gigantic 
pages, but it's something a user explicitly/rarely triggers and it can 
be documented to cause problems well enough. We'll have the same issue 
with GUP+ZONE_MOVABLE that Pavel is fixing right now - but GUP is 
already known to be broken in various ways and that it has to be treated 
in a special way. I'd like to limit the nasty corner cases.

Of course, we could have smart rules like "don't online memory to 
ZONE_MOVABLE automatically when the magic switch is active". That's just 
ugly, but could work.
David Hildenbrand Jan. 27, 2021, 10:36 a.m. UTC | #14
On 26.01.21 16:56, David Hildenbrand wrote:
> On 26.01.21 16:34, Oscar Salvador wrote:
>> On Tue, Jan 26, 2021 at 04:10:53PM +0100, David Hildenbrand wrote:
>>> The real issue seems to be discarding the vmemmap on any memory that has
>>> movability constraints - CMA and ZONE_MOVABLE; otherwise, as discussed, we
>>> can reuse parts of the thingy we're freeing for the vmemmap. Not that it
>>> would be ideal: that once-a-huge-page thing will never ever be a huge page
>>> again - but if it helps with OOM in corner cases, sure.
>>
>> Yes, that is one way, but I am not sure how hard would it be to implement.
>> Plus the fact that as you pointed out, once that memory is used for vmemmap
>> array, we cannot use it again.
>> Actually, we would fragment the memory eventually?
>>
>>> Possible simplification: don't perform the optimization for now with free
>>> huge pages residing on ZONE_MOVABLE or CMA. Certainly not perfect: what
>>> happens when migrating a huge page from ZONE_NORMAL to (ZONE_MOVABLE|CMA)?
>>
>> But if we do not allow theose pages to be in ZONE_MOVABLE or CMA, there is no
>> point in migrate them, right?
> 
> Well, memory unplug "could" still work and migrate them and
> alloc_contig_range() "could in the future" still want to migrate them
> (virtio-mem, gigantic pages, powernv memtrace). Especially, the latter
> two don't work with ZONE_MOVABLE/CMA. But, I mean, it would be fair
> enough to say "there are no guarantees for
> alloc_contig_range()/offline_pages() with ZONE_NORMAL, so we can break
> these use cases when a magic switch is flipped and make these pages
> non-migratable anymore".
> 
> I assume compaction doesn't care about huge pages either way, not sure
> about numa balancing etc.
> 
> 
> However, note that there is a fundamental issue with any approach that
> allocates a significant amount of unmovable memory for user-space
> purposes (excluding CMA allocations for unmovable stuff, CMA is
> special): pairing it with ZONE_MOVABLE becomes very tricky as your user
> space might just end up eating all kernel memory, although the system
> still looks like there is plenty of free memory residing in
> ZONE_MOVABLE. I mentioned that in the context of secretmem in a reduced
> form as well.
> 
> We theoretically have that issue with dynamic allocation of gigantic
> pages, but it's something a user explicitly/rarely triggers and it can
> be documented to cause problems well enough. We'll have the same issue
> with GUP+ZONE_MOVABLE that Pavel is fixing right now - but GUP is
> already known to be broken in various ways and that it has to be treated
> in a special way. I'd like to limit the nasty corner cases.
> 
> Of course, we could have smart rules like "don't online memory to
> ZONE_MOVABLE automatically when the magic switch is active". That's just
> ugly, but could work.
> 

Extending on that, I just discovered that only x86-64, ppc64, and arm64 
really support hugepage migration.

Maybe one approach with the "magic switch" really would be to disable 
hugepage migration completely in hugepage_migration_supported(), and 
consequently making hugepage_movable_supported() always return false.

Huge pages would never get placed onto ZONE_MOVABLE/CMA and cannot be 
migrated. The problem I describe would apply (careful with using 
ZONE_MOVABLE), but well, it can at least be documented.
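
A sketch of that direction (hypothetical: hugetlb_free_vmemmap_enabled() stands in
for whatever knob ends up guarding the feature):

	static inline bool hugepage_migration_supported(struct hstate *h)
	{
		/* Hypothetical: no hugepage migration while vmemmap freeing is active. */
		if (hugetlb_free_vmemmap_enabled())
			return false;

		return arch_hugetlb_migration_supported(h);
	}

Since hugepage_movable_supported() builds on this check, huge pages would then
also never be placed on ZONE_MOVABLE or CMA, as described above.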
Muchun Song Jan. 28, 2021, 12:37 p.m. UTC | #15
On Wed, Jan 27, 2021 at 6:36 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 26.01.21 16:56, David Hildenbrand wrote:
> > On 26.01.21 16:34, Oscar Salvador wrote:
> >> On Tue, Jan 26, 2021 at 04:10:53PM +0100, David Hildenbrand wrote:
> >>> The real issue seems to be discarding the vmemmap on any memory that has
> >>> movability constraints - CMA and ZONE_MOVABLE; otherwise, as discussed, we
> >>> can reuse parts of the thingy we're freeing for the vmemmap. Not that it
> >>> would be ideal: that once-a-huge-page thing will never ever be a huge page
> >>> again - but if it helps with OOM in corner cases, sure.
> >>
> >> Yes, that is one way, but I am not sure how hard would it be to implement.
> >> Plus the fact that as you pointed out, once that memory is used for vmemmap
> >> array, we cannot use it again.
> >> Actually, we would fragment the memory eventually?
> >>
> >>> Possible simplification: don't perform the optimization for now with free
> >>> huge pages residing on ZONE_MOVABLE or CMA. Certainly not perfect: what
> >>> happens when migrating a huge page from ZONE_NORMAL to (ZONE_MOVABLE|CMA)?
> >>
> >> But if we do not allow theose pages to be in ZONE_MOVABLE or CMA, there is no
> >> point in migrate them, right?
> >
> > Well, memory unplug "could" still work and migrate them and
> > alloc_contig_range() "could in the future" still want to migrate them
> > (virtio-mem, gigantic pages, powernv memtrace). Especially, the latter
> > two don't work with ZONE_MOVABLE/CMA. But, I mean, it would be fair
> > enough to say "there are no guarantees for
> > alloc_contig_range()/offline_pages() with ZONE_NORMAL, so we can break
> > these use cases when a magic switch is flipped and make these pages
> > non-migratable anymore".
> >
> > I assume compaction doesn't care about huge pages either way, not sure
> > about numa balancing etc.
> >
> >
> > However, note that there is a fundamental issue with any approach that
> > allocates a significant amount of unmovable memory for user-space
> > purposes (excluding CMA allocations for unmovable stuff, CMA is
> > special): pairing it with ZONE_MOVABLE becomes very tricky as your user
> > space might just end up eating all kernel memory, although the system
> > still looks like there is plenty of free memory residing in
> > ZONE_MOVABLE. I mentioned that in the context of secretmem in a reduced
> > form as well.
> >
> > We theoretically have that issue with dynamic allocation of gigantic
> > pages, but it's something a user explicitly/rarely triggers and it can
> > be documented to cause problems well enough. We'll have the same issue
> > with GUP+ZONE_MOVABLE that Pavel is fixing right now - but GUP is
> > already known to be broken in various ways and that it has to be treated
> > in a special way. I'd like to limit the nasty corner cases.
> >
> > Of course, we could have smart rules like "don't online memory to
> > ZONE_MOVABLE automatically when the magic switch is active". That's just
> > ugly, but could work.
> >
>
> Extending on that, I just discovered that only x86-64, ppc64, and arm64
> really support hugepage migration.
>
> Maybe one approach with the "magic switch" really would be to disable
> hugepage migration completely in hugepage_migration_supported(), and
> consequently making hugepage_movable_supported() always return false.
>
> Huge pages would never get placed onto ZONE_MOVABLE/CMA and cannot be
> migrated. The problem I describe would apply (careful with using
> ZONE_MOVABLE), but well, it can at least be documented.

Thanks for your explanation.

All of this thinking seems to be triggered by the possibility of OOM. :-(

In order to move forward and free the hugepage, we should add some
restrictions, listed below.

1. Only free the hugepage which is allocated from the ZONE_NORMAL.
2. Disable hugepage migration when this feature is enabled.
3. Use GFP_ATOMIC to allocate vmemmap pages first (it can reduce
   memory fragmentation); if that fails, use part of the hugepage itself
   for the remap (a rough sketch follows after this list).
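
For illustration only, a minimal sketch of how point 3 could be
structured; every helper name below is hypothetical and not part of
this series, it only shows the intended control flow:

static int hugetlb_vmemmap_restore_sketch(struct hstate *h, struct page *head)
{
	LIST_HEAD(vmemmap_pages);

	/* 1) Optimistic try: may dip into reserves, never warns or loops. */
	if (!vmemmap_alloc_optimistic(&vmemmap_pages, h, head,
				      GFP_ATOMIC | __GFP_NOWARN))
		return vmemmap_remap_from_list(h, head, &vmemmap_pages);

	/*
	 * 2) Fallback: carve the vmemmap out of the HugeTLB page itself,
	 *    accepting that this range can never become a huge page again.
	 */
	return vmemmap_remap_from_hugepage(h, head);
}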

Hi Oscar, Mike and David H

What's your opinion about this? Should we take this approach?

Thanks.

>
> --
> Thanks,
>
> David / dhildenb
>
Muchun Song Jan. 28, 2021, 1:08 p.m. UTC | #16
On Thu, Jan 28, 2021 at 8:37 PM Muchun Song <songmuchun@bytedance.com> wrote:
>
> On Wed, Jan 27, 2021 at 6:36 PM David Hildenbrand <david@redhat.com> wrote:
> >
> > On 26.01.21 16:56, David Hildenbrand wrote:
> > > On 26.01.21 16:34, Oscar Salvador wrote:
> > >> On Tue, Jan 26, 2021 at 04:10:53PM +0100, David Hildenbrand wrote:
> > >>> The real issue seems to be discarding the vmemmap on any memory that has
> > >>> movability constraints - CMA and ZONE_MOVABLE; otherwise, as discussed, we
> > >>> can reuse parts of the thingy we're freeing for the vmemmap. Not that it
> > >>> would be ideal: that once-a-huge-page thing will never ever be a huge page
> > >>> again - but if it helps with OOM in corner cases, sure.
> > >>
> > >> Yes, that is one way, but I am not sure how hard would it be to implement.
> > >> Plus the fact that as you pointed out, once that memory is used for vmemmap
> > >> array, we cannot use it again.
> > >> Actually, we would fragment the memory eventually?
> > >>
> > >>> Possible simplification: don't perform the optimization for now with free
> > >>> huge pages residing on ZONE_MOVABLE or CMA. Certainly not perfect: what
> > >>> happens when migrating a huge page from ZONE_NORMAL to (ZONE_MOVABLE|CMA)?
> > >>
> > >> But if we do not allow theose pages to be in ZONE_MOVABLE or CMA, there is no
> > >> point in migrate them, right?
> > >
> > > Well, memory unplug "could" still work and migrate them and
> > > alloc_contig_range() "could in the future" still want to migrate them
> > > (virtio-mem, gigantic pages, powernv memtrace). Especially, the latter
> > > two don't work with ZONE_MOVABLE/CMA. But, I mean, it would be fair
> > > enough to say "there are no guarantees for
> > > alloc_contig_range()/offline_pages() with ZONE_NORMAL, so we can break
> > > these use cases when a magic switch is flipped and make these pages
> > > non-migratable anymore".
> > >
> > > I assume compaction doesn't care about huge pages either way, not sure
> > > about numa balancing etc.
> > >
> > >
> > > However, note that there is a fundamental issue with any approach that
> > > allocates a significant amount of unmovable memory for user-space
> > > purposes (excluding CMA allocations for unmovable stuff, CMA is
> > > special): pairing it with ZONE_MOVABLE becomes very tricky as your user
> > > space might just end up eating all kernel memory, although the system
> > > still looks like there is plenty of free memory residing in
> > > ZONE_MOVABLE. I mentioned that in the context of secretmem in a reduced
> > > form as well.
> > >
> > > We theoretically have that issue with dynamic allocation of gigantic
> > > pages, but it's something a user explicitly/rarely triggers and it can
> > > be documented to cause problems well enough. We'll have the same issue
> > > with GUP+ZONE_MOVABLE that Pavel is fixing right now - but GUP is
> > > already known to be broken in various ways and that it has to be treated
> > > in a special way. I'd like to limit the nasty corner cases.
> > >
> > > Of course, we could have smart rules like "don't online memory to
> > > ZONE_MOVABLE automatically when the magic switch is active". That's just
> > > ugly, but could work.
> > >
> >
> > Extending on that, I just discovered that only x86-64, ppc64, and arm64
> > really support hugepage migration.
> >
> > Maybe one approach with the "magic switch" really would be to disable
> > hugepage migration completely in hugepage_migration_supported(), and
> > consequently making hugepage_movable_supported() always return false.
> >
> > Huge pages would never get placed onto ZONE_MOVABLE/CMA and cannot be
> > migrated. The problem I describe would apply (careful with using
> > ZONE_MOVABLE), but well, it can at least be documented.
>
> Thanks for your explanation.
>
> All thinking seems to be introduced by encountering OOM. :-(
>
> In order to move forward and free the hugepage. We should add some
> restrictions below.
>
> 1. Only free the hugepage which is allocated from the ZONE_NORMAL.
           ^^
Sorry. Here "free" should be "optimize".

> 2. Disable hugepage migration when this feature is enabled.
> 3. Using GFP_ATOMIC to allocate vmemmap pages firstly (it can reduce
>    memory fragmentation), if it fails, we use part of the hugepage to
>    remap.
>
> Hi Oscar, Mike and David H
>
> What's your opinion about this? Should we take this approach?
>
> Thanks.
>
> >
> > --
> > Thanks,
> >
> > David / dhildenb
> >
Oscar Salvador Jan. 28, 2021, 10:29 p.m. UTC | #17
On Wed, Jan 27, 2021 at 11:36:15AM +0100, David Hildenbrand wrote:
> Extending on that, I just discovered that only x86-64, ppc64, and arm64
> really support hugepage migration.
> 
> Maybe one approach with the "magic switch" really would be to disable
> hugepage migration completely in hugepage_migration_supported(), and
> consequently making hugepage_movable_supported() always return false.

Ok, so migration would not work for these pages, and since they would
lie in !ZONE_MOVABLE there is no guarantee we can unplug the memory.
Well, we really cannot unplug it unless the hugepage is not in use
(it can at least be dissolved).

Now to the allocation-when-freeing.
The current implementation uses (or wants to use) GFP_ATOMIC plus a forever loop.
One of the problems I see with GFP_ATOMIC is that it gives you access
to memory reserves, but there are other users drawing on those reserves.
In the worst-case scenario we need to allocate 16MB of order-0 pages
to free up a 1GB hugepage, so the question is whether the reserves
really scale to 16MB plus the other users accessing them.
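
(As a back-of-the-envelope check of that 16MB figure, assuming 4KB base
pages and a 64-byte struct page as on x86-64 -- a quick user-space
calculation, ignoring the couple of vmemmap pages that are kept per
hugepage:)

#include <stdio.h>

int main(void)
{
	unsigned long base_page = 4096;				/* bytes */
	unsigned long nr_subpages = (1UL << 30) / base_page;	/* 262144 for 1GB */
	unsigned long vmemmap_bytes = nr_subpages * 64;		/* 64 == sizeof(struct page) */

	/* Prints: vmemmap per 1GB hugepage: 16 MiB = 4096 order-0 pages */
	printf("vmemmap per 1GB hugepage: %lu MiB = %lu order-0 pages\n",
	       vmemmap_bytes >> 20, vmemmap_bytes / base_page);
	return 0;
}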

As I said, if anything I would go for an optimistic allocation try;
if we fail, just refuse to shrink the pool.
The user can always try to shrink it again later via the /sys interface.

Since hugepages would no longer be in ZONE_MOVABLE/CMA and are not
expected to be migratable, is that ok?

Using the hugepage for the vmemmap array was brought up several times,
but that would imply fragmenting memory over time.

All in all it seems to be overly complicated (I might be wrong).


> Huge pages would never get placed onto ZONE_MOVABLE/CMA and cannot be
> migrated. The problem I describe would apply (careful with using
> ZONE_MOVABLE), but well, it can at least be documented.

I am not a page allocator expert, but cannot the allocation fall back
to ZONE_MOVABLE under memory shortage in other zones?
Mike Kravetz Jan. 29, 2021, 1:04 a.m. UTC | #18
On 1/28/21 4:37 AM, Muchun Song wrote:
> On Wed, Jan 27, 2021 at 6:36 PM David Hildenbrand <david@redhat.com> wrote:
>>
>> On 26.01.21 16:56, David Hildenbrand wrote:
>>> On 26.01.21 16:34, Oscar Salvador wrote:
>>>> On Tue, Jan 26, 2021 at 04:10:53PM +0100, David Hildenbrand wrote:
>>>>> The real issue seems to be discarding the vmemmap on any memory that has
>>>>> movability constraints - CMA and ZONE_MOVABLE; otherwise, as discussed, we
>>>>> can reuse parts of the thingy we're freeing for the vmemmap. Not that it
>>>>> would be ideal: that once-a-huge-page thing will never ever be a huge page
>>>>> again - but if it helps with OOM in corner cases, sure.
>>>>
>>>> Yes, that is one way, but I am not sure how hard would it be to implement.
>>>> Plus the fact that as you pointed out, once that memory is used for vmemmap
>>>> array, we cannot use it again.
>>>> Actually, we would fragment the memory eventually?
>>>>
>>>>> Possible simplification: don't perform the optimization for now with free
>>>>> huge pages residing on ZONE_MOVABLE or CMA. Certainly not perfect: what
>>>>> happens when migrating a huge page from ZONE_NORMAL to (ZONE_MOVABLE|CMA)?
>>>>
>>>> But if we do not allow theose pages to be in ZONE_MOVABLE or CMA, there is no
>>>> point in migrate them, right?
>>>
>>> Well, memory unplug "could" still work and migrate them and
>>> alloc_contig_range() "could in the future" still want to migrate them
>>> (virtio-mem, gigantic pages, powernv memtrace). Especially, the latter
>>> two don't work with ZONE_MOVABLE/CMA. But, I mean, it would be fair
>>> enough to say "there are no guarantees for
>>> alloc_contig_range()/offline_pages() with ZONE_NORMAL, so we can break
>>> these use cases when a magic switch is flipped and make these pages
>>> non-migratable anymore".
>>>
>>> I assume compaction doesn't care about huge pages either way, not sure
>>> about numa balancing etc.
>>>
>>>
>>> However, note that there is a fundamental issue with any approach that
>>> allocates a significant amount of unmovable memory for user-space
>>> purposes (excluding CMA allocations for unmovable stuff, CMA is
>>> special): pairing it with ZONE_MOVABLE becomes very tricky as your user
>>> space might just end up eating all kernel memory, although the system
>>> still looks like there is plenty of free memory residing in
>>> ZONE_MOVABLE. I mentioned that in the context of secretmem in a reduced
>>> form as well.
>>>
>>> We theoretically have that issue with dynamic allocation of gigantic
>>> pages, but it's something a user explicitly/rarely triggers and it can
>>> be documented to cause problems well enough. We'll have the same issue
>>> with GUP+ZONE_MOVABLE that Pavel is fixing right now - but GUP is
>>> already known to be broken in various ways and that it has to be treated
>>> in a special way. I'd like to limit the nasty corner cases.
>>>
>>> Of course, we could have smart rules like "don't online memory to
>>> ZONE_MOVABLE automatically when the magic switch is active". That's just
>>> ugly, but could work.
>>>
>>
>> Extending on that, I just discovered that only x86-64, ppc64, and arm64
>> really support hugepage migration.
>>
>> Maybe one approach with the "magic switch" really would be to disable
>> hugepage migration completely in hugepage_migration_supported(), and
>> consequently making hugepage_movable_supported() always return false.
>>
>> Huge pages would never get placed onto ZONE_MOVABLE/CMA and cannot be
>> migrated. The problem I describe would apply (careful with using
>> ZONE_MOVABLE), but well, it can at least be documented.
> 
> Thanks for your explanation.
> 
> All thinking seems to be introduced by encountering OOM. :-(

Yes.  Or, I think about it as the problem of not being able to dissolve (free
to buddy) a hugetlb page.  We cannot dissolve it because we cannot allocate
vmemmap for all of the subpages.

> In order to move forward and free the hugepage. We should add some
> restrictions below.
> 
> 1. Only free the hugepage which is allocated from the ZONE_NORMAL.
Corrected: only vmemmap-optimize hugepages in ZONE_NORMAL.

> 2. Disable hugepage migration when this feature is enabled.

I am not sure if we want to fully disable migration.  I may be misunderstanding,
but the thought was to prevent migration between some movability types.  It
seems we should be able to migrate from ZONE_NORMAL to ZONE_NORMAL.

Also, if we do allow huge pages without vmemmap optimization in MOVABLE or CMA,
then we should allow those to be migrated to NORMAL?  Or is there a reason why
we should prevent that?

> 3. Using GFP_ATOMIC to allocate vmemmap pages firstly (it can reduce
>    memory fragmentation), if it fails, we use part of the hugepage to
>    remap.

I honestly am not sure about this.  This would only happen for pages in
NORMAL.  The only time using part of the huge page for vmemmap would help is
if we are trying to dissolve huge pages to free up memory for other uses.

> What's your opinion about this? Should we take this approach?

I think trying to solve all the issues that could happen as the result of
not being able to dissolve a hugetlb page has made this extremely complex.
I know this is something we need to address/solve.  We do not want to add
more unexpected behavior in corner cases.  However, I can not help but think
about similar issues today.  For example, if a huge page is in use in
ZONE_MOVABLE or CMA there is no guarantee that it can be migrated today.
Correct?  We may need to allocate another huge page for the target of the
migration, and there is no guarantee we can do that.
Muchun Song Jan. 29, 2021, 6:16 a.m. UTC | #19
On Fri, Jan 29, 2021 at 6:29 AM Oscar Salvador <osalvador@suse.de> wrote:
>
> On Wed, Jan 27, 2021 at 11:36:15AM +0100, David Hildenbrand wrote:
> > Extending on that, I just discovered that only x86-64, ppc64, and arm64
> > really support hugepage migration.
> >
> > Maybe one approach with the "magic switch" really would be to disable
> > hugepage migration completely in hugepage_migration_supported(), and
> > consequently making hugepage_movable_supported() always return false.
>
> Ok, so migration would not fork for these pages, and since them would
> lay in !ZONE_MOVABLE there is no guarantee we can unplug the memory.
> Well, we really cannot unplug it unless the hugepage is not used
> (it can be dissolved at least).
>
> Now to the allocation-when-freeing.
> Current implementation uses GFP_ATOMIC(or wants to use) + forever loop.
> One of the problems I see with GFP_ATOMIC is that gives you access
> to memory reserves, but there are more users using those reserves.
> Then, worst-scenario case we need to allocate 16MB order-0 pages
> to free up 1GB hugepage, so the question would be whether reserves
> really scale to 16MB + more users accessing reserves.
>
> As I said, if anything I would go for an optimistic allocation-try
> , if we fail just refuse to shrink the pool.
> User can always try to shrink it later again via /sys interface.

Yeah. It seems that this is the easiest way to move forward.

Thanks.

>
> Since hugepages would not be longer in ZONE_MOVABLE/CMA and are not
> expected to be migratable, is that ok?
>
> Using the hugepage for the vmemmap array was brought up several times,
> but that would imply fragmenting memory over time.
>
> All in all seems to be overly complicated (I might be wrong).
>
>
> > Huge pages would never get placed onto ZONE_MOVABLE/CMA and cannot be
> > migrated. The problem I describe would apply (careful with using
> > ZONE_MOVABLE), but well, it can at least be documented.
>
> I am not a page allocator expert but cannot the allocation fallback
> to ZONE_MOVABLE under memory shortage on other zones?
>
>
> --
> Oscar Salvador
> SUSE L3
Muchun Song Jan. 29, 2021, 6:56 a.m. UTC | #20
On Fri, Jan 29, 2021 at 9:04 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
> On 1/28/21 4:37 AM, Muchun Song wrote:
> > On Wed, Jan 27, 2021 at 6:36 PM David Hildenbrand <david@redhat.com> wrote:
> >>
> >> On 26.01.21 16:56, David Hildenbrand wrote:
> >>> On 26.01.21 16:34, Oscar Salvador wrote:
> >>>> On Tue, Jan 26, 2021 at 04:10:53PM +0100, David Hildenbrand wrote:
> >>>>> The real issue seems to be discarding the vmemmap on any memory that has
> >>>>> movability constraints - CMA and ZONE_MOVABLE; otherwise, as discussed, we
> >>>>> can reuse parts of the thingy we're freeing for the vmemmap. Not that it
> >>>>> would be ideal: that once-a-huge-page thing will never ever be a huge page
> >>>>> again - but if it helps with OOM in corner cases, sure.
> >>>>
> >>>> Yes, that is one way, but I am not sure how hard would it be to implement.
> >>>> Plus the fact that as you pointed out, once that memory is used for vmemmap
> >>>> array, we cannot use it again.
> >>>> Actually, we would fragment the memory eventually?
> >>>>
> >>>>> Possible simplification: don't perform the optimization for now with free
> >>>>> huge pages residing on ZONE_MOVABLE or CMA. Certainly not perfect: what
> >>>>> happens when migrating a huge page from ZONE_NORMAL to (ZONE_MOVABLE|CMA)?
> >>>>
> >>>> But if we do not allow theose pages to be in ZONE_MOVABLE or CMA, there is no
> >>>> point in migrate them, right?
> >>>
> >>> Well, memory unplug "could" still work and migrate them and
> >>> alloc_contig_range() "could in the future" still want to migrate them
> >>> (virtio-mem, gigantic pages, powernv memtrace). Especially, the latter
> >>> two don't work with ZONE_MOVABLE/CMA. But, I mean, it would be fair
> >>> enough to say "there are no guarantees for
> >>> alloc_contig_range()/offline_pages() with ZONE_NORMAL, so we can break
> >>> these use cases when a magic switch is flipped and make these pages
> >>> non-migratable anymore".
> >>>
> >>> I assume compaction doesn't care about huge pages either way, not sure
> >>> about numa balancing etc.
> >>>
> >>>
> >>> However, note that there is a fundamental issue with any approach that
> >>> allocates a significant amount of unmovable memory for user-space
> >>> purposes (excluding CMA allocations for unmovable stuff, CMA is
> >>> special): pairing it with ZONE_MOVABLE becomes very tricky as your user
> >>> space might just end up eating all kernel memory, although the system
> >>> still looks like there is plenty of free memory residing in
> >>> ZONE_MOVABLE. I mentioned that in the context of secretmem in a reduced
> >>> form as well.
> >>>
> >>> We theoretically have that issue with dynamic allocation of gigantic
> >>> pages, but it's something a user explicitly/rarely triggers and it can
> >>> be documented to cause problems well enough. We'll have the same issue
> >>> with GUP+ZONE_MOVABLE that Pavel is fixing right now - but GUP is
> >>> already known to be broken in various ways and that it has to be treated
> >>> in a special way. I'd like to limit the nasty corner cases.
> >>>
> >>> Of course, we could have smart rules like "don't online memory to
> >>> ZONE_MOVABLE automatically when the magic switch is active". That's just
> >>> ugly, but could work.
> >>>
> >>
> >> Extending on that, I just discovered that only x86-64, ppc64, and arm64
> >> really support hugepage migration.
> >>
> >> Maybe one approach with the "magic switch" really would be to disable
> >> hugepage migration completely in hugepage_migration_supported(), and
> >> consequently making hugepage_movable_supported() always return false.
> >>
> >> Huge pages would never get placed onto ZONE_MOVABLE/CMA and cannot be
> >> migrated. The problem I describe would apply (careful with using
> >> ZONE_MOVABLE), but well, it can at least be documented.
> >
> > Thanks for your explanation.
> >
> > All thinking seems to be introduced by encountering OOM. :-(
>
> Yes.  Or, I think about it as the problem of not being able to dissolve (free
> to buddy) a hugetlb page.  We can not dissolve because we can not allocate
> vmemmap for all sumpages.
>
> > In order to move forward and free the hugepage. We should add some
> > restrictions below.
> >
> > 1. Only free the hugepage which is allocated from the ZONE_NORMAL.
> Corrected: Only vmemmap optimize hugepages in ZONE_NORMAL
>
> > 2. Disable hugepage migration when this feature is enabled.
>
> I am not sure if we want to fully disable migration.  I may be misunderstanding
> but the thought was to prevent migration between some movability types.  It
> seems we should be able to migrate form ZONE_NORMAL to ZONE_NORMAL.
>
> Also, if we do allow huge pages without vmemmap optimization in MOVABLE or CMA
> then we should allow those to be migrated to NORMAL?  Or is there a reason why
> we should prevent that.
>
> > 3. Using GFP_ATOMIC to allocate vmemmap pages firstly (it can reduce
> >    memory fragmentation), if it fails, we use part of the hugepage to
> >    remap.
>
> I honestly am not sure about this.  This would only happen for pages in
> NORMAL.  The only time using part of the huge page for vmemmap would help is
> if we are trying to dissolve huge pages to free up memory for other uses.
>
> > What's your opinion about this? Should we take this approach?
>
> I think trying to solve all the issues that could happen as the result of
> not being able to dissolve a hugetlb page has made this extremely complex.
> I know this is something we need to address/solve.  We do not want to add
> more unexpected behavior in corner cases.  However, I can not help but think
> about similar issues today.  For example, if a huge page is in use in
> ZONE_MOVABLE or CMA there is no guarantee that it can be migrated today.
> Correct?  We may need to allocate another huge page for the target of the
> migration, and there is no guarantee we can do that.

Yeah. Adding more restrictions makes things more complex. As you
and Oscar said, refusing to free the hugepage when allocating
vmemmap pages fails may be the easy way for now.

> --
> Mike Kravetz
David Hildenbrand Feb. 1, 2021, 3:50 p.m. UTC | #21
On 28.01.21 23:29, Oscar Salvador wrote:
> On Wed, Jan 27, 2021 at 11:36:15AM +0100, David Hildenbrand wrote:
>> Extending on that, I just discovered that only x86-64, ppc64, and arm64
>> really support hugepage migration.
>>
>> Maybe one approach with the "magic switch" really would be to disable
>> hugepage migration completely in hugepage_migration_supported(), and
>> consequently making hugepage_movable_supported() always return false.
> 
> Ok, so migration would not fork for these pages, and since them would
> lay in !ZONE_MOVABLE there is no guarantee we can unplug the memory.
> Well, we really cannot unplug it unless the hugepage is not used
> (it can be dissolved at least).
> 
> Now to the allocation-when-freeing.
> Current implementation uses GFP_ATOMIC(or wants to use) + forever loop.
> One of the problems I see with GFP_ATOMIC is that gives you access
> to memory reserves, but there are more users using those reserves.
> Then, worst-scenario case we need to allocate 16MB order-0 pages
> to free up 1GB hugepage, so the question would be whether reserves
> really scale to 16MB + more users accessing reserves.
> 
> As I said, if anything I would go for an optimistic allocation-try
> , if we fail just refuse to shrink the pool.
> User can always try to shrink it later again via /sys interface.
> 
> Since hugepages would not be longer in ZONE_MOVABLE/CMA and are not
> expected to be migratable, is that ok?
> 
> Using the hugepage for the vmemmap array was brought up several times,
> but that would imply fragmenting memory over time.
> 
> All in all seems to be overly complicated (I might be wrong).
> 
> 
>> Huge pages would never get placed onto ZONE_MOVABLE/CMA and cannot be
>> migrated. The problem I describe would apply (careful with using
>> ZONE_MOVABLE), but well, it can at least be documented.
> 
> I am not a page allocator expert but cannot the allocation fallback
> to ZONE_MOVABLE under memory shortage on other zones?

No, for now it's not done. Only movable allocations target ZONE_MOVABLE. 
Doing so would be controversial: when would be the right point in time 
to start spilling unmovable allocations into CMA/ZONE_MOVABLE? You 
certainly want to try other things first (swapping, reclaim, 
compaction), before breaking any guarantees regarding 
hotunplug+migration/compaction you have with CMA/ZONE_MOVABLE. And even 
if you allowed it, your workload would already suffer badly.
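
To make "only movable allocations target ZONE_MOVABLE" concrete, a
deliberately simplified check; the real logic is the gfp_zone() /
GFP_ZONE_TABLE machinery in include/linux/gfp.h, which is more involved:

/* Simplified: an allocation may land in ZONE_MOVABLE only if it is
 * explicitly marked movable, e.g. GFP_HIGHUSER_MOVABLE. */
static inline bool may_use_zone_movable(gfp_t gfp_mask)
{
	return (gfp_mask & __GFP_MOVABLE) && (gfp_mask & __GFP_HIGHMEM);
}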

So it smells more like a setup issue. But then, who would know, when 
allocating huge pages (especially at runtime), about such side effects 
before actually running into them?

We can make sure that all relevant archs support migration of ordinary 
(!gigantic) huge pages (for now, only x86-64, ppc64/spapr, arm64), so we 
can place them onto ZONE_MOVABLE. It gets harder with more special cases.

Gigantic pages (without CMA) are more of a general issue, but at least 
it's simple to document ("Careful when pairing ZONE_MOVABLE with 
gigantic pages on !CMA").

An unexpected high amount of unmovable memory is just extremely 
difficult to handle with ZONE_MOVABLE; it's hard for the user/admin to 
figure out that such restrictions actually apply.
David Hildenbrand Feb. 1, 2021, 4:10 p.m. UTC | #22
>> What's your opinion about this? Should we take this approach?
> 
> I think trying to solve all the issues that could happen as the result of
> not being able to dissolve a hugetlb page has made this extremely complex.
> I know this is something we need to address/solve.  We do not want to add
> more unexpected behavior in corner cases.  However, I can not help but think
> about similar issues today.  For example, if a huge page is in use in
> ZONE_MOVABLE or CMA there is no guarantee that it can be migrated today.

Yes, hugetlbfs is broken with alloc_contig_range() as used by e.g. CMA, 
and needs fixing. Then similar problems apply as with hugetlbfs pages on 
ZONE_MOVABLE.


hugetlbfs pages on ZONE_MOVABLE are problematic for memory unplug only in 
corner cases, I think:

1. Not sufficient memory to allocate a destination page. Well, nothing 
we can really do about that - just like trying to migrate any other 
memory but running into -ENOMEM.

2. Trying to dissolve a free huge page but running into reservation 
limits. I think we should at least try allocating a new free huge page 
before failing. To be tackled in the future.

> Correct?  We may need to allocate another huge page for the target of the
> migration, and there is no guarantee we can do that.
> 

I agree that 1. is similar to "cannot migrate because OOM".


So thinking about it again, we don't actually seem to lose that much when

a) Rejecting migration of a huge page when not being able to allocate 
the vmemmap for our source page. Our system seems to be under quite some 
memory pressure already. Migration could just fail because we fail to 
allocate a migration target already.

b) Rejecting to dissolve a huge page when not able to allocate the 
vmemmap. Dissolving can fail already. And, again, our system seems to be 
under quite some memory pressure already.

c) Rejecting freeing huge pages when not able to allocate the vmemmap. I 
guess the "only" surprise is that the user might now no longer get what 
he asked for. This seems to be the "real change".


So maybe little actually speaks against allowing for migration of such 
huge pages and optimizing any huge page, besides rejecting freeing of 
huge pages and surprising the user/admin.

I guess while our system is under memory pressure CMA and ZONE_MOVABLE 
are already no longer able to always keep their guarantees - until there 
is no more memory pressure.
Mike Kravetz Feb. 2, 2021, 12:05 a.m. UTC | #23
On 2/1/21 8:10 AM, David Hildenbrand wrote:
>>> What's your opinion about this? Should we take this approach?
>>
>> I think trying to solve all the issues that could happen as the result of
>> not being able to dissolve a hugetlb page has made this extremely complex.
>> I know this is something we need to address/solve.  We do not want to add
>> more unexpected behavior in corner cases.  However, I can not help but think
>> about similar issues today.  For example, if a huge page is in use in
>> ZONE_MOVABLE or CMA there is no guarantee that it can be migrated today.
> 
> Yes, hugetlbfs is broken with alloc_contig_range() as e.g., used by CMA and needs fixing. Then, similar problems as with hugetlbfs pages on ZONE_MOVABLE apply.
> 
> 
> hugetlbfs pages on ZONE_MOVABLE for memory unplug are problematic in corner cases only I think:
> 
> 1. Not sufficient memory to allocate a destination page. Well, nothing we can really do about that - just like trying to migrate any other memory but running into -ENOMEM.
> 
> 2. Trying to dissolve a free huge page but running into reservation limits. I think we should at least try allocating a new free huge page before failing. To be tackled in the future.
> 
>> Correct?  We may need to allocate another huge page for the target of the
>> migration, and there is no guarantee we can do that.
>>
> 
> I agree that 1. is similar to "cannot migrate because OOM".
> 
> 
> So thinking about it again, we don't actually seem to lose that much when
> 
> a) Rejecting migration of a huge page when not being able to allocate the vmemmap for our source page. Our system seems to be under quite some memory pressure already. Migration could just fail because we fail to allocate a migration target already.
> 
> b) Rejecting to dissolve a huge page when not able to allocate the vmemmap. Dissolving can fail already. And, again, our system seems to be under quite some memory pressure already.
> 
> c) Rejecting freeing huge pages when not able to allocate the vmemmap. I guess the "only" surprise is that the user might now no longer get what he asked for. This seems to be the "real change".
> 
> So maybe little actually speaks against allowing for migration of such huge pages and optimizing any huge page, besides rejecting freeing of huge pages and surprising the user/admin.
> 
> I guess while our system is under memory pressure CMA and ZONE_MOVABLE are already no longer able to always keep their guarantees - until there is no more memory pressure.
> 

My thinking was similar.  Failing to dissolve a hugetlb page because we could
not allocate vmemmap pages is not much (if any) worse than what we do under
near-OOM conditions today.  As for surprising the user/admin, we should certainly
log a warning if we cannot dissolve a hugetlb page.

One point David R brought up is still a bit concerning.  When getting close
to OOM, there may be users/code that will try to dissolve free hugetlb pages
to give back as much memory as possible to buddy.  I've seen users holding
'big chunks' of memory for a specific purpose and dumping them when needed.
They were not doing this with hugetlb pages, but nothing would surprise me.

In this series, vmemmap freeing is 'opt in' at boot time.  I would expect
the use cases that want to opt in to rarely, if ever, free/dissolve hugetlb
pages.  But I could be wrong.
diff mbox series

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f928994ed273..16b55d13b0ab 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3007,6 +3007,8 @@  static inline void print_vma_addr(char *prefix, unsigned long rip)
 
 void vmemmap_remap_free(unsigned long start, unsigned long end,
 			unsigned long reuse);
+void vmemmap_remap_alloc(unsigned long start, unsigned long end,
+			 unsigned long reuse);
 
 void *sparse_buffer_alloc(unsigned long size);
 struct page * __populate_section_memmap(unsigned long pfn,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c165186ec2cf..d11c32fcdb38 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1326,6 +1326,8 @@  static void update_hpage_vmemmap_workfn(struct work_struct *work)
 		page->mapping = NULL;
 		h = page_hstate(page);
 
+		alloc_huge_page_vmemmap(h, page);
+
 		spin_lock(&hugetlb_lock);
 		__free_hugepage(h, page);
 		spin_unlock(&hugetlb_lock);
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 19f1898aaede..6108ae80314f 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -183,6 +183,21 @@  static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h)
 	return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT;
 }
 
+void alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+	unsigned long vmemmap_addr = (unsigned long)head;
+	unsigned long vmemmap_end, vmemmap_reuse;
+
+	if (!free_vmemmap_pages_per_hpage(h))
+		return;
+
+	vmemmap_addr += RESERVE_VMEMMAP_SIZE;
+	vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h);
+	vmemmap_reuse = vmemmap_addr - PAGE_SIZE;
+
+	vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse);
+}
+
 void free_huge_page_vmemmap(struct hstate *h, struct page *head)
 {
 	unsigned long vmemmap_addr = (unsigned long)head;
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index 01f8637adbe0..b2c8d2f11d48 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -11,6 +11,7 @@ 
 #include <linux/hugetlb.h>
 
 #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+void alloc_huge_page_vmemmap(struct hstate *h, struct page *head);
 void free_huge_page_vmemmap(struct hstate *h, struct page *head);
 
 /*
@@ -25,6 +26,10 @@  static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
 	return 0;
 }
 #else
+static inline void alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+}
+
 static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head)
 {
 }
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index ce4be1fa93c2..3b146d5949f3 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -29,6 +29,7 @@ 
 #include <linux/sched.h>
 #include <linux/pgtable.h>
 #include <linux/bootmem_info.h>
+#include <linux/delay.h>
 
 #include <asm/dma.h>
 #include <asm/pgalloc.h>
@@ -40,7 +41,8 @@ 
  * @remap_pte:		called for each non-empty PTE (lowest-level) entry.
  * @reuse_page:		the page which is reused for the tail vmemmap pages.
  * @reuse_addr:		the virtual address of the @reuse_page page.
- * @vmemmap_pages:	the list head of the vmemmap pages that can be freed.
+ * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
+ *			or is mapped from.
  */
 struct vmemmap_remap_walk {
 	void (*remap_pte)(pte_t *pte, unsigned long addr,
@@ -50,6 +52,10 @@  struct vmemmap_remap_walk {
 	struct list_head *vmemmap_pages;
 };
 
+/* The gfp mask of allocating vmemmap page */
+#define GFP_VMEMMAP_PAGE		\
+	(GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | __GFP_THISNODE)
+
 static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
 			      unsigned long end,
 			      struct vmemmap_remap_walk *walk)
@@ -228,6 +234,75 @@  void vmemmap_remap_free(unsigned long start, unsigned long end,
 	free_vmemmap_page_list(&vmemmap_pages);
 }
 
+static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
+				struct vmemmap_remap_walk *walk)
+{
+	pgprot_t pgprot = PAGE_KERNEL;
+	struct page *page;
+	void *to;
+
+	BUG_ON(pte_page(*pte) != walk->reuse_page);
+
+	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
+	list_del(&page->lru);
+	to = page_to_virt(page);
+	copy_page(to, (void *)walk->reuse_addr);
+
+	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
+}
+
+static void alloc_vmemmap_page_list(struct list_head *list,
+				    unsigned long start, unsigned long end)
+{
+	unsigned long addr;
+
+	for (addr = start; addr < end; addr += PAGE_SIZE) {
+		struct page *page;
+		int nid = page_to_nid((const void *)addr);
+
+retry:
+		page = alloc_pages_node(nid, GFP_VMEMMAP_PAGE, 0);
+		if (unlikely(!page)) {
+			msleep(100);
+			/*
+			 * We should retry infinitely, because we cannot
+			 * handle allocation failures. Once we allocate
+			 * vmemmap pages successfully, then we can free
+			 * a HugeTLB page.
+			 */
+			goto retry;
+		}
+		list_add_tail(&page->lru, list);
+	}
+}
+
+/**
+ * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end)
+ *			 to the page which is from the @vmemmap_pages
+ *			 respectively.
+ * @start:	start address of the vmemmap virtual address range.
+ * @end:	end address of the vmemmap virtual address range.
+ * @reuse:	reuse address.
+ */
+void vmemmap_remap_alloc(unsigned long start, unsigned long end,
+			 unsigned long reuse)
+{
+	LIST_HEAD(vmemmap_pages);
+	struct vmemmap_remap_walk walk = {
+		.remap_pte	= vmemmap_restore_pte,
+		.reuse_addr	= reuse,
+		.vmemmap_pages	= &vmemmap_pages,
+	};
+
+	might_sleep();
+
+	/* See the comment in the vmemmap_remap_free(). */
+	BUG_ON(start - reuse != PAGE_SIZE);
+
+	alloc_vmemmap_page_list(&vmemmap_pages, start, end);
+	vmemmap_remap_range(reuse, end, &walk);
+}
+
 /*
  * Allocate a block of memory to be used to back the virtual memory map
  * or to back the page tables that are used to create the mapping.