
[v14,4/8] mm: hugetlb: alloc the vmemmap pages associated with each HugeTLB page

Message ID 20210204035043.36609-5-songmuchun@bytedance.com (mailing list archive)
State New, archived
Series Free some vmemmap pages of HugeTLB page

Commit Message

Muchun Song Feb. 4, 2021, 3:50 a.m. UTC
When we free a HugeTLB page to the buddy allocator, we should allocate the
vmemmap pages associated with it. But we may not be able to allocate the
vmemmap pages when the system is under memory pressure. In this case, we just
refuse to free the HugeTLB page instead of looping forever trying to allocate
the pages.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 include/linux/mm.h   |  2 ++
 mm/hugetlb.c         | 19 ++++++++++++-
 mm/hugetlb_vmemmap.c | 30 +++++++++++++++++++++
 mm/hugetlb_vmemmap.h |  8 ++++++
 mm/sparse-vmemmap.c  | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 5 files changed, 132 insertions(+), 2 deletions(-)

Comments

Muchun Song Feb. 5, 2021, 9:29 a.m. UTC | #1
On Thu, Feb 4, 2021 at 11:54 AM Muchun Song <songmuchun@bytedance.com> wrote:
>
> When we free a HugeTLB page to the buddy allocator, we should allocate the
> vmemmap pages associated with it. But we may not be able to allocate the
> vmemmap pages when the system is under memory pressure. In this case, we just
> refuse to free the HugeTLB page instead of looping forever trying to allocate
> the pages.
>
> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> ---
>  include/linux/mm.h   |  2 ++
>  mm/hugetlb.c         | 19 ++++++++++++-
>  mm/hugetlb_vmemmap.c | 30 +++++++++++++++++++++
>  mm/hugetlb_vmemmap.h |  8 ++++++
>  mm/sparse-vmemmap.c  | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>  5 files changed, 132 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index d7dddf334779..33c5911afe18 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2981,6 +2981,8 @@ static inline void print_vma_addr(char *prefix, unsigned long rip)
>
>  void vmemmap_remap_free(unsigned long start, unsigned long end,
>                         unsigned long reuse);
> +int vmemmap_remap_alloc(unsigned long start, unsigned long end,
> +                       unsigned long reuse, gfp_t gfp_mask);
>
>  void *sparse_buffer_alloc(unsigned long size);
>  struct page * __populate_section_memmap(unsigned long pfn,
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 4cfca27c6d32..5518283aa667 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1397,16 +1397,26 @@ static void __free_huge_page(struct page *page)
>                 h->resv_huge_pages++;
>
>         if (HPageTemporary(page)) {
> -               list_del(&page->lru);
>                 ClearHPageTemporary(page);
> +
> +               if (alloc_huge_page_vmemmap(h, page, GFP_ATOMIC)) {
> +                       h->surplus_huge_pages++;
> +                       h->surplus_huge_pages_node[nid]++;
> +                       goto enqueue;
> +               }
> +               list_del(&page->lru);
>                 update_and_free_page(h, page);
>         } else if (h->surplus_huge_pages_node[nid]) {
> +               if (alloc_huge_page_vmemmap(h, page, GFP_ATOMIC))
> +                       goto enqueue;
> +
>                 /* remove the page from active list */
>                 list_del(&page->lru);
>                 update_and_free_page(h, page);
>                 h->surplus_huge_pages--;
>                 h->surplus_huge_pages_node[nid]--;
>         } else {
> +enqueue:
>                 arch_clear_hugepage_flags(page);
>                 enqueue_huge_page(h, page);
>         }
> @@ -1693,6 +1703,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
>                         struct page *page =
>                                 list_entry(h->hugepage_freelists[node].next,
>                                           struct page, lru);
> +
> +                       if (alloc_huge_page_vmemmap(h, page, GFP_ATOMIC))
> +                               break;
> +
>                         list_del(&page->lru);
>                         h->free_huge_pages--;
>                         h->free_huge_pages_node[node]--;
> @@ -1760,6 +1774,9 @@ int dissolve_free_huge_page(struct page *page)
>                         goto retry;
>                 }
>
> +               if (alloc_huge_page_vmemmap(h, head, GFP_ATOMIC))
> +                       goto out;
> +

Hi Oscar,

Because we allocate the vmemmap pages and do the remapping before
setting PG_hwpoison on the tail struct page, we no longer need the
following patch.

[1] https://patchwork.kernel.org/project/linux-mm/patch/20210117151053.24600-7-songmuchun@bytedance.com/
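
To make the ordering explicit, here is a rough sketch of the resulting flow in
dissolve_free_huge_page() once this patch is applied (a sketch only, not the
verbatim kernel code; the PageHWPoison handling is the existing code quoted
below):

        if (alloc_huge_page_vmemmap(h, head, GFP_ATOMIC))
                goto out;       /* cannot restore the vmemmap, keep the huge page */

        /*
         * By the time the PageHWPoison flag is moved from the head page to
         * the raw error page, the tail struct pages are backed by writable
         * vmemmap pages again, so SetPageHWPoison() writes to a real page.
         */
        if (PageHWPoison(head) && page != head) {
                SetPageHWPoison(page);
                ClearPageHWPoison(head);
        }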

>                 /*
>                  * Move PageHWPoison flag from head page to the raw error page,
>                  * which makes any subpages rather than the error page reusable.
> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> index ddd872ab6180..0bd6b8d7282d 100644
> --- a/mm/hugetlb_vmemmap.c
> +++ b/mm/hugetlb_vmemmap.c
> @@ -169,6 +169,8 @@
>   * (last) level. So this type of HugeTLB page can be optimized only when its
>   * size of the struct page structs is greater than 2 pages.
>   */
> +#define pr_fmt(fmt)    "HugeTLB: " fmt
> +
>  #include "hugetlb_vmemmap.h"
>
>  /*
> @@ -198,6 +200,34 @@ static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h)
>         return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT;
>  }
>
> +int alloc_huge_page_vmemmap(struct hstate *h, struct page *head, gfp_t gfp_mask)
> +{
> +       int ret;
> +       unsigned long vmemmap_addr = (unsigned long)head;
> +       unsigned long vmemmap_end, vmemmap_reuse;
> +
> +       if (!free_vmemmap_pages_per_hpage(h))
> +               return 0;
> +
> +       vmemmap_addr += RESERVE_VMEMMAP_SIZE;
> +       vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h);
> +       vmemmap_reuse = vmemmap_addr - PAGE_SIZE;
> +
> +       /*
> +        * The pages which the vmemmap virtual address range [@vmemmap_addr,
> +        * @vmemmap_end) are mapped to are freed to the buddy allocator, and
> +        * the range is mapped to the page which @vmemmap_reuse is mapped to.
> +        * When a HugeTLB page is freed to the buddy allocator, previously
> +        * discarded vmemmap pages must be allocated and remapped.
> +        */
> +       ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse,
> +                                 gfp_mask | __GFP_NOWARN | __GFP_THISNODE);
> +       if (ret == -ENOMEM)
> +               pr_info("cannot alloc vmemmap pages\n");
> +
> +       return ret;
> +}
> +
>  void free_huge_page_vmemmap(struct hstate *h, struct page *head)
>  {
>         unsigned long vmemmap_addr = (unsigned long)head;
> diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
> index 6923f03534d5..6f89a9eed02c 100644
> --- a/mm/hugetlb_vmemmap.h
> +++ b/mm/hugetlb_vmemmap.h
> @@ -11,8 +11,16 @@
>  #include <linux/hugetlb.h>
>
>  #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
> +int alloc_huge_page_vmemmap(struct hstate *h, struct page *head,
> +                           gfp_t gfp_mask);
>  void free_huge_page_vmemmap(struct hstate *h, struct page *head);
>  #else
> +static inline int alloc_huge_page_vmemmap(struct hstate *h, struct page *head,
> +                                         gfp_t gfp_mask)
> +{
> +       return 0;
> +}
> +
>  static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head)
>  {
>  }
> diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
> index 50c1dc00b686..277eb43aebd5 100644
> --- a/mm/sparse-vmemmap.c
> +++ b/mm/sparse-vmemmap.c
> @@ -40,7 +40,8 @@
>   * @remap_pte:         called for each non-empty PTE (lowest-level) entry.
>   * @reuse_page:                the page which is reused for the tail vmemmap pages.
>   * @reuse_addr:                the virtual address of the @reuse_page page.
> - * @vmemmap_pages:     the list head of the vmemmap pages that can be freed.
> + * @vmemmap_pages:     the list head of the vmemmap pages that can be freed
> + *                     or that the range will be remapped to.
>   */
>  struct vmemmap_remap_walk {
>         void (*remap_pte)(pte_t *pte, unsigned long addr,
> @@ -237,6 +238,78 @@ void vmemmap_remap_free(unsigned long start, unsigned long end,
>         free_vmemmap_page_list(&vmemmap_pages);
>  }
>
> +static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
> +                               struct vmemmap_remap_walk *walk)
> +{
> +       pgprot_t pgprot = PAGE_KERNEL;
> +       struct page *page;
> +       void *to;
> +
> +       BUG_ON(pte_page(*pte) != walk->reuse_page);
> +
> +       page = list_first_entry(walk->vmemmap_pages, struct page, lru);
> +       list_del(&page->lru);
> +       to = page_to_virt(page);
> +       copy_page(to, (void *)walk->reuse_addr);
> +
> +       set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
> +}
> +
> +static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
> +                                  gfp_t gfp_mask, struct list_head *list)
> +{
> +       unsigned long addr;
> +       int nid = page_to_nid((const void *)start);
> +       struct page *page, *next;
> +
> +       for (addr = start; addr < end; addr += PAGE_SIZE) {
> +               page = alloc_pages_node(nid, gfp_mask, 0);
> +               if (!page)
> +                       goto out;
> +               list_add_tail(&page->lru, list);
> +       }
> +
> +       return 0;
> +out:
> +       list_for_each_entry_safe(page, next, list, lru)
> +               __free_pages(page, 0);
> +       return -ENOMEM;
> +}
> +
> +/**
> + * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end)
> + *                      to the pages which are taken from @vmemmap_pages.
> + * @start:     start address of the vmemmap virtual address range that we want
> + *             to remap.
> + * @end:       end address of the vmemmap virtual address range that we want to
> + *             remap.
> + * @reuse:     reuse address.
> + * @gfp_mask:  GFP flags for allocating vmemmap pages.
> + */
> +int vmemmap_remap_alloc(unsigned long start, unsigned long end,
> +                       unsigned long reuse, gfp_t gfp_mask)
> +{
> +       LIST_HEAD(vmemmap_pages);
> +       struct vmemmap_remap_walk walk = {
> +               .remap_pte      = vmemmap_restore_pte,
> +               .reuse_addr     = reuse,
> +               .vmemmap_pages  = &vmemmap_pages,
> +       };
> +
> +       /* See the comment in the vmemmap_remap_free(). */
> +       BUG_ON(start - reuse != PAGE_SIZE);
> +
> +       might_sleep_if(gfpflags_allow_blocking(gfp_mask));
> +
> +       if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
> +               return -ENOMEM;
> +
> +       vmemmap_remap_range(reuse, end, &walk);
> +
> +       return 0;
> +}
> +
>  /*
>   * Allocate a block of memory to be used to back the virtual memory map
>   * or to back the page tables that are used to create the mapping.
> --
> 2.11.0
>
Oscar Salvador Feb. 5, 2021, 11:54 a.m. UTC | #2
On Thu, Feb 04, 2021 at 11:50:39AM +0800, Muchun Song wrote:
> When we free a HugeTLB page to the buddy allocator, we should allocate the
> vmemmap pages associated with it. But we may not be able to allocate the
> vmemmap pages when the system is under memory pressure. In this case, we just
> refuse to free the HugeTLB page instead of looping forever trying to allocate
> the pages.
> 
> Signed-off-by: Muchun Song <songmuchun@bytedance.com>

[...]

> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 4cfca27c6d32..5518283aa667 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1397,16 +1397,26 @@ static void __free_huge_page(struct page *page)
>  		h->resv_huge_pages++;
>  
>  	if (HPageTemporary(page)) {
> -		list_del(&page->lru);
>  		ClearHPageTemporary(page);
> +
> +		if (alloc_huge_page_vmemmap(h, page, GFP_ATOMIC)) {
> +			h->surplus_huge_pages++;
> +			h->surplus_huge_pages_node[nid]++;
> +			goto enqueue;
> +		}
> +		list_del(&page->lru);
>  		update_and_free_page(h, page);
>  	} else if (h->surplus_huge_pages_node[nid]) {
> +		if (alloc_huge_page_vmemmap(h, page, GFP_ATOMIC))
> +			goto enqueue;
> +
>  		/* remove the page from active list */
>  		list_del(&page->lru);
>  		update_and_free_page(h, page);
>  		h->surplus_huge_pages--;
>  		h->surplus_huge_pages_node[nid]--;
>  	} else {
> +enqueue:
>  		arch_clear_hugepage_flags(page);
>  		enqueue_huge_page(h, page);

Ok, we just keep them in the pool in case we fail to allocate.


> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> index ddd872ab6180..0bd6b8d7282d 100644
> --- a/mm/hugetlb_vmemmap.c
> +++ b/mm/hugetlb_vmemmap.c
> @@ -169,6 +169,8 @@
>   * (last) level. So this type of HugeTLB page can be optimized only when its
>   * size of the struct page structs is greater than 2 pages.

[...]

> +int alloc_huge_page_vmemmap(struct hstate *h, struct page *head, gfp_t gfp_mask)
> +{
> +	int ret;
> +	unsigned long vmemmap_addr = (unsigned long)head;
> +	unsigned long vmemmap_end, vmemmap_reuse;
> +
> +	if (!free_vmemmap_pages_per_hpage(h))
> +		return 0;
> +
> +	vmemmap_addr += RESERVE_VMEMMAP_SIZE;
> +	vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h);
> +	vmemmap_reuse = vmemmap_addr - PAGE_SIZE;
> +
> +	/*
> +	 * The pages which the vmemmap virtual address range [@vmemmap_addr,
> +	 * @vmemmap_end) are mapped to are freed to the buddy allocator, and
> +	 * the range is mapped to the page which @vmemmap_reuse is mapped to.
> +	 * When a HugeTLB page is freed to the buddy allocator, previously
> +	 * discarded vmemmap pages must be allocated and remapped.
> +	 */
> +	ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse,
> +				  gfp_mask | __GFP_NOWARN | __GFP_THISNODE);

Why don't you set all the GFP flags here?

vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse, GFP_ATOMIC|
                    __GFP_NOWARN | __GFP_THISNODE) ?

> diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
> index 50c1dc00b686..277eb43aebd5 100644
> --- a/mm/sparse-vmemmap.c
> +++ b/mm/sparse-vmemmap.c

[...]

> +static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
> +				   gfp_t gfp_mask, struct list_head *list)

I think it would make more sense for this function to get the nid and the
nr_pages to allocate directly.

> +{
> +	unsigned long addr;
> +	int nid = page_to_nid((const void *)start);

Uh, that void cast is a bit ugly: page_to_nid((struct page *)start).
No need for the const either.

> +	struct page *page, *next;
> +
> +	for (addr = start; addr < end; addr += PAGE_SIZE) {
> +		page = alloc_pages_node(nid, gfp_mask, 0);
> +		if (!page)
> +			goto out;
> +		list_add_tail(&page->lru, list);
> +	}

and replace this by while(--nr_pages) etc.
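
As a rough sketch only (the signature and exact shape below are illustrative,
not part of the posted patch), the helper could take the nid and the number of
pages directly:

static int alloc_vmemmap_page_list(int nid, unsigned long nr_pages,
                                   gfp_t gfp_mask, struct list_head *list)
{
        struct page *page, *next;

        /* Allocate nr_pages order-0 pages on @nid and collect them on @list. */
        while (nr_pages--) {
                page = alloc_pages_node(nid, gfp_mask, 0);
                if (!page)
                        goto out;
                list_add_tail(&page->lru, list);
        }

        return 0;
out:
        /* Roll back: free whatever has been allocated so far. */
        list_for_each_entry_safe(page, next, list, lru)
                __free_pages(page, 0);
        return -ENOMEM;
}

The caller in vmemmap_remap_alloc() would then compute the node itself and
pass (end - start) >> PAGE_SHIFT as the page count.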

I did not really go in depth, but it looks good to me, and it is much simpler
overall.

The only thing I am not sure about is the use of GFP_ATOMIC.
It has been raised before that when we are close to OOM, the user might want
to try to free up some memory by dissolving free_huge_pages, and so we might
want to dip into the reserves.

Given the fact that we are prepared to fail, and that we do not retry, I would
rather use GFP_KERNEL than have X pages atomically allocated and then realize
we need to drop them on the ground because we cannot go further at some point.
I think those reserves would be better off used by someone else in that
situation.

But these are just my thoughts, and there seems to be a consensus on using
GFP_ATOMIC.
Muchun Song Feb. 6, 2021, 8:01 a.m. UTC | #3
On Fri, Feb 5, 2021 at 7:54 PM Oscar Salvador <osalvador@suse.de> wrote:
>
> On Thu, Feb 04, 2021 at 11:50:39AM +0800, Muchun Song wrote:
> > When we free a HugeTLB page to the buddy allocator, we should allocate the
> > vmemmap pages associated with it. But we may not be able to allocate the
> > vmemmap pages when the system is under memory pressure. In this case, we just
> > refuse to free the HugeTLB page instead of looping forever trying to allocate
> > the pages.
> >
> > Signed-off-by: Muchun Song <songmuchun@bytedance.com>
>
> [...]
>
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 4cfca27c6d32..5518283aa667 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -1397,16 +1397,26 @@ static void __free_huge_page(struct page *page)
> >               h->resv_huge_pages++;
> >
> >       if (HPageTemporary(page)) {
> > -             list_del(&page->lru);
> >               ClearHPageTemporary(page);
> > +
> > +             if (alloc_huge_page_vmemmap(h, page, GFP_ATOMIC)) {
> > +                     h->surplus_huge_pages++;
> > +                     h->surplus_huge_pages_node[nid]++;
> > +                     goto enqueue;
> > +             }
> > +             list_del(&page->lru);
> >               update_and_free_page(h, page);
> >       } else if (h->surplus_huge_pages_node[nid]) {
> > +             if (alloc_huge_page_vmemmap(h, page, GFP_ATOMIC))
> > +                     goto enqueue;
> > +
> >               /* remove the page from active list */
> >               list_del(&page->lru);
> >               update_and_free_page(h, page);
> >               h->surplus_huge_pages--;
> >               h->surplus_huge_pages_node[nid]--;
> >       } else {
> > +enqueue:
> >               arch_clear_hugepage_flags(page);
> >               enqueue_huge_page(h, page);
>
> Ok, we just keep them in the pool in case we fail to allocate.
>
>
> > diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> > index ddd872ab6180..0bd6b8d7282d 100644
> > --- a/mm/hugetlb_vmemmap.c
> > +++ b/mm/hugetlb_vmemmap.c
> > @@ -169,6 +169,8 @@
> >   * (last) level. So this type of HugeTLB page can be optimized only when its
> >   * size of the struct page structs is greater than 2 pages.
>
> [...]
>
> > +int alloc_huge_page_vmemmap(struct hstate *h, struct page *head, gfp_t gfp_mask)
> > +{
> > +     int ret;
> > +     unsigned long vmemmap_addr = (unsigned long)head;
> > +     unsigned long vmemmap_end, vmemmap_reuse;
> > +
> > +     if (!free_vmemmap_pages_per_hpage(h))
> > +             return 0;
> > +
> > +     vmemmap_addr += RESERVE_VMEMMAP_SIZE;
> > +     vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h);
> > +     vmemmap_reuse = vmemmap_addr - PAGE_SIZE;
> > +
> > +     /*
> > +      * The pages which the vmemmap virtual address range [@vmemmap_addr,
> > +      * @vmemmap_end) are mapped to are freed to the buddy allocator, and
> > +      * the range is mapped to the page which @vmemmap_reuse is mapped to.
> > +      * When a HugeTLB page is freed to the buddy allocator, previously
> > +      * discarded vmemmap pages must be allocated and remapped.
> > +      */
> > +     ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse,
> > +                               gfp_mask | __GFP_NOWARN | __GFP_THISNODE);
>
> Why don't you set all the GFP flags here?

Originally, I wanted the caller to specify the GFP flags it uses. But
setting all the GFP flags here also makes sense to me. Then we can
remove the @gfp_mask parameter of alloc_huge_page_vmemmap. It is
simpler.

>
> vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse, GFP_ATOMIC|
>                     __GFP_NOWARN | __GFP_THISNODE) ?

I will use this.
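
For illustration only (this is not what the posted patch does), dropping the
@gfp_mask parameter as discussed would make the helper look roughly like:

int alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
{
        int ret;
        unsigned long vmemmap_addr = (unsigned long)head;
        unsigned long vmemmap_end, vmemmap_reuse;

        if (!free_vmemmap_pages_per_hpage(h))
                return 0;

        vmemmap_addr += RESERVE_VMEMMAP_SIZE;
        vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h);
        vmemmap_reuse = vmemmap_addr - PAGE_SIZE;

        /* The GFP flags are fixed here instead of being passed in by callers. */
        ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse,
                                  GFP_ATOMIC | __GFP_NOWARN | __GFP_THISNODE);
        if (ret == -ENOMEM)
                pr_info("cannot alloc vmemmap pages\n");

        return ret;
}

and the callers in hugetlb.c would then simply call alloc_huge_page_vmemmap(h, page).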

>
> > diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
> > index 50c1dc00b686..277eb43aebd5 100644
> > --- a/mm/sparse-vmemmap.c
> > +++ b/mm/sparse-vmemmap.c
>
> [...]
>
> > +static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
> > +                                gfp_t gfp_mask, struct list_head *list)
>
> I think it would make more sense for this function to get the nid and the
> nr_pages to allocate directly.

Just like alloc_pages(), right? If so, that makes sense to me.

>
> > +{
> > +     unsigned long addr;
> > +     int nid = page_to_nid((const void *)start);
>
> Uh, that void cast is a bit ugly: page_to_nid((struct page *)start).
> No need for the const either.

OK. Will do. Thanks.

>
> > +     struct page *page, *next;
> > +
> > +     for (addr = start; addr < end; addr += PAGE_SIZE) {
> > +             page = alloc_pages_node(nid, gfp_mask, 0);
> > +             if (!page)
> > +                     goto out;
> > +             list_add_tail(&page->lru, list);
> > +     }
>
> and replace this by while(--nr_pages) etc.

OK. Will do.

>
> I did not really go in depth, but it looks good to me, and it is much simpler
> overall.

Yeah. The series only has 8 patches now. It is simpler.

>
> The only thing I am not sure about is the use of GFP_ATOMIC.
> It has been raised before that when we are close to OOM, the user might want
> to try to free up some memory by dissolving free_huge_pages, and so we might
> want to dip into the reserves.
>
> Given the fact that we are prepared to fail, and that we do not retry, I would
> rather use GFP_KERNEL than have X pages atomically allocated and then realize
> we need to drop them on the ground because we cannot go further at some point.
> I think those reserves would be better off used by someone else in that
> situation.
>
> But these are just my thoughts, and there seems to be a consensus on using
> GFP_ATOMIC.
>
> --
> Oscar Salvador
> SUSE L3

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d7dddf334779..33c5911afe18 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2981,6 +2981,8 @@  static inline void print_vma_addr(char *prefix, unsigned long rip)
 
 void vmemmap_remap_free(unsigned long start, unsigned long end,
 			unsigned long reuse);
+int vmemmap_remap_alloc(unsigned long start, unsigned long end,
+			unsigned long reuse, gfp_t gfp_mask);
 
 void *sparse_buffer_alloc(unsigned long size);
 struct page * __populate_section_memmap(unsigned long pfn,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4cfca27c6d32..5518283aa667 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1397,16 +1397,26 @@  static void __free_huge_page(struct page *page)
 		h->resv_huge_pages++;
 
 	if (HPageTemporary(page)) {
-		list_del(&page->lru);
 		ClearHPageTemporary(page);
+
+		if (alloc_huge_page_vmemmap(h, page, GFP_ATOMIC)) {
+			h->surplus_huge_pages++;
+			h->surplus_huge_pages_node[nid]++;
+			goto enqueue;
+		}
+		list_del(&page->lru);
 		update_and_free_page(h, page);
 	} else if (h->surplus_huge_pages_node[nid]) {
+		if (alloc_huge_page_vmemmap(h, page, GFP_ATOMIC))
+			goto enqueue;
+
 		/* remove the page from active list */
 		list_del(&page->lru);
 		update_and_free_page(h, page);
 		h->surplus_huge_pages--;
 		h->surplus_huge_pages_node[nid]--;
 	} else {
+enqueue:
 		arch_clear_hugepage_flags(page);
 		enqueue_huge_page(h, page);
 	}
@@ -1693,6 +1703,10 @@  static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
 			struct page *page =
 				list_entry(h->hugepage_freelists[node].next,
 					  struct page, lru);
+
+			if (alloc_huge_page_vmemmap(h, page, GFP_ATOMIC))
+				break;
+
 			list_del(&page->lru);
 			h->free_huge_pages--;
 			h->free_huge_pages_node[node]--;
@@ -1760,6 +1774,9 @@  int dissolve_free_huge_page(struct page *page)
 			goto retry;
 		}
 
+		if (alloc_huge_page_vmemmap(h, head, GFP_ATOMIC))
+			goto out;
+
 		/*
 		 * Move PageHWPoison flag from head page to the raw error page,
 		 * which makes any subpages rather than the error page reusable.
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index ddd872ab6180..0bd6b8d7282d 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -169,6 +169,8 @@ 
  * (last) level. So this type of HugeTLB page can be optimized only when its
  * size of the struct page structs is greater than 2 pages.
  */
+#define pr_fmt(fmt)	"HugeTLB: " fmt
+
 #include "hugetlb_vmemmap.h"
 
 /*
@@ -198,6 +200,34 @@  static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h)
 	return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT;
 }
 
+int alloc_huge_page_vmemmap(struct hstate *h, struct page *head, gfp_t gfp_mask)
+{
+	int ret;
+	unsigned long vmemmap_addr = (unsigned long)head;
+	unsigned long vmemmap_end, vmemmap_reuse;
+
+	if (!free_vmemmap_pages_per_hpage(h))
+		return 0;
+
+	vmemmap_addr += RESERVE_VMEMMAP_SIZE;
+	vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h);
+	vmemmap_reuse = vmemmap_addr - PAGE_SIZE;
+
+	/*
+	 * The pages which the vmemmap virtual address range [@vmemmap_addr,
+	 * @vmemmap_end) are mapped to are freed to the buddy allocator, and
+	 * the range is mapped to the page which @vmemmap_reuse is mapped to.
+	 * When a HugeTLB page is freed to the buddy allocator, previously
+	 * discarded vmemmap pages must be allocated and remapped.
+	 */
+	ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse,
+				  gfp_mask | __GFP_NOWARN | __GFP_THISNODE);
+	if (ret == -ENOMEM)
+		pr_info("cannot alloc vmemmap pages\n");
+
+	return ret;
+}
+
 void free_huge_page_vmemmap(struct hstate *h, struct page *head)
 {
 	unsigned long vmemmap_addr = (unsigned long)head;
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index 6923f03534d5..6f89a9eed02c 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -11,8 +11,16 @@ 
 #include <linux/hugetlb.h>
 
 #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+int alloc_huge_page_vmemmap(struct hstate *h, struct page *head,
+			    gfp_t gfp_mask);
 void free_huge_page_vmemmap(struct hstate *h, struct page *head);
 #else
+static inline int alloc_huge_page_vmemmap(struct hstate *h, struct page *head,
+					  gfp_t gfp_mask)
+{
+	return 0;
+}
+
 static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head)
 {
 }
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 50c1dc00b686..277eb43aebd5 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -40,7 +40,8 @@ 
  * @remap_pte:		called for each non-empty PTE (lowest-level) entry.
  * @reuse_page:		the page which is reused for the tail vmemmap pages.
  * @reuse_addr:		the virtual address of the @reuse_page page.
- * @vmemmap_pages:	the list head of the vmemmap pages that can be freed.
+ * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
+ *			or that the range will be remapped to.
  */
 struct vmemmap_remap_walk {
 	void (*remap_pte)(pte_t *pte, unsigned long addr,
@@ -237,6 +238,78 @@  void vmemmap_remap_free(unsigned long start, unsigned long end,
 	free_vmemmap_page_list(&vmemmap_pages);
 }
 
+static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
+				struct vmemmap_remap_walk *walk)
+{
+	pgprot_t pgprot = PAGE_KERNEL;
+	struct page *page;
+	void *to;
+
+	BUG_ON(pte_page(*pte) != walk->reuse_page);
+
+	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
+	list_del(&page->lru);
+	to = page_to_virt(page);
+	copy_page(to, (void *)walk->reuse_addr);
+
+	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
+}
+
+static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
+				   gfp_t gfp_mask, struct list_head *list)
+{
+	unsigned long addr;
+	int nid = page_to_nid((const void *)start);
+	struct page *page, *next;
+
+	for (addr = start; addr < end; addr += PAGE_SIZE) {
+		page = alloc_pages_node(nid, gfp_mask, 0);
+		if (!page)
+			goto out;
+		list_add_tail(&page->lru, list);
+	}
+
+	return 0;
+out:
+	list_for_each_entry_safe(page, next, list, lru)
+		__free_pages(page, 0);
+	return -ENOMEM;
+}
+
+/**
+ * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end)
+ *			 to the pages which are taken from @vmemmap_pages.
+ * @start:	start address of the vmemmap virtual address range that we want
+ *		to remap.
+ * @end:	end address of the vmemmap virtual address range that we want to
+ *		remap.
+ * @reuse:	reuse address.
+ * @gfp_mask:	GFP flags for allocating vmemmap pages.
+ */
+int vmemmap_remap_alloc(unsigned long start, unsigned long end,
+			unsigned long reuse, gfp_t gfp_mask)
+{
+	LIST_HEAD(vmemmap_pages);
+	struct vmemmap_remap_walk walk = {
+		.remap_pte	= vmemmap_restore_pte,
+		.reuse_addr	= reuse,
+		.vmemmap_pages	= &vmemmap_pages,
+	};
+
+	/* See the comment in the vmemmap_remap_free(). */
+	BUG_ON(start - reuse != PAGE_SIZE);
+
+	might_sleep_if(gfpflags_allow_blocking(gfp_mask));
+
+	if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
+		return -ENOMEM;
+
+	vmemmap_remap_range(reuse, end, &walk);
+
+	return 0;
+}
+
 /*
  * Allocate a block of memory to be used to back the virtual memory map
  * or to back the page tables that are used to create the mapping.