[v2,07/19] mm/hugetlb: Free the vmemmap pages associated with each hugetlb page

Message ID 20201026145114.59424-8-songmuchun@bytedance.com (mailing list archive)
State New, archived
Series Free some vmemmap pages of hugetlb page

Commit Message

Muchun Song Oct. 26, 2020, 2:51 p.m. UTC
When we allocate a hugetlb page from the buddy, we should free the
unused vmemmap pages associated with it. We can do that in the
prep_new_huge_page().

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 arch/x86/include/asm/hugetlb.h          |   7 +
 arch/x86/include/asm/pgtable_64_types.h |   8 +
 include/linux/hugetlb.h                 |   7 +
 mm/hugetlb.c                            | 190 ++++++++++++++++++++++++
 4 files changed, 212 insertions(+)

Comments

Matthew Wilcox (Oracle) Oct. 26, 2020, 4:01 p.m. UTC | #1
On Mon, Oct 26, 2020 at 10:51:02PM +0800, Muchun Song wrote:
> +static void split_vmemmap_pmd(pmd_t *pmd, pte_t *pte_p, unsigned long addr)
> +{
> +	struct mm_struct *mm = &init_mm;
> +	struct page *page;
> +	pmd_t old_pmd, _pmd;
> +	int i;
> +
> +	old_pmd = READ_ONCE(*pmd);
> +	page = pmd_page(old_pmd);
> +	pmd_populate_kernel(mm, &_pmd, pte_p);
> +
> +	for (i = 0; i < VMEMMAP_HPAGE_NR; i++, addr += PAGE_SIZE) {
> +		pte_t entry, *pte;
> +
> +		entry = mk_pte(page + i, PAGE_KERNEL);

I'd be happier if that were:

	pgprot_t pgprot = PAGE_KERNEL;
...
	for (i = 0; i < VMEMMAP_HPAGE_NR; i++, addr += PAGE_SIZE) {
		pte_t entry, *pte;

		entry = mk_pte(page + i, pgprot);
		pgprot = PAGE_KERNEL_RO;

so that all subsequent tail pages are mapped read-only.
Muchun Song Oct. 27, 2020, 2:58 a.m. UTC | #2
On Tue, Oct 27, 2020 at 12:01 AM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Mon, Oct 26, 2020 at 10:51:02PM +0800, Muchun Song wrote:
> > +static void split_vmemmap_pmd(pmd_t *pmd, pte_t *pte_p, unsigned long addr)
> > +{
> > +     struct mm_struct *mm = &init_mm;
> > +     struct page *page;
> > +     pmd_t old_pmd, _pmd;
> > +     int i;
> > +
> > +     old_pmd = READ_ONCE(*pmd);
> > +     page = pmd_page(old_pmd);
> > +     pmd_populate_kernel(mm, &_pmd, pte_p);
> > +
> > +     for (i = 0; i < VMEMMAP_HPAGE_NR; i++, addr += PAGE_SIZE) {
> > +             pte_t entry, *pte;
> > +
> > +             entry = mk_pte(page + i, PAGE_KERNEL);
>
> I'd be happier if that were:
>
>         pgprot_t pgprot = PAGE_KERNEL;
> ...
>         for (i = 0; i < VMEMMAP_HPAGE_NR; i++, addr += PAGE_SIZE) {
>                 pte_t entry, *pte;
>
>                 entry = mk_pte(page + i, pgprot);
>                 pgprot = PAGE_KERNEL_RO;
>
> so that all subsequent tail pages are mapped read-only.
>

Good idea, doing this can catch some illegal operations. Thanks.
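
For reference, a minimal sketch of split_vmemmap_pmd() with that change
folded in (assuming VMEMMAP_HPAGE_NR and the other helpers introduced
elsewhere in this series):

static void split_vmemmap_pmd(pmd_t *pmd, pte_t *pte_p, unsigned long addr)
{
	struct mm_struct *mm = &init_mm;
	struct page *page;
	pmd_t old_pmd, _pmd;
	pgprot_t pgprot = PAGE_KERNEL;
	int i;

	old_pmd = READ_ONCE(*pmd);
	page = pmd_page(old_pmd);
	pmd_populate_kernel(mm, &_pmd, pte_p);

	for (i = 0; i < VMEMMAP_HPAGE_NR; i++, addr += PAGE_SIZE) {
		pte_t entry, *pte;

		/* First entry keeps PAGE_KERNEL; all later entries are RO. */
		entry = mk_pte(page + i, pgprot);
		pgprot = PAGE_KERNEL_RO;
		pte = pte_offset_kernel(&_pmd, addr);
		VM_BUG_ON(!pte_none(*pte));
		set_pte_at(mm, addr, pte, entry);
	}

	/* make pte visible before pmd */
	smp_wmb();
	pmd_populate_kernel(mm, pmd, pte_p);
}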
Mike Kravetz Oct. 28, 2020, 11:42 p.m. UTC | #3
On 10/26/20 7:51 AM, Muchun Song wrote:
> When we allocate a hugetlb page from the buddy, we should free the
> unused vmemmap pages associated with it. We can do that in the
> prep_new_huge_page().
> 
> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> ---
>  arch/x86/include/asm/hugetlb.h          |   7 +
>  arch/x86/include/asm/pgtable_64_types.h |   8 +
>  include/linux/hugetlb.h                 |   7 +
>  mm/hugetlb.c                            | 190 ++++++++++++++++++++++++
>  4 files changed, 212 insertions(+)
> 
> diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
> index f5e882f999cd..7c3eb60c2198 100644
> --- a/arch/x86/include/asm/hugetlb.h
> +++ b/arch/x86/include/asm/hugetlb.h
> @@ -4,10 +4,17 @@
>  
>  #include <asm/page.h>
>  #include <asm-generic/hugetlb.h>
> +#include <asm/pgtable.h>
>  
>  #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
>  #define VMEMMAP_HPAGE_SHIFT			PMD_SHIFT
>  #define arch_vmemmap_support_huge_mapping()	boot_cpu_has(X86_FEATURE_PSE)
> +
> +#define vmemmap_pmd_huge vmemmap_pmd_huge
> +static inline bool vmemmap_pmd_huge(pmd_t *pmd)
> +{
> +	return pmd_large(*pmd);
> +}
>  #endif
>  
>  #define hugepages_supported() boot_cpu_has(X86_FEATURE_PSE)
> diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
> index 52e5f5f2240d..bedbd2e7d06c 100644
> --- a/arch/x86/include/asm/pgtable_64_types.h
> +++ b/arch/x86/include/asm/pgtable_64_types.h
> @@ -139,6 +139,14 @@ extern unsigned int ptrs_per_p4d;
>  # define VMEMMAP_START		__VMEMMAP_BASE_L4
>  #endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
>  
> +/*
> + * VMEMMAP_SIZE - allows the whole linear region to be covered by
> + *                a struct page array.
> + */
> +#define VMEMMAP_SIZE		(1UL << (__VIRTUAL_MASK_SHIFT - PAGE_SHIFT - \
> +					 1 + ilog2(sizeof(struct page))))
> +#define VMEMMAP_END		(VMEMMAP_START + VMEMMAP_SIZE)
> +
>  #define VMALLOC_END		(VMALLOC_START + (VMALLOC_SIZE_TB << 40) - 1)
>  
>  #define MODULES_VADDR		(__START_KERNEL_map + KERNEL_IMAGE_SIZE)
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index ace304a6196c..919f47d77117 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -601,6 +601,13 @@ static inline bool arch_vmemmap_support_huge_mapping(void)
>  }
>  #endif
>  
> +#ifndef vmemmap_pmd_huge

Let's add
#define vmemmap_pmd_huge vmemmap_pmd_huge
just in case code gets moved around in header file.
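That is, the generic fallback would read something like:

#ifndef vmemmap_pmd_huge
#define vmemmap_pmd_huge vmemmap_pmd_huge
static inline bool vmemmap_pmd_huge(pmd_t *pmd)
{
	return pmd_huge(*pmd);
}
#endif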

> +static inline bool vmemmap_pmd_huge(pmd_t *pmd)
> +{
> +	return pmd_huge(*pmd);
> +}
> +#endif
> +
>  #ifndef VMEMMAP_HPAGE_SHIFT
>  #define VMEMMAP_HPAGE_SHIFT		PMD_SHIFT
>  #endif
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index d6ae9b6876be..aa012d603e06 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1293,10 +1293,20 @@ static inline void destroy_compound_gigantic_page(struct page *page,
>  #endif
>  
>  #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
> +#include <linux/bootmem_info.h>
> +
>  #define RESERVE_VMEMMAP_NR	2U
> +#define RESERVE_VMEMMAP_SIZE	(RESERVE_VMEMMAP_NR << PAGE_SHIFT)

Since RESERVE_VMEMMAP_SIZE is not used here, perhaps it should be added
in the patch where it is first used.

>  
>  #define page_huge_pte(page)	((page)->pmd_huge_pte)
>  
> +#define vmemmap_hpage_addr_end(addr, end)				\
> +({									\
> +	unsigned long __boundary;					\
> +	__boundary = ((addr) + VMEMMAP_HPAGE_SIZE) & VMEMMAP_HPAGE_MASK;\
> +	(__boundary - 1 < (end) - 1) ? __boundary : (end);		\
> +})
> +
>  static inline unsigned int nr_free_vmemmap(struct hstate *h)
>  {
>  	return h->nr_free_vmemmap_pages;
> @@ -1416,6 +1426,181 @@ static void __init hugetlb_vmemmap_init(struct hstate *h)
>  	pr_info("HugeTLB: can free %d vmemmap pages for %s\n",
>  		h->nr_free_vmemmap_pages, h->name);
>  }
> +
> +static inline spinlock_t *vmemmap_pmd_lockptr(pmd_t *pmd)
> +{
> +	static DEFINE_SPINLOCK(pgtable_lock);
> +
> +	return &pgtable_lock;
> +}

This is just a global lock.  Correct?  And hugetlb specific?

It should be OK as the page table entries for hugetlb pages will not
overlap with other entries.

> +
> +/*
> + * Walk a vmemmap address to the pmd it maps.
> + */
> +static pmd_t *vmemmap_to_pmd(const void *page)
> +{
> +	unsigned long addr = (unsigned long)page;
> +	pgd_t *pgd;
> +	p4d_t *p4d;
> +	pud_t *pud;
> +	pmd_t *pmd;
> +
> +	if (addr < VMEMMAP_START || addr >= VMEMMAP_END)
> +		return NULL;
> +
> +	pgd = pgd_offset_k(addr);
> +	if (pgd_none(*pgd))
> +		return NULL;
> +	p4d = p4d_offset(pgd, addr);
> +	if (p4d_none(*p4d))
> +		return NULL;
> +	pud = pud_offset(p4d, addr);
> +
> +	WARN_ON_ONCE(pud_bad(*pud));
> +	if (pud_none(*pud) || pud_bad(*pud))
> +		return NULL;
> +	pmd = pmd_offset(pud, addr);
> +
> +	return pmd;
> +}

That routine is not really hugetlb specific.  Perhaps we could move it
to sparse-vmemmap.c?  Or elsewhere?
Muchun Song Oct. 29, 2020, 6:13 a.m. UTC | #4
On Thu, Oct 29, 2020 at 7:42 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
> On 10/26/20 7:51 AM, Muchun Song wrote:
> > When we allocate a hugetlb page from the buddy, we should free the
> > unused vmemmap pages associated with it. We can do that in the
> > prep_new_huge_page().
> >
> > Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> > ---
> >  arch/x86/include/asm/hugetlb.h          |   7 +
> >  arch/x86/include/asm/pgtable_64_types.h |   8 +
> >  include/linux/hugetlb.h                 |   7 +
> >  mm/hugetlb.c                            | 190 ++++++++++++++++++++++++
> >  4 files changed, 212 insertions(+)
> >
> > diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
> > index f5e882f999cd..7c3eb60c2198 100644
> > --- a/arch/x86/include/asm/hugetlb.h
> > +++ b/arch/x86/include/asm/hugetlb.h
> > @@ -4,10 +4,17 @@
> >
> >  #include <asm/page.h>
> >  #include <asm-generic/hugetlb.h>
> > +#include <asm/pgtable.h>
> >
> >  #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
> >  #define VMEMMAP_HPAGE_SHIFT                  PMD_SHIFT
> >  #define arch_vmemmap_support_huge_mapping()  boot_cpu_has(X86_FEATURE_PSE)
> > +
> > +#define vmemmap_pmd_huge vmemmap_pmd_huge
> > +static inline bool vmemmap_pmd_huge(pmd_t *pmd)
> > +{
> > +     return pmd_large(*pmd);
> > +}
> >  #endif
> >
> >  #define hugepages_supported() boot_cpu_has(X86_FEATURE_PSE)
> > diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
> > index 52e5f5f2240d..bedbd2e7d06c 100644
> > --- a/arch/x86/include/asm/pgtable_64_types.h
> > +++ b/arch/x86/include/asm/pgtable_64_types.h
> > @@ -139,6 +139,14 @@ extern unsigned int ptrs_per_p4d;
> >  # define VMEMMAP_START               __VMEMMAP_BASE_L4
> >  #endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
> >
> > +/*
> > + * VMEMMAP_SIZE - allows the whole linear region to be covered by
> > + *                a struct page array.
> > + */
> > +#define VMEMMAP_SIZE         (1UL << (__VIRTUAL_MASK_SHIFT - PAGE_SHIFT - \
> > +                                      1 + ilog2(sizeof(struct page))))
> > +#define VMEMMAP_END          (VMEMMAP_START + VMEMMAP_SIZE)
> > +
> >  #define VMALLOC_END          (VMALLOC_START + (VMALLOC_SIZE_TB << 40) - 1)
> >
> >  #define MODULES_VADDR                (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
> > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > index ace304a6196c..919f47d77117 100644
> > --- a/include/linux/hugetlb.h
> > +++ b/include/linux/hugetlb.h
> > @@ -601,6 +601,13 @@ static inline bool arch_vmemmap_support_huge_mapping(void)
> >  }
> >  #endif
> >
> > +#ifndef vmemmap_pmd_huge
>
> Let's add
> #define vmemmap_pmd_huge vmemmap_pmd_huge
> just in case code gets moved around in header file.

OK, will do.

>
> > +static inline bool vmemmap_pmd_huge(pmd_t *pmd)
> > +{
> > +     return pmd_huge(*pmd);
> > +}
> > +#endif
> > +
> >  #ifndef VMEMMAP_HPAGE_SHIFT
> >  #define VMEMMAP_HPAGE_SHIFT          PMD_SHIFT
> >  #endif
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index d6ae9b6876be..aa012d603e06 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -1293,10 +1293,20 @@ static inline void destroy_compound_gigantic_page(struct page *page,
> >  #endif
> >
> >  #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
> > +#include <linux/bootmem_info.h>
> > +
> >  #define RESERVE_VMEMMAP_NR   2U
> > +#define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT)
>
> Since RESERVE_VMEMMAP_SIZE is not used here, perhaps it should be added
> in the patch where it is first used.

Will do.

>
> >
> >  #define page_huge_pte(page)  ((page)->pmd_huge_pte)
> >
> > +#define vmemmap_hpage_addr_end(addr, end)                            \
> > +({                                                                   \
> > +     unsigned long __boundary;                                       \
> > +     __boundary = ((addr) + VMEMMAP_HPAGE_SIZE) & VMEMMAP_HPAGE_MASK;\
> > +     (__boundary - 1 < (end) - 1) ? __boundary : (end);              \
> > +})
> > +
> >  static inline unsigned int nr_free_vmemmap(struct hstate *h)
> >  {
> >       return h->nr_free_vmemmap_pages;
> > @@ -1416,6 +1426,181 @@ static void __init hugetlb_vmemmap_init(struct hstate *h)
> >       pr_info("HugeTLB: can free %d vmemmap pages for %s\n",
> >               h->nr_free_vmemmap_pages, h->name);
> >  }
> > +
> > +static inline spinlock_t *vmemmap_pmd_lockptr(pmd_t *pmd)
> > +{
> > +     static DEFINE_SPINLOCK(pgtable_lock);
> > +
> > +     return &pgtable_lock;
> > +}
>
> This is just a global lock.  Correct?  And hugetlb specific?

Yes, it is a global lock. Originally, I wanted to use the pmd lock (e.g.
pmd_lockptr()). But we would need to allocate memory for the spinlock and
initialize it when ALLOC_SPLIT_PTLOCKS, which may increase the
complexity.

And I think that allocating/freeing hugetlb pages is not a frequent
operation here, so I finally used a global lock. Maybe it is enough.
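
For comparison, a per-PMD variant would look roughly like the sketch
below (hypothetical here, and it assumes the split ptlock of the vmemmap
page table page has already been initialized, which is exactly the extra
setup mentioned above):

static inline spinlock_t *vmemmap_pmd_lockptr(pmd_t *pmd)
{
	/*
	 * Hypothetical per-PMD locking: relies on ptlock_init() having
	 * run for this page table page when ALLOC_SPLIT_PTLOCKS.
	 */
	return pmd_lockptr(&init_mm, pmd);
}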

>
> It should be OK as the page table entries for hugetlb pages will not
> overlap with other entries.

Does "hugetlb specific" mean the pmd lock, or a per-hugetlb lock?
If it is the pmd lock, this is fine to me. If not, it may not be enough,
because the lock also guards the splitting of the pmd pgtable.

Thanks.
>
> > +
> > +/*
> > + * Walk a vmemmap address to the pmd it maps.
> > + */
> > +static pmd_t *vmemmap_to_pmd(const void *page)
> > +{
> > +     unsigned long addr = (unsigned long)page;
> > +     pgd_t *pgd;
> > +     p4d_t *p4d;
> > +     pud_t *pud;
> > +     pmd_t *pmd;
> > +
> > +     if (addr < VMEMMAP_START || addr >= VMEMMAP_END)
> > +             return NULL;
> > +
> > +     pgd = pgd_offset_k(addr);
> > +     if (pgd_none(*pgd))
> > +             return NULL;
> > +     p4d = p4d_offset(pgd, addr);
> > +     if (p4d_none(*p4d))
> > +             return NULL;
> > +     pud = pud_offset(p4d, addr);
> > +
> > +     WARN_ON_ONCE(pud_bad(*pud));
> > +     if (pud_none(*pud) || pud_bad(*pud))
> > +             return NULL;
> > +     pmd = pmd_offset(pud, addr);
> > +
> > +     return pmd;
> > +}
>
> That routine is not really hugetlb specific.  Perhaps we could move it
> to sparse-vmemmap.c?  Or elsewhere?

Yeah, we can move it to sparse-vmemmap.c; that may be better.

>
> --
> Mike Kravetz
Mike Kravetz Oct. 29, 2020, 9:59 p.m. UTC | #5
On 10/28/20 11:13 PM, Muchun Song wrote:
> On Thu, Oct 29, 2020 at 7:42 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>>
>> On 10/26/20 7:51 AM, Muchun Song wrote:
>>> +
>>> +static inline spinlock_t *vmemmap_pmd_lockptr(pmd_t *pmd)
>>> +{
>>> +     static DEFINE_SPINLOCK(pgtable_lock);
>>> +
>>> +     return &pgtable_lock;
>>> +}
>>
>> This is just a global lock.  Correct?  And hugetlb specific?
> 
> Yes, it is a global lock. Originally, I wanted to use the pmd lock (e.g.
> pmd_lockptr()). But we would need to allocate memory for the spinlock and
> initialize it when ALLOC_SPLIT_PTLOCKS, which may increase the
> complexity.
> 
> And I think that allocating/freeing hugetlb pages is not a frequent
> operation here, so I finally used a global lock. Maybe it is enough.
> 
>>
>> It should be OK as the page table entries for hugetlb pages will not
>> overlap with other entries.
> 
> Does "hugetlb specific" mean the pmd lock, or a per-hugetlb lock?
> If it is the pmd lock, this is fine to me. If not, it may not be enough,
> because the lock also guards the splitting of the pmd pgtable.

By "hugetlb specific", I was trying to say that only hugetlb code would
use this lock.  It is not a concern now.  However, there has been talk
about other code doing something similar to remove struct pages.  If that
ever happens then we will need a different locking scheme.

Disregard my statement about there being no overlap.  I was confusing
page tables for huge pages with page tables for mappings (mmap entries)
of huge pages.
Muchun Song Oct. 30, 2020, 2:58 a.m. UTC | #6
On Fri, Oct 30, 2020 at 6:00 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
> On 10/28/20 11:13 PM, Muchun Song wrote:
> > On Thu, Oct 29, 2020 at 7:42 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
> >>
> >> On 10/26/20 7:51 AM, Muchun Song wrote:
> >>> +
> >>> +static inline spinlock_t *vmemmap_pmd_lockptr(pmd_t *pmd)
> >>> +{
> >>> +     static DEFINE_SPINLOCK(pgtable_lock);
> >>> +
> >>> +     return &pgtable_lock;
> >>> +}
> >>
> >> This is just a global lock.  Correct?  And hugetlb specific?
> >
> > Yes, it is a global lock. Originally, I wanted to use the pmd lock (e.g.
> > pmd_lockptr()). But we would need to allocate memory for the spinlock and
> > initialize it when ALLOC_SPLIT_PTLOCKS, which may increase the
> > complexity.
> >
> > And I think that allocating/freeing hugetlb pages is not a frequent
> > operation here, so I finally used a global lock. Maybe it is enough.
> >
> >>
> >> It should be OK as the page table entries for hugetlb pages will not
> >> overlap with other entries.
> >
> > Does "hugetlb specific" mean the pmd lock, or a per-hugetlb lock?
> > If it is the pmd lock, this is fine to me. If not, it may not be enough,
> > because the lock also guards the splitting of the pmd pgtable.
>
> By "hugetlb specific", I was trying to say that only hugetlb code would
> use this lock.  It is not a concern now.  However, there has been talk
> about other code doing something similar to remove struct pages.  If that
> ever happens then we will need a different locking scheme.

Agreed, it is not a concern now :)

>
> Disregard my statement about there being no overlap.  I was confusing
> page tables for huge pages with page tables for mappings (mmap entries)
> of huge pages.
> --
> Mike Kravetz

Patch

diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
index f5e882f999cd..7c3eb60c2198 100644
--- a/arch/x86/include/asm/hugetlb.h
+++ b/arch/x86/include/asm/hugetlb.h
@@ -4,10 +4,17 @@ 
 
 #include <asm/page.h>
 #include <asm-generic/hugetlb.h>
+#include <asm/pgtable.h>
 
 #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
 #define VMEMMAP_HPAGE_SHIFT			PMD_SHIFT
 #define arch_vmemmap_support_huge_mapping()	boot_cpu_has(X86_FEATURE_PSE)
+
+#define vmemmap_pmd_huge vmemmap_pmd_huge
+static inline bool vmemmap_pmd_huge(pmd_t *pmd)
+{
+	return pmd_large(*pmd);
+}
 #endif
 
 #define hugepages_supported() boot_cpu_has(X86_FEATURE_PSE)
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 52e5f5f2240d..bedbd2e7d06c 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -139,6 +139,14 @@  extern unsigned int ptrs_per_p4d;
 # define VMEMMAP_START		__VMEMMAP_BASE_L4
 #endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
 
+/*
+ * VMEMMAP_SIZE - allows the whole linear region to be covered by
+ *                a struct page array.
+ */
+#define VMEMMAP_SIZE		(1UL << (__VIRTUAL_MASK_SHIFT - PAGE_SHIFT - \
+					 1 + ilog2(sizeof(struct page))))
+#define VMEMMAP_END		(VMEMMAP_START + VMEMMAP_SIZE)
+
 #define VMALLOC_END		(VMALLOC_START + (VMALLOC_SIZE_TB << 40) - 1)
 
 #define MODULES_VADDR		(__START_KERNEL_map + KERNEL_IMAGE_SIZE)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index ace304a6196c..919f47d77117 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -601,6 +601,13 @@  static inline bool arch_vmemmap_support_huge_mapping(void)
 }
 #endif
 
+#ifndef vmemmap_pmd_huge
+static inline bool vmemmap_pmd_huge(pmd_t *pmd)
+{
+	return pmd_huge(*pmd);
+}
+#endif
+
 #ifndef VMEMMAP_HPAGE_SHIFT
 #define VMEMMAP_HPAGE_SHIFT		PMD_SHIFT
 #endif
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d6ae9b6876be..aa012d603e06 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1293,10 +1293,20 @@  static inline void destroy_compound_gigantic_page(struct page *page,
 #endif
 
 #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+#include <linux/bootmem_info.h>
+
 #define RESERVE_VMEMMAP_NR	2U
+#define RESERVE_VMEMMAP_SIZE	(RESERVE_VMEMMAP_NR << PAGE_SHIFT)
 
 #define page_huge_pte(page)	((page)->pmd_huge_pte)
 
+#define vmemmap_hpage_addr_end(addr, end)				\
+({									\
+	unsigned long __boundary;					\
+	__boundary = ((addr) + VMEMMAP_HPAGE_SIZE) & VMEMMAP_HPAGE_MASK;\
+	(__boundary - 1 < (end) - 1) ? __boundary : (end);		\
+})
+
 static inline unsigned int nr_free_vmemmap(struct hstate *h)
 {
 	return h->nr_free_vmemmap_pages;
@@ -1416,6 +1426,181 @@  static void __init hugetlb_vmemmap_init(struct hstate *h)
 	pr_info("HugeTLB: can free %d vmemmap pages for %s\n",
 		h->nr_free_vmemmap_pages, h->name);
 }
+
+static inline spinlock_t *vmemmap_pmd_lockptr(pmd_t *pmd)
+{
+	static DEFINE_SPINLOCK(pgtable_lock);
+
+	return &pgtable_lock;
+}
+
+/*
+ * Walk a vmemmap address to the pmd it maps.
+ */
+static pmd_t *vmemmap_to_pmd(const void *page)
+{
+	unsigned long addr = (unsigned long)page;
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	if (addr < VMEMMAP_START || addr >= VMEMMAP_END)
+		return NULL;
+
+	pgd = pgd_offset_k(addr);
+	if (pgd_none(*pgd))
+		return NULL;
+	p4d = p4d_offset(pgd, addr);
+	if (p4d_none(*p4d))
+		return NULL;
+	pud = pud_offset(p4d, addr);
+
+	WARN_ON_ONCE(pud_bad(*pud));
+	if (pud_none(*pud) || pud_bad(*pud))
+		return NULL;
+	pmd = pmd_offset(pud, addr);
+
+	return pmd;
+}
+
+static inline int freed_vmemmap_hpage(struct page *page)
+{
+	return atomic_read(&page->_mapcount) + 1;
+}
+
+static inline int freed_vmemmap_hpage_inc(struct page *page)
+{
+	return atomic_inc_return_relaxed(&page->_mapcount) + 1;
+}
+
+static inline int freed_vmemmap_hpage_dec(struct page *page)
+{
+	return atomic_dec_return_relaxed(&page->_mapcount) + 1;
+}
+
+static inline void free_vmemmap_page_list(struct list_head *list)
+{
+	struct page *page, *next;
+
+	list_for_each_entry_safe(page, next, list, lru) {
+		list_del(&page->lru);
+		free_vmemmap_page(page);
+	}
+}
+
+static void __free_huge_page_pte_vmemmap(struct page *reuse, pte_t *ptep,
+					 unsigned long start,
+					 unsigned int nr_free,
+					 struct list_head *free_pages)
+{
+	pte_t entry = mk_pte(reuse, PAGE_KERNEL);
+	unsigned long addr;
+	unsigned long end = start + (nr_free << PAGE_SHIFT);
+
+	for (addr = start; addr < end; addr += PAGE_SIZE, ptep++) {
+		struct page *page;
+		pte_t old = *ptep;
+
+		VM_WARN_ON(!pte_present(old));
+		page = pte_page(old);
+		list_add(&page->lru, free_pages);
+
+		set_pte_at(&init_mm, addr, ptep, entry);
+	}
+}
+
+static void __free_huge_page_pmd_vmemmap(struct hstate *h, pmd_t *pmd,
+					 unsigned long addr,
+					 struct list_head *free_pages)
+{
+	unsigned long next;
+	unsigned long start = addr + RESERVE_VMEMMAP_NR * PAGE_SIZE;
+	unsigned long end = addr + nr_vmemmap_size(h);
+	struct page *reuse = NULL;
+
+	addr = start;
+	do {
+		unsigned int nr_pages;
+		pte_t *ptep;
+
+		ptep = pte_offset_kernel(pmd, addr);
+		if (!reuse)
+			reuse = pte_page(ptep[-1]);
+
+		next = vmemmap_hpage_addr_end(addr, end);
+		nr_pages = (next - addr) >> PAGE_SHIFT;
+		__free_huge_page_pte_vmemmap(reuse, ptep, addr, nr_pages,
+					     free_pages);
+	} while (pmd++, addr = next, addr != end);
+
+	flush_tlb_kernel_range(start, end);
+}
+
+static void split_vmemmap_pmd(pmd_t *pmd, pte_t *pte_p, unsigned long addr)
+{
+	struct mm_struct *mm = &init_mm;
+	struct page *page;
+	pmd_t old_pmd, _pmd;
+	int i;
+
+	old_pmd = READ_ONCE(*pmd);
+	page = pmd_page(old_pmd);
+	pmd_populate_kernel(mm, &_pmd, pte_p);
+
+	for (i = 0; i < VMEMMAP_HPAGE_NR; i++, addr += PAGE_SIZE) {
+		pte_t entry, *pte;
+
+		entry = mk_pte(page + i, PAGE_KERNEL);
+		pte = pte_offset_kernel(&_pmd, addr);
+		VM_BUG_ON(!pte_none(*pte));
+		set_pte_at(mm, addr, pte, entry);
+	}
+
+	/* make pte visible before pmd */
+	smp_wmb();
+	pmd_populate_kernel(mm, pmd, pte_p);
+}
+
+static void split_vmemmap_huge_page(struct page *head, pmd_t *pmd)
+{
+	pte_t *pte_p;
+	unsigned long start = (unsigned long)head & VMEMMAP_HPAGE_MASK;
+	unsigned long addr = start;
+
+	while ((pte_p = vmemmap_pgtable_withdraw(head))) {
+		VM_BUG_ON(freed_vmemmap_hpage(virt_to_page(pte_p)));
+		split_vmemmap_pmd(pmd++, pte_p, addr);
+		addr += VMEMMAP_HPAGE_SIZE;
+	}
+
+	flush_tlb_kernel_range(start, addr);
+}
+
+static void free_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+	pmd_t *pmd;
+	spinlock_t *ptl;
+	LIST_HEAD(free_pages);
+
+	if (!nr_free_vmemmap(h))
+		return;
+
+	pmd = vmemmap_to_pmd(head);
+	ptl = vmemmap_pmd_lockptr(pmd);
+
+	spin_lock(ptl);
+	if (vmemmap_pmd_huge(pmd)) {
+		VM_BUG_ON(!nr_pgtable(h));
+		split_vmemmap_huge_page(head, pmd);
+	}
+
+	__free_huge_page_pmd_vmemmap(h, pmd, (unsigned long)head, &free_pages);
+	freed_vmemmap_hpage_inc(pmd_page(*pmd));
+	spin_unlock(ptl);
+
+	free_vmemmap_page_list(&free_pages);
+}
 #else
 static inline void hugetlb_vmemmap_init(struct hstate *h)
 {
@@ -1429,6 +1614,10 @@  static inline int vmemmap_pgtable_prealloc(struct hstate *h, struct page *page)
 static inline void vmemmap_pgtable_free(struct hstate *h, struct page *page)
 {
 }
+
+static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+}
 #endif
 
 static void update_and_free_page(struct hstate *h, struct page *page)
@@ -1637,6 +1826,7 @@  void free_huge_page(struct page *page)
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 {
+	free_huge_page_vmemmap(h, page);
 	/* Must be called before the initialization of @page->lru */
 	vmemmap_pgtable_free(h, page);