
[v3,09/21] mm/hugetlb: Free the vmemmap pages associated with each hugetlb page

Message ID 20201108141113.65450-10-songmuchun@bytedance.com (mailing list archive)
State New, archived
Series Free some vmemmap pages of hugetlb page

Commit Message

Muchun Song Nov. 8, 2020, 2:11 p.m. UTC
When we allocate a hugetlb page from the buddy, we should free the
unused vmemmap pages associated with it. We can do that in
prep_new_huge_page().

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 arch/x86/include/asm/hugetlb.h          |   9 ++
 arch/x86/include/asm/pgtable_64_types.h |   8 ++
 include/linux/hugetlb.h                 |   8 ++
 include/linux/mm.h                      |   4 +
 mm/hugetlb.c                            | 166 ++++++++++++++++++++++++++++++++
 mm/sparse-vmemmap.c                     |  31 ++++++
 6 files changed, 226 insertions(+)

Comments

Oscar Salvador Nov. 9, 2020, 6:51 p.m. UTC | #1
On Sun, Nov 08, 2020 at 10:11:01PM +0800, Muchun Song wrote:
> +static inline int freed_vmemmap_hpage(struct page *page)
> +{
> +	return atomic_read(&page->_mapcount) + 1;
> +}
> +
> +static inline int freed_vmemmap_hpage_inc(struct page *page)
> +{
> +	return atomic_inc_return_relaxed(&page->_mapcount) + 1;
> +}
> +
> +static inline int freed_vmemmap_hpage_dec(struct page *page)
> +{
> +	return atomic_dec_return_relaxed(&page->_mapcount) + 1;
> +}

Are these relaxed variants any different from the normal ones on x86_64?
I got confused following the macros.

> +static void __free_huge_page_pte_vmemmap(struct page *reuse, pte_t *ptep,
> +					 unsigned long start,
> +					 unsigned int nr_free,
> +					 struct list_head *free_pages)
> +{
> +	/* Make the tail pages are mapped read-only. */
> +	pgprot_t pgprot = PAGE_KERNEL_RO;
> +	pte_t entry = mk_pte(reuse, pgprot);
> +	unsigned long addr;
> +	unsigned long end = start + (nr_free << PAGE_SHIFT);

See below.

> +static void __free_huge_page_pmd_vmemmap(struct hstate *h, pmd_t *pmd,
> +					 unsigned long addr,
> +					 struct list_head *free_pages)
> +{
> +	unsigned long next;
> +	unsigned long start = addr + RESERVE_VMEMMAP_NR * PAGE_SIZE;
> +	unsigned long end = addr + vmemmap_pages_size_per_hpage(h);
> +	struct page *reuse = NULL;
> +
> +	addr = start;
> +	do {
> +		unsigned int nr_pages;
> +		pte_t *ptep;
> +
> +		ptep = pte_offset_kernel(pmd, addr);
> +		if (!reuse)
> +			reuse = pte_page(ptep[-1]);

Can we define a proper name for that instead of -1?

e.g.: TAIL_PAGE_REUSE or something like that.

> +
> +		next = vmemmap_hpage_addr_end(addr, end);
> +		nr_pages = (next - addr) >> PAGE_SHIFT;
> +		__free_huge_page_pte_vmemmap(reuse, ptep, addr, nr_pages,
> +					     free_pages);

Why not pass next instead of nr_pages? I think it makes more sense.
As a bonus we can kill the variable.

> +static void split_vmemmap_huge_page(struct hstate *h, struct page *head,
> +				    pmd_t *pmd)
> +{
> +	pgtable_t pgtable;
> +	unsigned long start = (unsigned long)head & VMEMMAP_HPAGE_MASK;
> +	unsigned long addr = start;
> +	unsigned int nr = pgtable_pages_to_prealloc_per_hpage(h);
> +
> +	while (nr-- && (pgtable = vmemmap_pgtable_withdraw(head))) {

Same as in the previous patches, I would scrap "nr" and its use.

> +		VM_BUG_ON(freed_vmemmap_hpage(pgtable));

I guess here we want to check whether we already called free_huge_page_vmemmap
on this range?
For this to have happened, the locking should have failed, right?

> +static void free_huge_page_vmemmap(struct hstate *h, struct page *head)
> +{
> +	pmd_t *pmd;
> +	spinlock_t *ptl;
> +	LIST_HEAD(free_pages);
> +
> +	if (!free_vmemmap_pages_per_hpage(h))
> +		return;
> +
> +	pmd = vmemmap_to_pmd(head);
> +	ptl = vmemmap_pmd_lock(pmd);
> +	if (vmemmap_pmd_huge(pmd)) {
> +		VM_BUG_ON(!pgtable_pages_to_prealloc_per_hpage(h));

I think that checking for free_vmemmap_pages_per_hpage is enough.
In the end, pgtable_pages_to_prealloc_per_hpage uses free_vmemmap_pages_per_hpage.
Muchun Song Nov. 10, 2020, 6:40 a.m. UTC | #2
On Tue, Nov 10, 2020 at 2:51 AM Oscar Salvador <osalvador@suse.de> wrote:
>
> On Sun, Nov 08, 2020 at 10:11:01PM +0800, Muchun Song wrote:
> > +static inline int freed_vmemmap_hpage(struct page *page)
> > +{
> > +     return atomic_read(&page->_mapcount) + 1;
> > +}
> > +
> > +static inline int freed_vmemmap_hpage_inc(struct page *page)
> > +{
> > +     return atomic_inc_return_relaxed(&page->_mapcount) + 1;
> > +}
> > +
> > +static inline int freed_vmemmap_hpage_dec(struct page *page)
> > +{
> > +     return atomic_dec_return_relaxed(&page->_mapcount) + 1;
> > +}
>
> Are these relaxed variants any different from the normal ones on x86_64?
> I got confused following the macros.

A PTE table can map the struct page structures of 64 2MB HugeTLB pages.
So I use freed_vmemmap_hpage to count how many of those HugeTLB pages have
already had their vmemmap pages freed to the buddy allocator.

Once the vmemmap pages of a HugeTLB page are freed, we call
freed_vmemmap_hpage_inc; when freeing such a HugeTLB page back to the buddy,
we should call freed_vmemmap_hpage_dec.

If freed_vmemmap_hpage hits zero when a HugeTLB page is freed, we try to merge
the PTE table back into a PMD (currently only supported for gigantic pages). See

  [PATCH v3 19/21] mm/hugetlb: Merge pte to huge pmd only for gigantic
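
Just to spell out the intended pairing (a sketch only: freed_vmemmap_hpage_inc/dec
and pmd_page() are the helpers from this patch, while merge_vmemmap_pte_to_huge_pmd()
is a made-up name standing in for what patch 19/21 actually does):

        /*
         * Once the vmemmap tail pages of one HugeTLB page covered by this
         * vmemmap PMD have been returned to the buddy allocator:
         */
        freed_vmemmap_hpage_inc(pmd_page(*pmd));

        /*
         * And when such a HugeTLB page is released again: if the count drops
         * back to zero, no HugeTLB page covered by this PMD needs the PTE
         * mapping any more, so the PTE table can be merged back into a
         * huge PMD.
         */
        if (freed_vmemmap_hpage_dec(pmd_page(*pmd)) == 0)
                merge_vmemmap_pte_to_huge_pmd(pmd);    /* hypothetical name */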

Thanks.

>
> > +static void __free_huge_page_pte_vmemmap(struct page *reuse, pte_t *ptep,
> > +                                      unsigned long start,
> > +                                      unsigned int nr_free,
> > +                                      struct list_head *free_pages)
> > +{
> > +     /* Make the tail pages are mapped read-only. */
> > +     pgprot_t pgprot = PAGE_KERNEL_RO;
> > +     pte_t entry = mk_pte(reuse, pgprot);
> > +     unsigned long addr;
> > +     unsigned long end = start + (nr_free << PAGE_SHIFT);
>
> See below.
>
> > +static void __free_huge_page_pmd_vmemmap(struct hstate *h, pmd_t *pmd,
> > +                                      unsigned long addr,
> > +                                      struct list_head *free_pages)
> > +{
> > +     unsigned long next;
> > +     unsigned long start = addr + RESERVE_VMEMMAP_NR * PAGE_SIZE;
> > +     unsigned long end = addr + vmemmap_pages_size_per_hpage(h);
> > +     struct page *reuse = NULL;
> > +
> > +     addr = start;
> > +     do {
> > +             unsigned int nr_pages;
> > +             pte_t *ptep;
> > +
> > +             ptep = pte_offset_kernel(pmd, addr);
> > +             if (!reuse)
> > +                     reuse = pte_page(ptep[-1]);
>
> Can we define a proper name for that instead of -1?
>
> e.g.: TAIL_PAGE_REUSE or something like that.

OK, will do.
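
Something like this, I suppose (just a sketch of the rename):

        /*
         * The PTE right before the range being remapped maps the vmemmap
         * page that is kept and reused for all the tail pages, hence the
         * -1 index.
         */
        #define TAIL_PAGE_REUSE         -1

and then:

                if (!reuse)
                        reuse = pte_page(ptep[TAIL_PAGE_REUSE]);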

>
> > +
> > +             next = vmemmap_hpage_addr_end(addr, end);
> > +             nr_pages = (next - addr) >> PAGE_SHIFT;
> > +             __free_huge_page_pte_vmemmap(reuse, ptep, addr, nr_pages,
> > +                                          free_pages);
>
> Why not pass next instead of nr_pages? I think it makes more sense.
> As a bonus we can kill the variable.

Good catch. We can pass next instead of nr_pages. Thanks.
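
Roughly like this, I think (only a sketch, not tested): the callee takes the
end address directly,

        static void __free_huge_page_pte_vmemmap(struct page *reuse, pte_t *ptep,
                                                 unsigned long start,
                                                 unsigned long end,
                                                 struct list_head *free_pages)

so its local "end = start + (nr_free << PAGE_SHIFT)" computation goes away
(the loop already iterates on addr < end), and the caller no longer needs
nr_pages:

                next = vmemmap_hpage_addr_end(addr, end);
                __free_huge_page_pte_vmemmap(reuse, ptep, addr, next,
                                             free_pages);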


>
> > +static void split_vmemmap_huge_page(struct hstate *h, struct page *head,
> > +                                 pmd_t *pmd)
> > +{
> > +     pgtable_t pgtable;
> > +     unsigned long start = (unsigned long)head & VMEMMAP_HPAGE_MASK;
> > +     unsigned long addr = start;
> > +     unsigned int nr = pgtable_pages_to_prealloc_per_hpage(h);
> > +
> > +     while (nr-- && (pgtable = vmemmap_pgtable_withdraw(head))) {
>
> Same as in the previous patches, I would scrap "nr" and its use.
>
> > +             VM_BUG_ON(freed_vmemmap_hpage(pgtable));
>
> I guess here we want to check whether we already called free_huge_page_vmemmap
> on this range?
> For this to have happened, the locking should have failed, right?

Only the first HugeTLB page should split the PMD to PTE. The other 63
HugeTLB pages do not need to split. Here I want to make sure we are the first.

>
> > +static void free_huge_page_vmemmap(struct hstate *h, struct page *head)
> > +{
> > +     pmd_t *pmd;
> > +     spinlock_t *ptl;
> > +     LIST_HEAD(free_pages);
> > +
> > +     if (!free_vmemmap_pages_per_hpage(h))
> > +             return;
> > +
> > +     pmd = vmemmap_to_pmd(head);
> > +     ptl = vmemmap_pmd_lock(pmd);
> > +     if (vmemmap_pmd_huge(pmd)) {
> > +             VM_BUG_ON(!pgtable_pages_to_prealloc_per_hpage(h));
>
> I think that checking for free_vmemmap_pages_per_hpage is enough.
> In the end, pgtable_pages_to_prealloc_per_hpage uses free_vmemmap_pages_per_hpage.

The free_vmemmap_pages_per_hpage is not enough. See the comments above.

Thanks.

>
>
> --
> Oscar Salvador
> SUSE L3



--
Yours,
Muchun
Oscar Salvador Nov. 10, 2020, 9:48 a.m. UTC | #3
On Tue, Nov 10, 2020 at 02:40:54PM +0800, Muchun Song wrote:
> Only the first HugeTLB page should split the PMD to PTE. The other 63
> HugeTLB pages do not need to split. Here I want to make sure we are the first.

I think the terminology is losing me here.

Say you allocate a 2MB HugeTLB page at ffffea0004100000.

The vmemmap range that represents this is ffffea0004000000 - ffffea0004200000.
That is a 2MB chunk PMD-mapped.
So, in order to free some of those vmemmap pages, we need to break down
that area, remapping it to PTE-based.
I know what you mean, but we are not really splitting hugetlb pages, but
the memmap range they are represented with.
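
To put numbers on that (assuming 4K base pages and a 64 byte struct page):

  one 2MB HugeTLB page            -> 512 struct pages * 64B = 32KB = 8 vmemmap pages
  one PMD-mapped 2MB vmemmap area -> 2MB / 64B = 32768 struct pages
                                  -> 32768 / 512 = 64 HugeTLB pages share that area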

About:

"Only the first HugeTLB page should split the PMD to PTE. The other 63
HugeTLB pages do not need to split. Here I want to make sure we are the first."

That only refers to gigantic pages, right?

> > > +static void free_huge_page_vmemmap(struct hstate *h, struct page *head)
> > > +{
> > > +     pmd_t *pmd;
> > > +     spinlock_t *ptl;
> > > +     LIST_HEAD(free_pages);
> > > +
> > > +     if (!free_vmemmap_pages_per_hpage(h))
> > > +             return;
> > > +
> > > +     pmd = vmemmap_to_pmd(head);
> > > +     ptl = vmemmap_pmd_lock(pmd);
> > > +     if (vmemmap_pmd_huge(pmd)) {
> > > +             VM_BUG_ON(!pgtable_pages_to_prealloc_per_hpage(h));
> >
> > I think that checking for free_vmemmap_pages_per_hpage is enough.
> > In the end, pgtable_pages_to_prealloc_per_hpage uses free_vmemmap_pages_per_hpage.
> 
> The free_vmemmap_pages_per_hpage is not enough. See the comments above.

My comment was about the VM_BUG_ON.
Muchun Song Nov. 10, 2020, 10:47 a.m. UTC | #4
On Tue, Nov 10, 2020 at 5:48 PM Oscar Salvador <osalvador@suse.de> wrote:
>
> On Tue, Nov 10, 2020 at 02:40:54PM +0800, Muchun Song wrote:
> > Only the first HugeTLB page should split the PMD to PTE. The other 63
> > HugeTLB pages do not need to split. Here I want to make sure we are the first.
>
> I think the terminology is losing me here.
>
> Say you allocate a 2MB HugeTLB page at ffffea0004100000.
>
> The vmemmap range that represents this is ffffea0004000000 - ffffea0004200000.
> That is a 2MB chunk PMD-mapped.
> So, in order to free some of those vmemmap pages, we need to break down
> that area, remapping it to PTE-based.
> I know what you mean, but we are not really splitting hugetlb pages, but
> the memmap range they are represented with.

Yeah, you are right. We are splitting the vmemmap instead of hugetlb.
Sorry for the confusion.

>
> About:
>
> "Only the first HugeTLB page should split the PMD to PTE. The other 63
> HugeTLB pages do not need to split. Here I want to make sure we are the first."
>
> That only refers to gigantic pages, right?

Yeah, now it only refers to gigantic pages. Originally, I also wanted to merge
vmemmap PTE to PMD for normal 2MB HugeTLB pages. So I introduced
those macros (e.g. freed_vmemmap_hpage). For 2MB HugeTLB pages, I
haven't found an elegant solution. Hopefully, when you or someone else has
read all of the patch series, we can come up with an elegant solution to
merge the PTEs.

Thanks.

>
> > > > +static void free_huge_page_vmemmap(struct hstate *h, struct page *head)
> > > > +{
> > > > +     pmd_t *pmd;
> > > > +     spinlock_t *ptl;
> > > > +     LIST_HEAD(free_pages);
> > > > +
> > > > +     if (!free_vmemmap_pages_per_hpage(h))
> > > > +             return;
> > > > +
> > > > +     pmd = vmemmap_to_pmd(head);
> > > > +     ptl = vmemmap_pmd_lock(pmd);
> > > > +     if (vmemmap_pmd_huge(pmd)) {
> > > > +             VM_BUG_ON(!pgtable_pages_to_prealloc_per_hpage(h));
> > >
> > > I think that checking for free_vmemmap_pages_per_hpage is enough.
> > > In the end, pgtable_pages_to_prealloc_per_hpage uses free_vmemmap_pages_per_hpage.
> >
> > The free_vmemmap_pages_per_hpage is not enough. See the comments above.
>
> My comment was about the VM_BUG_ON.

Sorry, yeah, we can drop it. Thanks.

>
>
> --
> Oscar Salvador
> SUSE L3
Oscar Salvador Nov. 10, 2020, 1:52 p.m. UTC | #5
On Tue, Nov 10, 2020 at 06:47:08PM +0800, Muchun Song wrote:
> > That only refers to gigantic pages, right?
> 
> Yeah, now it only refers to gigantic pages. Originally, I also wanted to merge
> vmemmap PTE to PMD for normal 2MB HugeTLB pages. So I introduced
> those macros (e.g. freed_vmemmap_hpage). For 2MB HugeTLB pages, I
> haven't found an elegant solution. Hopefully, when you or someone else has
> read all of the patch series, we can come up with an elegant solution to
> merge the PTEs.

Well, it is quite a lot of "tricky" code, so it takes some time.

> > > > > +static void free_huge_page_vmemmap(struct hstate *h, struct page *head)
> > > > > +{
> > > > > +     pmd_t *pmd;
> > > > > +     spinlock_t *ptl;
> > > > > +     LIST_HEAD(free_pages);
> > > > > +
> > > > > +     if (!free_vmemmap_pages_per_hpage(h))
> > > > > +             return;
> > > > > +
> > > > > +     pmd = vmemmap_to_pmd(head);
> > > > > +     ptl = vmemmap_pmd_lock(pmd);

I forgot about this one.
You might want to check whether vmemmap_to_pmd returns NULL or not.
If it does, it means that something went wrong anyway, but we should still handle
that case (and print a fat warning or something like that).
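
Something along these lines, I mean (just a sketch on top of
free_huge_page_vmemmap() from this patch):

        pmd = vmemmap_to_pmd(head);
        /*
         * The vmemmap of a HugeTLB page is expected to be backed by a PMD
         * here; if it is not, something already went wrong, but there is
         * no need to take the machine down for it.
         */
        if (WARN_ON_ONCE(!pmd))
                return;

        ptl = vmemmap_pmd_lock(pmd);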
Muchun Song Nov. 10, 2020, 2 p.m. UTC | #6
On Tue, Nov 10, 2020 at 9:52 PM Oscar Salvador <osalvador@suse.de> wrote:
>
> On Tue, Nov 10, 2020 at 06:47:08PM +0800, Muchun Song wrote:
> > > That only refers to gigantic pages, right?
> >
> > Yeah, now it only refers to gigantic pages. Originally, I also wanted to merge
> > vmemmap PTE to PMD for normal 2MB HugeTLB pages. So I introduced
> > those macros (e.g. freed_vmemmap_hpage). For 2MB HugeTLB pages, I
> > haven't found an elegant solution. Hopefully, when you or someone else has
> > read all of the patch series, we can come up with an elegant solution to
> > merge the PTEs.
>
> Well, it is quite a lot of "tricky" code, so it takes some time.
>
> > > > > > +static void free_huge_page_vmemmap(struct hstate *h, struct page *head)
> > > > > > +{
> > > > > > +     pmd_t *pmd;
> > > > > > +     spinlock_t *ptl;
> > > > > > +     LIST_HEAD(free_pages);
> > > > > > +
> > > > > > +     if (!free_vmemmap_pages_per_hpage(h))
> > > > > > +             return;
> > > > > > +
> > > > > > +     pmd = vmemmap_to_pmd(head);
> > > > > > +     ptl = vmemmap_pmd_lock(pmd);
>
> I forgot about this one.
> You might want to check whether vmemmap_to_pmd returns NULL or not.
> If it does, it means that something went wrong anyway, but we should still handle
> that case (and print a fat warning or something like that).

Yeah, maybe add a BUG_ON or WARN_ON. Which one do you think
would be more suitable?


>
>
> --
> Oscar Salvador
> SUSE L3



--
Yours,
Muchun

Patch

diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
index 1721b1aadeb1..c601fe042832 100644
--- a/arch/x86/include/asm/hugetlb.h
+++ b/arch/x86/include/asm/hugetlb.h
@@ -4,6 +4,15 @@ 
 
 #include <asm/page.h>
 #include <asm-generic/hugetlb.h>
+#include <asm/pgtable.h>
+
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+#define vmemmap_pmd_huge vmemmap_pmd_huge
+static inline bool vmemmap_pmd_huge(pmd_t *pmd)
+{
+	return pmd_large(*pmd);
+}
+#endif
 
 #define hugepages_supported() boot_cpu_has(X86_FEATURE_PSE)
 
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 52e5f5f2240d..bedbd2e7d06c 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -139,6 +139,14 @@  extern unsigned int ptrs_per_p4d;
 # define VMEMMAP_START		__VMEMMAP_BASE_L4
 #endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
 
+/*
+ * VMEMMAP_SIZE - allows the whole linear region to be covered by
+ *                a struct page array.
+ */
+#define VMEMMAP_SIZE		(1UL << (__VIRTUAL_MASK_SHIFT - PAGE_SHIFT - \
+					 1 + ilog2(sizeof(struct page))))
+#define VMEMMAP_END		(VMEMMAP_START + VMEMMAP_SIZE)
+
 #define VMALLOC_END		(VMALLOC_START + (VMALLOC_SIZE_TB << 40) - 1)
 
 #define MODULES_VADDR		(__START_KERNEL_map + KERNEL_IMAGE_SIZE)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d81c262418db..afb9b18771c4 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -594,6 +594,14 @@  static inline unsigned int blocks_per_huge_page(struct hstate *h)
 #include <asm/hugetlb.h>
 
 #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+#ifndef vmemmap_pmd_huge
+#define vmemmap_pmd_huge vmemmap_pmd_huge
+static inline bool vmemmap_pmd_huge(pmd_t *pmd)
+{
+	return pmd_huge(*pmd);
+}
+#endif
+
 #ifndef VMEMMAP_HPAGE_SHIFT
 #define VMEMMAP_HPAGE_SHIFT		HPAGE_SHIFT
 #endif
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ce429614d1ab..480faca94c23 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3025,6 +3025,10 @@  static inline void print_vma_addr(char *prefix, unsigned long rip)
 }
 #endif
 
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+pmd_t *vmemmap_to_pmd(const void *page);
+#endif
+
 void *sparse_buffer_alloc(unsigned long size);
 struct page * __populate_section_memmap(unsigned long pfn,
 		unsigned long nr_pages, int nid, struct vmem_altmap *altmap);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5c7be2ee7e15..27f0269aab70 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1293,6 +1293,8 @@  static inline void destroy_compound_gigantic_page(struct page *page,
 #endif
 
 #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+#include <linux/bootmem_info.h>
+
 /*
  * There are 512 struct page structs(8 pages) associated with each 2MB
  * hugetlb page. For tail pages, the value of compound_dtor is the same.
@@ -1305,6 +1307,13 @@  static inline void destroy_compound_gigantic_page(struct page *page,
 
 #define page_huge_pte(page)	((page)->pmd_huge_pte)
 
+#define vmemmap_hpage_addr_end(addr, end)				\
+({									\
+	unsigned long __boundary;					\
+	__boundary = ((addr) + VMEMMAP_HPAGE_SIZE) & VMEMMAP_HPAGE_MASK;\
+	(__boundary - 1 < (end) - 1) ? __boundary : (end);		\
+})
+
 static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
 {
 	return h->nr_free_vmemmap_pages;
@@ -1424,6 +1433,147 @@  static void __init hugetlb_vmemmap_init(struct hstate *h)
 	pr_debug("HugeTLB: can free %d vmemmap pages for %s\n",
 		 h->nr_free_vmemmap_pages, h->name);
 }
+
+static inline int freed_vmemmap_hpage(struct page *page)
+{
+	return atomic_read(&page->_mapcount) + 1;
+}
+
+static inline int freed_vmemmap_hpage_inc(struct page *page)
+{
+	return atomic_inc_return_relaxed(&page->_mapcount) + 1;
+}
+
+static inline int freed_vmemmap_hpage_dec(struct page *page)
+{
+	return atomic_dec_return_relaxed(&page->_mapcount) + 1;
+}
+
+static inline void free_vmemmap_page_list(struct list_head *list)
+{
+	struct page *page, *next;
+
+	list_for_each_entry_safe(page, next, list, lru) {
+		list_del(&page->lru);
+		free_vmemmap_page(page);
+	}
+}
+
+static void __free_huge_page_pte_vmemmap(struct page *reuse, pte_t *ptep,
+					 unsigned long start,
+					 unsigned int nr_free,
+					 struct list_head *free_pages)
+{
+	/* Make the tail pages are mapped read-only. */
+	pgprot_t pgprot = PAGE_KERNEL_RO;
+	pte_t entry = mk_pte(reuse, pgprot);
+	unsigned long addr;
+	unsigned long end = start + (nr_free << PAGE_SHIFT);
+
+	for (addr = start; addr < end; addr += PAGE_SIZE, ptep++) {
+		struct page *page;
+		pte_t old = *ptep;
+
+		VM_WARN_ON(!pte_present(old));
+		page = pte_page(old);
+		list_add(&page->lru, free_pages);
+
+		set_pte_at(&init_mm, addr, ptep, entry);
+	}
+}
+
+static void __free_huge_page_pmd_vmemmap(struct hstate *h, pmd_t *pmd,
+					 unsigned long addr,
+					 struct list_head *free_pages)
+{
+	unsigned long next;
+	unsigned long start = addr + RESERVE_VMEMMAP_NR * PAGE_SIZE;
+	unsigned long end = addr + vmemmap_pages_size_per_hpage(h);
+	struct page *reuse = NULL;
+
+	addr = start;
+	do {
+		unsigned int nr_pages;
+		pte_t *ptep;
+
+		ptep = pte_offset_kernel(pmd, addr);
+		if (!reuse)
+			reuse = pte_page(ptep[-1]);
+
+		next = vmemmap_hpage_addr_end(addr, end);
+		nr_pages = (next - addr) >> PAGE_SHIFT;
+		__free_huge_page_pte_vmemmap(reuse, ptep, addr, nr_pages,
+					     free_pages);
+	} while (pmd++, addr = next, addr != end);
+
+	flush_tlb_kernel_range(start, end);
+}
+
+static void split_vmemmap_pmd(pmd_t *pmd, pte_t *pte_p, unsigned long addr)
+{
+	int i;
+	pgprot_t pgprot = PAGE_KERNEL;
+	struct mm_struct *mm = &init_mm;
+	struct page *page;
+	pmd_t old_pmd, _pmd;
+
+	old_pmd = READ_ONCE(*pmd);
+	page = pmd_page(old_pmd);
+	pmd_populate_kernel(mm, &_pmd, pte_p);
+
+	for (i = 0; i < VMEMMAP_HPAGE_NR; i++, addr += PAGE_SIZE) {
+		pte_t entry, *pte;
+
+		entry = mk_pte(page + i, pgprot);
+		pte = pte_offset_kernel(&_pmd, addr);
+		VM_BUG_ON(!pte_none(*pte));
+		set_pte_at(mm, addr, pte, entry);
+	}
+
+	/* make pte visible before pmd */
+	smp_wmb();
+	pmd_populate_kernel(mm, pmd, pte_p);
+}
+
+static void split_vmemmap_huge_page(struct hstate *h, struct page *head,
+				    pmd_t *pmd)
+{
+	pgtable_t pgtable;
+	unsigned long start = (unsigned long)head & VMEMMAP_HPAGE_MASK;
+	unsigned long addr = start;
+	unsigned int nr = pgtable_pages_to_prealloc_per_hpage(h);
+
+	while (nr-- && (pgtable = vmemmap_pgtable_withdraw(head))) {
+		VM_BUG_ON(freed_vmemmap_hpage(pgtable));
+		split_vmemmap_pmd(pmd++, page_to_virt(pgtable), addr);
+		addr += VMEMMAP_HPAGE_SIZE;
+	}
+
+	flush_tlb_kernel_range(start, addr);
+}
+
+static void free_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+	pmd_t *pmd;
+	spinlock_t *ptl;
+	LIST_HEAD(free_pages);
+
+	if (!free_vmemmap_pages_per_hpage(h))
+		return;
+
+	pmd = vmemmap_to_pmd(head);
+	ptl = vmemmap_pmd_lock(pmd);
+	if (vmemmap_pmd_huge(pmd)) {
+		VM_BUG_ON(!pgtable_pages_to_prealloc_per_hpage(h));
+		split_vmemmap_huge_page(h, head, pmd);
+	}
+
+	__free_huge_page_pmd_vmemmap(h, pmd, (unsigned long)head, &free_pages);
+	freed_vmemmap_hpage_inc(pmd_page(*pmd));
+	spin_unlock(ptl);
+
+	free_vmemmap_page_list(&free_pages);
+}
 #else
 static inline void hugetlb_vmemmap_init(struct hstate *h)
 {
@@ -1437,6 +1587,10 @@  static inline int vmemmap_pgtable_prealloc(struct hstate *h, struct page *page)
 static inline void vmemmap_pgtable_free(struct hstate *h, struct page *page)
 {
 }
+
+static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+}
 #endif
 
 static void update_and_free_page(struct hstate *h, struct page *page)
@@ -1645,6 +1799,10 @@  void free_huge_page(struct page *page)
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 {
+	free_huge_page_vmemmap(h, page);
+	/* Must be called before the initialization of @page->lru */
+	vmemmap_pgtable_free(h, page);
+
 	INIT_LIST_HEAD(&page->lru);
 	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
 	set_hugetlb_cgroup(page, NULL);
@@ -1897,6 +2055,14 @@  static struct page *alloc_fresh_huge_page(struct hstate *h,
 	if (!page)
 		return NULL;
 
+	if (vmemmap_pgtable_prealloc(h, page)) {
+		if (hstate_is_gigantic(h))
+			free_gigantic_page(page, huge_page_order(h));
+		else
+			put_page(page);
+		return NULL;
+	}
+
 	if (hstate_is_gigantic(h))
 		prep_compound_gigantic_page(page, huge_page_order(h));
 	prep_new_huge_page(h, page, page_to_nid(page));
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 16183d85a7d5..4b35d1655a2f 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -263,3 +263,34 @@  struct page * __meminit __populate_section_memmap(unsigned long pfn,
 
 	return pfn_to_page(pfn);
 }
+
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+/*
+ * Walk a vmemmap address to the pmd it maps.
+ */
+pmd_t *vmemmap_to_pmd(const void *page)
+{
+	unsigned long addr = (unsigned long)page;
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	if (addr < VMEMMAP_START || addr >= VMEMMAP_END)
+		return NULL;
+
+	pgd = pgd_offset_k(addr);
+	if (pgd_none(*pgd))
+		return NULL;
+	p4d = p4d_offset(pgd, addr);
+	if (p4d_none(*p4d))
+		return NULL;
+	pud = pud_offset(p4d, addr);
+
+	if (pud_none(*pud) || pud_bad(*pud))
+		return NULL;
+	pmd = pmd_offset(pud, addr);
+
+	return pmd;
+}
+#endif