diff mbox series

[v9,05/11] mm/hugetlb: Allocate the vmemmap pages associated with each HugeTLB page

Message ID 20201213154534.54826-6-songmuchun@bytedance.com (mailing list archive)
State New, archived
Headers show
Series Free some vmemmap pages of HugeTLB page | expand

Commit Message

Muchun Song Dec. 13, 2020, 3:45 p.m. UTC
When we free a HugeTLB page to the buddy allocator, we should allocate the
vmemmap pages associated with it. We can do that in __free_hugepage(),
before freeing the page to the buddy allocator.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 include/linux/mm.h   |  1 +
 mm/hugetlb.c         |  2 ++
 mm/hugetlb_vmemmap.c | 11 +++++++++
 mm/hugetlb_vmemmap.h |  5 ++++
 mm/sparse-vmemmap.c  | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 5 files changed, 87 insertions(+), 1 deletion(-)

Comments

Mike Kravetz Dec. 17, 2020, 1:17 a.m. UTC | #1
On 12/13/20 7:45 AM, Muchun Song wrote:
> When we free a HugeTLB page to the buddy allocator, we should allocate the
> vmemmap pages associated with it. We can do that in the __free_hugepage()
> before freeing it to buddy.

...

> diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
> index 78c527617e8d..ffcf092c92ed 100644
> --- a/mm/sparse-vmemmap.c
> +++ b/mm/sparse-vmemmap.c
> @@ -29,6 +29,7 @@
>  #include <linux/sched.h>
>  #include <linux/pgtable.h>
>  #include <linux/bootmem_info.h>
> +#include <linux/delay.h>
>  
>  #include <asm/dma.h>
>  #include <asm/pgalloc.h>
> @@ -39,7 +40,8 @@
>   *
>   * @rmap_pte:		called for each non-empty PTE (lowest-level) entry.
>   * @reuse:		the page which is reused for the tail vmemmap pages.
> - * @vmemmap_pages:	the list head of the vmemmap pages that can be freed.
> + * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
> + *			or is mapped from.
>   */
>  struct vmemmap_rmap_walk {
>  	void (*rmap_pte)(pte_t *pte, unsigned long addr,
> @@ -54,6 +56,9 @@ struct vmemmap_rmap_walk {
>   */
>  #define VMEMMAP_TAIL_PAGE_REUSE		-1
>  
> +/* The gfp mask of allocating vmemmap page */
> +#define GFP_VMEMMAP_PAGE	(GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN)
> +
>  static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
>  			      unsigned long end, struct vmemmap_rmap_walk *walk)
>  {
> @@ -200,6 +205,68 @@ void vmemmap_remap_reuse(unsigned long start, unsigned long size)
>  	free_vmemmap_page_list(&vmemmap_pages);
>  }
>  
> +static void vmemmap_remap_restore_pte(pte_t *pte, unsigned long addr,
> +				      struct vmemmap_rmap_walk *walk)
> +{
> +	pgprot_t pgprot = PAGE_KERNEL;
> +	struct page *page;
> +	void *to;
> +
> +	BUG_ON(pte_page(*pte) != walk->reuse);
> +
> +	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
> +	list_del(&page->lru);
> +	to = page_to_virt(page);
> +	copy_page(to, page_to_virt(walk->reuse));
> +
> +	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
> +}
> +
> +static void alloc_vmemmap_page_list(struct list_head *list,
> +				    unsigned long nr_pages)
> +{
> +	while (nr_pages--) {
> +		struct page *page;
> +
> +retry:
> +		page = alloc_page(GFP_VMEMMAP_PAGE);

Should we try (or require) the vmemmap page be on the same node as the
pages they describe?  I imagine performance would be impacted if a
struct page and the page it describes are on different numa nodes.

> +		if (unlikely(!page)) {
> +			msleep(100);
> +			/*
> +			 * We should retry infinitely, because we cannot
> +			 * handle allocation failures. Once we allocate
> +			 * vmemmap pages successfully, then we can free
> +			 * a HugeTLB page.
> +			 */
> +			goto retry;
> +		}
> +		list_add_tail(&page->lru, list);
> +	}
> +}
> +
Muchun Song Dec. 17, 2020, 3:22 a.m. UTC | #2
On Thu, Dec 17, 2020 at 9:17 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
> On 12/13/20 7:45 AM, Muchun Song wrote:
> > When we free a HugeTLB page to the buddy allocator, we should allocate the
> > vmemmap pages associated with it. We can do that in the __free_hugepage()
> > before freeing it to buddy.
>
> ...
>
> > diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
> > index 78c527617e8d..ffcf092c92ed 100644
> > --- a/mm/sparse-vmemmap.c
> > +++ b/mm/sparse-vmemmap.c
> > @@ -29,6 +29,7 @@
> >  #include <linux/sched.h>
> >  #include <linux/pgtable.h>
> >  #include <linux/bootmem_info.h>
> > +#include <linux/delay.h>
> >
> >  #include <asm/dma.h>
> >  #include <asm/pgalloc.h>
> > @@ -39,7 +40,8 @@
> >   *
> >   * @rmap_pte:                called for each non-empty PTE (lowest-level) entry.
> >   * @reuse:           the page which is reused for the tail vmemmap pages.
> > - * @vmemmap_pages:   the list head of the vmemmap pages that can be freed.
> > + * @vmemmap_pages:   the list head of the vmemmap pages that can be freed
> > + *                   or is mapped from.
> >   */
> >  struct vmemmap_rmap_walk {
> >       void (*rmap_pte)(pte_t *pte, unsigned long addr,
> > @@ -54,6 +56,9 @@ struct vmemmap_rmap_walk {
> >   */
> >  #define VMEMMAP_TAIL_PAGE_REUSE              -1
> >
> > +/* The gfp mask of allocating vmemmap page */
> > +#define GFP_VMEMMAP_PAGE     (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN)
> > +
> >  static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
> >                             unsigned long end, struct vmemmap_rmap_walk *walk)
> >  {
> > @@ -200,6 +205,68 @@ void vmemmap_remap_reuse(unsigned long start, unsigned long size)
> >       free_vmemmap_page_list(&vmemmap_pages);
> >  }
> >
> > +static void vmemmap_remap_restore_pte(pte_t *pte, unsigned long addr,
> > +                                   struct vmemmap_rmap_walk *walk)
> > +{
> > +     pgprot_t pgprot = PAGE_KERNEL;
> > +     struct page *page;
> > +     void *to;
> > +
> > +     BUG_ON(pte_page(*pte) != walk->reuse);
> > +
> > +     page = list_first_entry(walk->vmemmap_pages, struct page, lru);
> > +     list_del(&page->lru);
> > +     to = page_to_virt(page);
> > +     copy_page(to, page_to_virt(walk->reuse));
> > +
> > +     set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
> > +}
> > +
> > +static void alloc_vmemmap_page_list(struct list_head *list,
> > +                                 unsigned long nr_pages)
> > +{
> > +     while (nr_pages--) {
> > +             struct page *page;
> > +
> > +retry:
> > +             page = alloc_page(GFP_VMEMMAP_PAGE);
>
> Should we try (or require) the vmemmap page be on the same node as the
> pages they describe?  I imagine performance would be impacted if a
> struct page and the page it describes are on different numa nodes.

Yeah, it is a good idea. I also think that we should do this. I will do that in
the next version. Thanks.

>
> > +             if (unlikely(!page)) {
> > +                     msleep(100);
> > +                     /*
> > +                      * We should retry infinitely, because we cannot
> > +                      * handle allocation failures. Once we allocate
> > +                      * vmemmap pages successfully, then we can free
> > +                      * a HugeTLB page.
> > +                      */
> > +                     goto retry;
> > +             }
> > +             list_add_tail(&page->lru, list);
> > +     }
> > +}
> > +
>
> --
> Mike Kravetz
diff mbox series

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ab02e405a979..5b8dc36e4d20 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3006,6 +3006,7 @@  static inline void print_vma_addr(char *prefix, unsigned long rip)
 #endif
 
 void vmemmap_remap_reuse(unsigned long start, unsigned long size);
+void vmemmap_remap_restore(unsigned long start, unsigned long size);
 
 void *sparse_buffer_alloc(unsigned long size);
 struct page * __populate_section_memmap(unsigned long pfn,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0ff9b90e524f..542e6cb81321 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1362,6 +1362,8 @@  static void __free_hugepage(struct hstate *h, struct page *page)
 {
 	int i;
 
+	alloc_huge_page_vmemmap(h, page);
+
 	for (i = 0; i < pages_per_huge_page(h); i++) {
 		page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
 				1 << PG_referenced | 1 << PG_dirty |
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 6d4e77a2b6c7..02201c2e3dfa 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -185,6 +185,17 @@  static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h)
 	return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT;
 }
 
+void alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+	unsigned long vmemmap_addr = (unsigned long)head;
+
+	if (!free_vmemmap_pages_per_hpage(h))
+		return;
+
+	vmemmap_remap_restore(vmemmap_addr + RESERVE_VMEMMAP_SIZE,
+			      free_vmemmap_pages_size_per_hpage(h));
+}
+
 void free_huge_page_vmemmap(struct hstate *h, struct page *head)
 {
 	unsigned long vmemmap_addr = (unsigned long)head;
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index 01f8637adbe0..b2c8d2f11d48 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -11,6 +11,7 @@ 
 #include <linux/hugetlb.h>
 
 #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+void alloc_huge_page_vmemmap(struct hstate *h, struct page *head);
 void free_huge_page_vmemmap(struct hstate *h, struct page *head);
 
 /*
@@ -25,6 +26,10 @@  static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
 	return 0;
 }
 #else
+static inline void alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+}
+
 static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head)
 {
 }
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 78c527617e8d..ffcf092c92ed 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -29,6 +29,7 @@ 
 #include <linux/sched.h>
 #include <linux/pgtable.h>
 #include <linux/bootmem_info.h>
+#include <linux/delay.h>
 
 #include <asm/dma.h>
 #include <asm/pgalloc.h>
@@ -39,7 +40,8 @@ 
  *
  * @rmap_pte:		called for each non-empty PTE (lowest-level) entry.
  * @reuse:		the page which is reused for the tail vmemmap pages.
- * @vmemmap_pages:	the list head of the vmemmap pages that can be freed.
+ * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
+ *			or that are used for remapping.
  */
 struct vmemmap_rmap_walk {
 	void (*rmap_pte)(pte_t *pte, unsigned long addr,
@@ -54,6 +56,9 @@  struct vmemmap_rmap_walk {
  */
 #define VMEMMAP_TAIL_PAGE_REUSE		-1
 
+/* The gfp mask of allocating vmemmap page */
+#define GFP_VMEMMAP_PAGE	(GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN)
+
 static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
 			      unsigned long end, struct vmemmap_rmap_walk *walk)
 {
@@ -200,6 +205,68 @@  void vmemmap_remap_reuse(unsigned long start, unsigned long size)
 	free_vmemmap_page_list(&vmemmap_pages);
 }
 
+static void vmemmap_remap_restore_pte(pte_t *pte, unsigned long addr,
+				      struct vmemmap_rmap_walk *walk)
+{
+	pgprot_t pgprot = PAGE_KERNEL;
+	struct page *page;
+	void *to;
+
+	BUG_ON(pte_page(*pte) != walk->reuse);
+
+	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
+	list_del(&page->lru);
+	to = page_to_virt(page);
+	copy_page(to, page_to_virt(walk->reuse));
+
+	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
+}
+
+static void alloc_vmemmap_page_list(struct list_head *list,
+				    unsigned long nr_pages)
+{
+	while (nr_pages--) {
+		struct page *page;
+
+retry:
+		page = alloc_page(GFP_VMEMMAP_PAGE);
+		if (unlikely(!page)) {
+			msleep(100);
+			/*
+			 * We should retry infinitely, because we cannot
+			 * handle allocation failures. Once we allocate
+			 * vmemmap pages successfully, then we can free
+			 * a HugeTLB page.
+			 */
+			goto retry;
+		}
+		list_add_tail(&page->lru, list);
+	}
+}
+
+/**
+ * vmemmap_remap_restore - remap the vmemmap virtual address range
+ *                         [start, start + size) so that each page in it
+ *                         is backed by its own newly allocated page
+ * @start:	start address of the vmemmap virtual address range
+ * @size:	size of the vmemmap virtual address range
+ */
+void vmemmap_remap_restore(unsigned long start, unsigned long size)
+{
+	LIST_HEAD(vmemmap_pages);
+	unsigned long end = start + size;
+
+	struct vmemmap_rmap_walk walk = {
+		.rmap_pte	= vmemmap_remap_restore_pte,
+		.vmemmap_pages	= &vmemmap_pages,
+	};
+
+	might_sleep();
+
+	alloc_vmemmap_page_list(&vmemmap_pages, size >> PAGE_SHIFT);
+	vmemmap_remap_range(start, end, &walk);
+}
+
 /*
  * Allocate a block of memory to be used to back the virtual memory map
  * or to back the page tables that are used to create the mapping.