
[v8,06/12] mm/hugetlb: Allocate the vmemmap pages associated with each HugeTLB page

Message ID 20201210035526.38938-7-songmuchun@bytedance.com (mailing list archive)
State New, archived
Series: Free some vmemmap pages of HugeTLB page

Commit Message

Muchun Song Dec. 10, 2020, 3:55 a.m. UTC
When we free a HugeTLB page to the buddy allocator, we should allocate the
vmemmap pages associated with it. We can do that in the __free_hugepage()
path, before the page is handed back to the buddy allocator.
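
For scale, a back-of-the-envelope count (assuming x86_64 with 4K base pages
and a 64-byte struct page):

  2MB HugeTLB page:    512 struct pages * 64B = 32KB =    8 vmemmap pages,
                       2 of them kept reserved, so 6 pages to reallocate here
  1GB HugeTLB page: 262144 struct pages * 64B = 16MB = 4096 vmemmap pages,
                       so on the order of 4k pages to reallocate here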

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 mm/hugetlb.c         |  2 ++
 mm/hugetlb_vmemmap.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++---
 mm/hugetlb_vmemmap.h |  5 +++
 3 files changed, 91 insertions(+), 5 deletions(-)

Comments

Oscar Salvador Dec. 11, 2020, 9:35 a.m. UTC | #1
On Thu, Dec 10, 2020 at 11:55:20AM +0800, Muchun Song wrote:
> When we free a HugeTLB page to the buddy allocator, we should allocate the
> vmemmap pages associated with it. We can do that in the __free_hugepage()
"vmemmap pages that describe the range" would look better to me, but it is ok.

> +#define GFP_VMEMMAP_PAGE		\
> +	(GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_HIGH | __GFP_NOWARN)
>  
>  #ifndef VMEMMAP_HPAGE_SHIFT
>  #define VMEMMAP_HPAGE_SHIFT		HPAGE_SHIFT
> @@ -197,6 +200,11 @@
>  	(__boundary - 1 < (end) - 1) ? __boundary : (end);		 \
>  })
>  
> +typedef void (*vmemmap_remap_pte_func_t)(struct page *reuse, pte_t *pte,
> +					 unsigned long start, unsigned long end,
> +					 void *priv);

Any reason not to define GFP_VMEMMAP_PAGE and the new typedef in
hugetlb_vmemmap.h?

  
> +static void vmemmap_restore_pte_range(struct page *reuse, pte_t *pte,
> +				      unsigned long start, unsigned long end,
> +				      void *priv)
> +{
> +	pgprot_t pgprot = PAGE_KERNEL;
> +	void *from = page_to_virt(reuse);
> +	unsigned long addr;
> +	struct list_head *pages = priv;
[...]
> +
> +		/*
> +		 * Make sure that any data written to @to is made visible
> +		 * to the physical page.
> +		 */
> +		flush_kernel_vmap_range(to, PAGE_SIZE);

Correct me if I am wrong, but flush_kernel_vmap_range is a NOOP on arches which
do not have ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE.
Since we only enable support for x86_64, and x86_64 is one of those arches,
could we remove it and introduce it later on, in case we enable this feature
on an arch that needs it?

I am not sure if you need to flush the range somehow, as you did in
vmemmap_remap_range.
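
For reference, the generic fallback in include/linux/highmem.h is roughly the
following, so on such arches the call compiles away entirely:

#ifndef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
static inline void flush_kernel_vmap_range(void *vaddr, int size)
{
}
static inline void invalidate_kernel_vmap_range(void *vaddr, int size)
{
}
#endif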

> +retry:
> +		page = alloc_page(GFP_VMEMMAP_PAGE);
> +		if (unlikely(!page)) {
> +			msleep(100);
> +			/*
> +			 * We should retry infinitely, because we cannot
> +			 * handle allocation failures. Once we allocate
> +			 * vmemmap pages successfully, then we can free
> +			 * a HugeTLB page.
> +			 */
> +			goto retry;

I think this is the trickiest part.
With 2MB HugeTLB pages we only need 6 pages, but with 1GB, the number of pages
we need to allocate increases significantly (4088 pages IIRC).
And you are using __GFP_HIGH, which will allow us to use more memory (by
cutting down the watermark), but it might end up bringing the system
to its knees wrt. memory.
And yes, I know that once we allocate the 4088 pages, 1GB gets freed, but
still.

I would like to hear Michal's thoughts on this one, but I wonder if it makes
sense to not let 1GB-HugeTLB pages be freed.
David Hildenbrand Dec. 11, 2020, 10:52 a.m. UTC | #2
> On 11.12.2020 at 10:35, Oscar Salvador <osalvador@suse.de> wrote:
> 
> On Thu, Dec 10, 2020 at 11:55:20AM +0800, Muchun Song wrote:
>> When we free a HugeTLB page to the buddy allocator, we should allocate the
>> vmemmap pages associated with it. We can do that in the __free_hugepage()
> "vmemmap pages that describe the range" would look better to me, but it is ok.
> 
>> +#define GFP_VMEMMAP_PAGE        \
>> +    (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_HIGH | __GFP_NOWARN)
>> 
>> #ifndef VMEMMAP_HPAGE_SHIFT
>> #define VMEMMAP_HPAGE_SHIFT        HPAGE_SHIFT
>> @@ -197,6 +200,11 @@
>>    (__boundary - 1 < (end) - 1) ? __boundary : (end);         \
>> })
>> 
>> +typedef void (*vmemmap_remap_pte_func_t)(struct page *reuse, pte_t *pte,
>> +                     unsigned long start, unsigned long end,
>> +                     void *priv);
> 
> Any reason not to define GFP_VMEMMAP_PAGE and the new typedef in
> hugetlb_vmemmap.h?
> 
> 
>> +static void vmemmap_restore_pte_range(struct page *reuse, pte_t *pte,
>> +                      unsigned long start, unsigned long end,
>> +                      void *priv)
>> +{
>> +    pgprot_t pgprot = PAGE_KERNEL;
>> +    void *from = page_to_virt(reuse);
>> +    unsigned long addr;
>> +    struct list_head *pages = priv;
> [...]
>> +
>> +        /*
>> +         * Make sure that any data written to @to is made visible
>> +         * to the physical page.
>> +         */
>> +        flush_kernel_vmap_range(to, PAGE_SIZE);
> 
> Correct me if I am wrong, but flush_kernel_vmap_range is a NOOP on arches which
> do not have ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE.
> Since we only enable support for x86_64, and x86_64 is one of those arches,
> could we remove it and introduce it later on, in case we enable this feature
> on an arch that needs it?
> 
> I am not sure if you need to flush the range somehow, as you did in
> vmemmap_remap_range.
> 
>> +retry:
>> +        page = alloc_page(GFP_VMEMMAP_PAGE);
>> +        if (unlikely(!page)) {
>> +            msleep(100);
>> +            /*
>> +             * We should retry infinitely, because we cannot
>> +             * handle allocation failures. Once we allocate
>> +             * vmemmap pages successfully, then we can free
>> +             * a HugeTLB page.
>> +             */
>> +            goto retry;
> 
> I think this is the trickiest part.
> With 2MB HugeTLB pages we only need 6 pages, but with 1GB, the number of pages
> we need to allocate increases significantly (4088 pages IIRC).
> And you are using __GFP_HIGH, which will allow us to use more memory (by
> cutting down the watermark), but it might end up bringing the system
> to its knees wrt. memory.
> And yes, I know that once we allocate the 4088 pages, 1GB gets freed, but
> still.

Similar to memory hotplug, no? I don't think this is really an issue that cannot be mitigated. Yeah, we might want to tweak allocation flags.

> 
> I would like to hear Michal's thoughts on this one, but I wonder if it makes
> sense to not let 1GB-HugeTLB pages be freed.
> 
> -- 
> Oscar Salvador
> SUSE L3
>
Muchun Song Dec. 11, 2020, 1:01 p.m. UTC | #3
On Fri, Dec 11, 2020 at 5:35 PM Oscar Salvador <osalvador@suse.de> wrote:
>
> On Thu, Dec 10, 2020 at 11:55:20AM +0800, Muchun Song wrote:
> > When we free a HugeTLB page to the buddy allocator, we should allocate the
> > vmemmap pages associated with it. We can do that in the __free_hugepage()
> "vmemmap pages that describe the range" would look better to me, but it is ok.

Thanks.

>
> > +#define GFP_VMEMMAP_PAGE             \
> > +     (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_HIGH | __GFP_NOWARN)
> >
> >  #ifndef VMEMMAP_HPAGE_SHIFT
> >  #define VMEMMAP_HPAGE_SHIFT          HPAGE_SHIFT
> > @@ -197,6 +200,11 @@
> >       (__boundary - 1 < (end) - 1) ? __boundary : (end);               \
> >  })
> >
> > +typedef void (*vmemmap_remap_pte_func_t)(struct page *reuse, pte_t *pte,
> > +                                      unsigned long start, unsigned long end,
> > +                                      void *priv);
>
> Any reason not to define GFP_VMEMMAP_PAGE and the new typedef in
> hugetlb_vmemmap.h?

Because they are only used within hugetlb_vmemmap.c.

>
>
> > +static void vmemmap_restore_pte_range(struct page *reuse, pte_t *pte,
> > +                                   unsigned long start, unsigned long end,
> > +                                   void *priv)
> > +{
> > +     pgprot_t pgprot = PAGE_KERNEL;
> > +     void *from = page_to_virt(reuse);
> > +     unsigned long addr;
> > +     struct list_head *pages = priv;
> [...]
> > +
> > +             /*
> > +              * Make sure that any data written to @to is made visible
> > +              * to the physical page.
> > +              */
> > +             flush_kernel_vmap_range(to, PAGE_SIZE);
>
> Correct me if I am wrong, but flush_kernel_vmap_range is a NOOP on arches which
> do not have ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE.
> Since we only enable support for x86_64, and x86_64 is one of those arches,
> could we remove it and introduce it later on, in case we enable this feature
> on an arch that needs it?

OK. Will remove.

>
> I am not sure if you need to flush the range somehow, as you did in
> vmemmap_remap_range.
>
> > +retry:
> > +             page = alloc_page(GFP_VMEMMAP_PAGE);
> > +             if (unlikely(!page)) {
> > +                     msleep(100);
> > +                     /*
> > +                      * We should retry infinitely, because we cannot
> > +                      * handle allocation failures. Once we allocate
> > +                      * vmemmap pages successfully, then we can free
> > +                      * a HugeTLB page.
> > +                      */
> > +                     goto retry;
>
> I think this is the trickiest part.
> With 2MB HugeTLB pages we only need 6 pages, but with 1GB, the number of pages
> we need to allocate increases significantly (4088 pages IIRC).
> And you are using __GFP_HIGH, which will allow us to use more memory (by
> cutting down the watermark), but it might end up bringing the system
> to its knees wrt. memory.
> And yes, I know that once we allocate the 4088 pages, 1GB gets freed, but
> still.

Yeah, it is a problem. How about removing __GFP_HIGH only for
1GB HugeTLB pages?
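
A minimal sketch of that idea (vmemmap_gfp_mask() is a hypothetical helper,
not part of this series; hstate_is_gigantic() comes from <linux/hugetlb.h>):

static inline gfp_t vmemmap_gfp_mask(struct hstate *h)
{
	gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;

	/* Do not dip into the watermark reserves for gigantic pages. */
	if (hstate_is_gigantic(h))
		return gfp;

	return gfp | __GFP_HIGH;
}

Then alloc_vmemmap_pages() would call alloc_page(vmemmap_gfp_mask(h))
instead of using the fixed GFP_VMEMMAP_PAGE mask.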

>
> I would like to hear Michal's thoughts on this one, but I wonder if it makes
> sense to not let 1GB-HugeTLB pages be freed.
>
> --
> Oscar Salvador
> SUSE L3

Patch

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0ff9b90e524f..542e6cb81321 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1362,6 +1362,8 @@  static void __free_hugepage(struct hstate *h, struct page *page)
 {
 	int i;
 
+	alloc_huge_page_vmemmap(h, page);
+
 	for (i = 0; i < pages_per_huge_page(h); i++) {
 		page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
 				1 << PG_referenced | 1 << PG_dirty |
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index d080488cde16..4587a0062808 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -169,6 +169,7 @@ 
 #define pr_fmt(fmt)	"HugeTLB vmemmap: " fmt
 
 #include <linux/bootmem_info.h>
+#include <linux/delay.h>
 #include "hugetlb_vmemmap.h"
 
 /*
@@ -181,6 +182,8 @@ 
 #define RESERVE_VMEMMAP_NR		2U
 #define RESERVE_VMEMMAP_SIZE		(RESERVE_VMEMMAP_NR << PAGE_SHIFT)
 #define VMEMMAP_TAIL_PAGE_REUSE		-1
+#define GFP_VMEMMAP_PAGE		\
+	(GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_HIGH | __GFP_NOWARN)
 
 #ifndef VMEMMAP_HPAGE_SHIFT
 #define VMEMMAP_HPAGE_SHIFT		HPAGE_SHIFT
@@ -197,6 +200,11 @@ 
 	(__boundary - 1 < (end) - 1) ? __boundary : (end);		 \
 })
 
+typedef void (*vmemmap_remap_pte_func_t)(struct page *reuse, pte_t *pte,
+					 unsigned long start, unsigned long end,
+					 void *priv);
+
+
 static inline unsigned int vmemmap_pages_per_hpage(struct hstate *h)
 {
 	return free_vmemmap_pages_per_hpage(h) + RESERVE_VMEMMAP_NR;
@@ -236,9 +244,39 @@  static pmd_t *vmemmap_to_pmd(unsigned long addr)
 	return pmd;
 }
 
+static void vmemmap_restore_pte_range(struct page *reuse, pte_t *pte,
+				      unsigned long start, unsigned long end,
+				      void *priv)
+{
+	pgprot_t pgprot = PAGE_KERNEL;
+	void *from = page_to_virt(reuse);
+	unsigned long addr;
+	struct list_head *pages = priv;
+
+	for (addr = start; addr < end; addr += PAGE_SIZE) {
+		void *to;
+		struct page *page;
+
+		VM_BUG_ON(pte_none(*pte) || pte_page(*pte) != reuse);
+
+		page = list_first_entry(pages, struct page, lru);
+		list_del(&page->lru);
+		to = page_to_virt(page);
+		copy_page(to, from);
+
+		/*
+		 * Make sure that any data written to @to is made visible
+		 * to the physical page.
+		 */
+		flush_kernel_vmap_range(to, PAGE_SIZE);
+
+		set_pte_at(&init_mm, addr, pte++, mk_pte(page, pgprot));
+	}
+}
+
 static void vmemmap_reuse_pte_range(struct page *reuse, pte_t *pte,
 				    unsigned long start, unsigned long end,
-				    struct list_head *vmemmap_pages)
+				    void *priv)
 {
 	/*
 	 * Make the tail pages are mapped with read-only to catch
@@ -247,6 +285,7 @@  static void vmemmap_reuse_pte_range(struct page *reuse, pte_t *pte,
 	pgprot_t pgprot = PAGE_KERNEL_RO;
 	pte_t entry = mk_pte(reuse, pgprot);
 	unsigned long addr;
+	struct list_head *pages = priv;
 
 	for (addr = start; addr < end; addr += PAGE_SIZE, pte++) {
 		struct page *page;
@@ -254,14 +293,14 @@  static void vmemmap_reuse_pte_range(struct page *reuse, pte_t *pte,
 		VM_BUG_ON(pte_none(*pte));
 
 		page = pte_page(*pte);
-		list_add(&page->lru, vmemmap_pages);
+		list_add(&page->lru, pages);
 
 		set_pte_at(&init_mm, addr, pte, entry);
 	}
 }
 
 static void vmemmap_remap_range(unsigned long start, unsigned long end,
-				struct list_head *vmemmap_pages)
+				vmemmap_remap_pte_func_t func, void *priv)
 {
 	pmd_t *pmd;
 	unsigned long next, addr = start;
@@ -281,12 +320,52 @@  static void vmemmap_remap_range(unsigned long start, unsigned long end,
 			reuse = pte_page(pte[VMEMMAP_TAIL_PAGE_REUSE]);
 
 		next = vmemmap_hpage_addr_end(addr, end);
-		vmemmap_reuse_pte_range(reuse, pte, addr, next, vmemmap_pages);
+		func(reuse, pte, addr, next, priv);
 	} while (pmd++, addr = next, addr != end);
 
 	flush_tlb_kernel_range(start, end);
 }
 
+static inline void alloc_vmemmap_pages(struct hstate *h, struct list_head *list)
+{
+	unsigned int nr = free_vmemmap_pages_per_hpage(h);
+
+	while (nr--) {
+		struct page *page;
+
+retry:
+		page = alloc_page(GFP_VMEMMAP_PAGE);
+		if (unlikely(!page)) {
+			msleep(100);
+			/*
+			 * We should retry infinitely, because we cannot
+			 * handle allocation failures. Once we allocate
+			 * vmemmap pages successfully, then we can free
+			 * a HugeTLB page.
+			 */
+			goto retry;
+		}
+		list_add_tail(&page->lru, list);
+	}
+}
+
+void alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+	unsigned long start, end;
+	unsigned long vmemmap_addr = (unsigned long)head;
+	LIST_HEAD(vmemmap_pages);
+
+	if (!free_vmemmap_pages_per_hpage(h))
+		return;
+
+	alloc_vmemmap_pages(h, &vmemmap_pages);
+
+	start = vmemmap_addr + RESERVE_VMEMMAP_SIZE;
+	end = vmemmap_addr + vmemmap_pages_size_per_hpage(h);
+	vmemmap_remap_range(start, end, vmemmap_restore_pte_range,
+			    &vmemmap_pages);
+}
+
 /*
  * Free a vmemmap page. A vmemmap page can be allocated from the memblock
  * allocator or buddy allocator. If the PG_reserved flag is set, it means
@@ -322,7 +401,7 @@  void free_huge_page_vmemmap(struct hstate *h, struct page *head)
 
 	start = vmemmap_addr + RESERVE_VMEMMAP_SIZE;
 	end = vmemmap_addr + vmemmap_pages_size_per_hpage(h);
-	vmemmap_remap_range(start, end, &vmemmap_pages);
+	vmemmap_remap_range(start, end, vmemmap_reuse_pte_range, &vmemmap_pages);
 
 	free_vmemmap_page_list(&vmemmap_pages);
 }
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index bf22cd003acb..8fd57c49e230 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -11,6 +11,7 @@ 
 #include <linux/hugetlb.h>
 
 #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+void alloc_huge_page_vmemmap(struct hstate *h, struct page *head);
 void free_huge_page_vmemmap(struct hstate *h, struct page *head);
 
 /*
@@ -25,6 +26,10 @@  static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
 	return 0;
 }
 #else
+static inline void alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+}
+
 static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head)
 {
 }
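
For orientation, the two paths now pair up as follows (call-flow sketch of
the code above, not literal source):

free_huge_page_vmemmap(h, head)          /* page becomes a HugeTLB page */
    vmemmap_remap_range(start, end, vmemmap_reuse_pte_range, &vmemmap_pages)
                                         /* remap tail vmemmap read-only to
                                            the reuse page, collect old pages */
    free_vmemmap_page_list(&vmemmap_pages)

__free_hugepage(h, page)                 /* HugeTLB page goes back to buddy */
    alloc_huge_page_vmemmap(h, page)
        alloc_vmemmap_pages(h, &vmemmap_pages)
                                         /* retries until every page needed
                                            has been allocated */
        vmemmap_remap_range(start, end, vmemmap_restore_pte_range, &vmemmap_pages)
                                         /* copy the reuse page into each new
                                            page and remap it read-write */
    ... then clear per-page flags and free to the buddy allocator ...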