
[v6,01/15] mm: khugepaged: don't carry huge page to the next loop for !CONFIG_NUMA

Message ID 20220604004004.954674-2-zokeefe@google.com
State New
Series mm: userspace hugepage collapse

Commit Message

Zach O'Keefe June 4, 2022, 12:39 a.m. UTC
From: Yang Shi <shy828301@gmail.com>

khugepaged has an optimization that reduces huge page allocation calls
for !CONFIG_NUMA by carrying an allocated-but-failed-to-collapse huge
page over to the next scan loop.  CONFIG_NUMA doesn't do so, since the
next loop may try to collapse a huge page on a different node, so
carrying the page makes little sense there.

But when NUMA=n, the huge page is allocated by khugepaged_prealloc_page()
before scanning the address space, which means a huge page may be
allocated even though there is no suitable range to collapse.  The page
is then simply freed if khugepaged has already made enough progress.
This can make a NUMA=n run show five times as many thp_collapse_alloc
events as a NUMA=y run.  The flood of pointless THP allocations makes
things worse rather than better, defeating the purpose of the
optimization.

This could be fixed by carrying the huge page across scans, but that
would complicate the code further, and the huge page might be carried
indefinitely.  Taking a step back, the optimization itself no longer
seems worth keeping, since:
  * Few users build a NUMA=n kernel nowadays, even when the kernel
    actually runs on a non-NUMA machine.  Some small devices may run a
    NUMA=n kernel, but they are unlikely to use THP.
  * Since commit 44042b449872 ("mm/page_alloc: allow high-order pages to
    be stored on the per-cpu lists"), a THP can be cached on the per-cpu
    (pcp) lists, which largely does the job the optimization was meant
    to do (see the sketch below).
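
To illustrate the pcp point, a sketch follows (an editorial
illustration, not part of the patch) of the single allocation path both
configs share after this change: NUMA=n resolves
khugepaged_find_target_node() to node 0, and a recently freed THP may be
served straight from the per-cpu high-order cache, making it cheap to
drop and re-allocate:

static struct page *
khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
{
        /* May be satisfied from the pcp high-order list */
        *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
        if (unlikely(!*hpage)) {
                count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
                *hpage = ERR_PTR(-ENOMEM);
                return NULL;
        }
        prep_transhuge_page(*hpage);
        count_vm_event(THP_COLLAPSE_ALLOC);
        return *hpage;
}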

Cc: Hugh Dickins <hughd@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>

Signed-off-by: Zach O'Keefe <zokeefe@google.com>
---
 mm/khugepaged.c | 100 ++++++++----------------------------------------
 1 file changed, 17 insertions(+), 83 deletions(-)

Comments

Yang Shi June 6, 2022, 6:25 p.m. UTC | #1
On Fri, Jun 3, 2022 at 5:40 PM Zach O'Keefe <zokeefe@google.com> wrote:
>
> From: Yang Shi <shy828301@gmail.com>
>
> khugepaged has an optimization that reduces huge page allocation calls
> for !CONFIG_NUMA by carrying an allocated-but-failed-to-collapse huge
> page over to the next scan loop.  CONFIG_NUMA doesn't do so, since the
> next loop may try to collapse a huge page on a different node, so
> carrying the page makes little sense there.
>
> But when NUMA=n, the huge page is allocated by khugepaged_prealloc_page()
> before scanning the address space, which means a huge page may be
> allocated even though there is no suitable range to collapse.  The page
> is then simply freed if khugepaged has already made enough progress.
> This can make a NUMA=n run show five times as many thp_collapse_alloc
> events as a NUMA=y run.  The flood of pointless THP allocations makes
> things worse rather than better, defeating the purpose of the
> optimization.
>
> This could be fixed by carrying the huge page across scans, but that
> would complicate the code further, and the huge page might be carried
> indefinitely.  Taking a step back, the optimization itself no longer
> seems worth keeping, since:
>   * Few users build a NUMA=n kernel nowadays, even when the kernel
>     actually runs on a non-NUMA machine.  Some small devices may run a
>     NUMA=n kernel, but they are unlikely to use THP.
>   * Since commit 44042b449872 ("mm/page_alloc: allow high-order pages to
>     be stored on the per-cpu lists"), a THP can be cached on the per-cpu
>     (pcp) lists, which largely does the job the optimization was meant
>     to do.
>
> Cc: Hugh Dickins <hughd@google.com>
> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
>
> Signed-off-by: Zach O'Keefe <zokeefe@google.com>

Thanks for bringing the patch into the series. Feel free to add my SOB
to this patch.

> ---
>  mm/khugepaged.c | 100 ++++++++----------------------------------------
>  1 file changed, 17 insertions(+), 83 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 476d79360101..cc3d6fb446d5 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -833,29 +833,30 @@ static int khugepaged_find_target_node(void)
>         last_khugepaged_target_node = target_node;
>         return target_node;
>  }
> +#else
> +static int khugepaged_find_target_node(void)
> +{
> +       return 0;
> +}
> +#endif
>
> -static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
> +/* Sleep for the first alloc fail, break the loop for the second fail */
> +static bool alloc_fail_should_sleep(struct page **hpage, bool *wait)
>  {
>         if (IS_ERR(*hpage)) {
>                 if (!*wait)
> -                       return false;
> +                       return true;
>
>                 *wait = false;
>                 *hpage = NULL;
>                 khugepaged_alloc_sleep();
> -       } else if (*hpage) {
> -               put_page(*hpage);
> -               *hpage = NULL;
>         }
> -
> -       return true;
> +       return false;
>  }
>
>  static struct page *
>  khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
>  {
> -       VM_BUG_ON_PAGE(*hpage, *hpage);
> -
>         *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
>         if (unlikely(!*hpage)) {
>                 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
> @@ -867,74 +868,6 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
>         count_vm_event(THP_COLLAPSE_ALLOC);
>         return *hpage;
>  }
> -#else
> -static int khugepaged_find_target_node(void)
> -{
> -       return 0;
> -}
> -
> -static inline struct page *alloc_khugepaged_hugepage(void)
> -{
> -       struct page *page;
> -
> -       page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
> -                          HPAGE_PMD_ORDER);
> -       if (page)
> -               prep_transhuge_page(page);
> -       return page;
> -}
> -
> -static struct page *khugepaged_alloc_hugepage(bool *wait)
> -{
> -       struct page *hpage;
> -
> -       do {
> -               hpage = alloc_khugepaged_hugepage();
> -               if (!hpage) {
> -                       count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
> -                       if (!*wait)
> -                               return NULL;
> -
> -                       *wait = false;
> -                       khugepaged_alloc_sleep();
> -               } else
> -                       count_vm_event(THP_COLLAPSE_ALLOC);
> -       } while (unlikely(!hpage) && likely(khugepaged_enabled()));
> -
> -       return hpage;
> -}
> -
> -static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
> -{
> -       /*
> -        * If the hpage allocated earlier was briefly exposed in page cache
> -        * before collapse_file() failed, it is possible that racing lookups
> -        * have not yet completed, and would then be unpleasantly surprised by
> -        * finding the hpage reused for the same mapping at a different offset.
> -        * Just release the previous allocation if there is any danger of that.
> -        */
> -       if (*hpage && page_count(*hpage) > 1) {
> -               put_page(*hpage);
> -               *hpage = NULL;
> -       }
> -
> -       if (!*hpage)
> -               *hpage = khugepaged_alloc_hugepage(wait);
> -
> -       if (unlikely(!*hpage))
> -               return false;
> -
> -       return true;
> -}
> -
> -static struct page *
> -khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
> -{
> -       VM_BUG_ON(!*hpage);
> -
> -       return  *hpage;
> -}
> -#endif
>
>  /*
>   * If mmap_lock temporarily dropped, revalidate vma
> @@ -1188,8 +1121,10 @@ static void collapse_huge_page(struct mm_struct *mm,
>  out_up_write:
>         mmap_write_unlock(mm);
>  out_nolock:
> -       if (!IS_ERR_OR_NULL(*hpage))
> +       if (!IS_ERR_OR_NULL(*hpage)) {
>                 mem_cgroup_uncharge(page_folio(*hpage));
> +               put_page(*hpage);
> +       }
>         trace_mm_collapse_huge_page(mm, isolated, result);
>         return;
>  }
> @@ -1992,8 +1927,10 @@ static void collapse_file(struct mm_struct *mm,
>         unlock_page(new_page);
>  out:
>         VM_BUG_ON(!list_empty(&pagelist));
> -       if (!IS_ERR_OR_NULL(*hpage))
> +       if (!IS_ERR_OR_NULL(*hpage)) {
>                 mem_cgroup_uncharge(page_folio(*hpage));
> +               put_page(*hpage);
> +       }
>         /* TODO: tracepoints */
>  }
>
> @@ -2243,7 +2180,7 @@ static void khugepaged_do_scan(void)
>         lru_add_drain_all();
>
>         while (progress < pages) {
> -               if (!khugepaged_prealloc_page(&hpage, &wait))
> +               if (alloc_fail_should_sleep(&hpage, &wait))
>                         break;
>
>                 cond_resched();
> @@ -2262,9 +2199,6 @@ static void khugepaged_do_scan(void)
>                         progress = pages;
>                 spin_unlock(&khugepaged_mm_lock);
>         }
> -
> -       if (!IS_ERR_OR_NULL(hpage))
> -               put_page(hpage);
>  }
>
>  static bool khugepaged_should_wakeup(void)
> --
> 2.36.1.255.ge46751e96f-goog
>
Peter Xu June 29, 2022, 8:49 p.m. UTC | #2
On Fri, Jun 03, 2022 at 05:39:50PM -0700, Zach O'Keefe wrote:
> -static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
> +/* Sleep for the first alloc fail, break the loop for the second fail */
> +static bool alloc_fail_should_sleep(struct page **hpage, bool *wait)
>  {
>  	if (IS_ERR(*hpage)) {
>  		if (!*wait)
> -			return false;
> +			return true;
>  
>  		*wait = false;
>  		*hpage = NULL;
>  		khugepaged_alloc_sleep();
> -	} else if (*hpage) {
> -		put_page(*hpage);
> -		*hpage = NULL;
>  	}
> -
> -	return true;
> +	return false;
>  }

One nitpick here:

It's weird to me to sleep inside a function called XXX_should_sleep();
we'd normally expect the sleep to happen only after it returns true.

Meanwhile, wouldn't this be a good chance to unwrap this function and
get rid of the "bool *" reference, which isn't pretty?  Something like:

---8<---
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 16be62d493cd..807c10cd0816 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2235,9 +2235,6 @@ static void khugepaged_do_scan(void)
        lru_add_drain_all();
 
        while (progress < pages) {
-               if (!khugepaged_prealloc_page(&hpage, &wait))
-                       break;
-
                cond_resched();
 
                if (unlikely(kthread_should_stop() || try_to_freeze()))
@@ -2253,6 +2250,18 @@ static void khugepaged_do_scan(void)
                else
                        progress = pages;
                spin_unlock(&khugepaged_mm_lock);
+
+               if (IS_ERR(hpage)) {
+                       /*
+                        * If allocation failed the first time, sleep for
+                        * a while.  If it fails again, cancel the scan.
+                        */
+                       if (!wait)
+                               break;
+                       wait = false;
+                       hpage = NULL;
+                       khugepaged_alloc_sleep();
+               }
        }
---8<---

Would this look slightly better?

Thanks,
Zach O'Keefe June 30, 2022, 1:15 a.m. UTC | #3
On Jun 29 16:49, Peter Xu wrote:
> On Fri, Jun 03, 2022 at 05:39:50PM -0700, Zach O'Keefe wrote:
> > -static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
> > +/* Sleep for the first alloc fail, break the loop for the second fail */
> > +static bool alloc_fail_should_sleep(struct page **hpage, bool *wait)
> >  {
> >  	if (IS_ERR(*hpage)) {
> >  		if (!*wait)
> > -			return false;
> > +			return true;
> >  
> >  		*wait = false;
> >  		*hpage = NULL;
> >  		khugepaged_alloc_sleep();
> > -	} else if (*hpage) {
> > -		put_page(*hpage);
> > -		*hpage = NULL;
> >  	}
> > -
> > -	return true;
> > +	return false;
> >  }
> 
> One nitpick here:
> 
> It's weird to me to sleep inside a function called XXX_should_sleep();
> we'd normally expect the sleep to happen only after it returns true.
> 
> Meanwhile, wouldn't this be a good chance to unwrap this function and
> get rid of the "bool *" reference, which isn't pretty?  Something like:
> 
> ---8<---
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 16be62d493cd..807c10cd0816 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -2235,9 +2235,6 @@ static void khugepaged_do_scan(void)
>         lru_add_drain_all();
>  
>         while (progress < pages) {
> -               if (!khugepaged_prealloc_page(&hpage, &wait))
> -                       break;
> -
>                 cond_resched();
>  
>                 if (unlikely(kthread_should_stop() || try_to_freeze()))
> @@ -2253,6 +2250,18 @@ static void khugepaged_do_scan(void)
>                 else
>                         progress = pages;
>                 spin_unlock(&khugepaged_mm_lock);
> +
> +               if (IS_ERR(hpage)) {
> +                       /*
> +                        * If allocation failed the first time, sleep for
> +                        * a while.  If it fails again, cancel the scan.
> +                        */
> +                       if (!wait)
> +                               break;
> +                       wait = false;
> +                       hpage = NULL;
> +                       khugepaged_alloc_sleep();
> +               }
>         }
> ---8<---
> 
> Would this look slightly better?

Hey Peter,

Thanks for taking the time to review. I think open coding this looks
good. One small detail: if we move this to the end of the loop, we'll
still need to check that progress < pages before sleeping; otherwise we
risk doing both an alloc sleep and a scan sleep.
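
Concretely, something like this (an untested sketch on top of your
snippet, with the guard added):

                if (IS_ERR(hpage)) {
                        if (!wait)
                                break;
                        wait = false;
                        hpage = NULL;
                        /*
                         * Only alloc-sleep if the scan isn't finished;
                         * otherwise the scan sleep that follows loop
                         * exit would stack on top of it.
                         */
                        if (progress < pages)
                                khugepaged_alloc_sleep();
                }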

But I'll let Yang make the call since it's his patch - he's just been kind
enough to donate it for this cause :)

Thanks,
Zach

> Thanks,
> 
> -- 
> Peter Xu
>

Patch

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 476d79360101..cc3d6fb446d5 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -833,29 +833,30 @@  static int khugepaged_find_target_node(void)
 	last_khugepaged_target_node = target_node;
 	return target_node;
 }
+#else
+static int khugepaged_find_target_node(void)
+{
+	return 0;
+}
+#endif
 
-static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+/* Sleep for the first alloc fail, break the loop for the second fail */
+static bool alloc_fail_should_sleep(struct page **hpage, bool *wait)
 {
 	if (IS_ERR(*hpage)) {
 		if (!*wait)
-			return false;
+			return true;
 
 		*wait = false;
 		*hpage = NULL;
 		khugepaged_alloc_sleep();
-	} else if (*hpage) {
-		put_page(*hpage);
-		*hpage = NULL;
 	}
-
-	return true;
+	return false;
 }
 
 static struct page *
 khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
 {
-	VM_BUG_ON_PAGE(*hpage, *hpage);
-
 	*hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
 	if (unlikely(!*hpage)) {
 		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
@@ -867,74 +868,6 @@  khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
 	count_vm_event(THP_COLLAPSE_ALLOC);
 	return *hpage;
 }
-#else
-static int khugepaged_find_target_node(void)
-{
-	return 0;
-}
-
-static inline struct page *alloc_khugepaged_hugepage(void)
-{
-	struct page *page;
-
-	page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
-			   HPAGE_PMD_ORDER);
-	if (page)
-		prep_transhuge_page(page);
-	return page;
-}
-
-static struct page *khugepaged_alloc_hugepage(bool *wait)
-{
-	struct page *hpage;
-
-	do {
-		hpage = alloc_khugepaged_hugepage();
-		if (!hpage) {
-			count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
-			if (!*wait)
-				return NULL;
-
-			*wait = false;
-			khugepaged_alloc_sleep();
-		} else
-			count_vm_event(THP_COLLAPSE_ALLOC);
-	} while (unlikely(!hpage) && likely(khugepaged_enabled()));
-
-	return hpage;
-}
-
-static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
-{
-	/*
-	 * If the hpage allocated earlier was briefly exposed in page cache
-	 * before collapse_file() failed, it is possible that racing lookups
-	 * have not yet completed, and would then be unpleasantly surprised by
-	 * finding the hpage reused for the same mapping at a different offset.
-	 * Just release the previous allocation if there is any danger of that.
-	 */
-	if (*hpage && page_count(*hpage) > 1) {
-		put_page(*hpage);
-		*hpage = NULL;
-	}
-
-	if (!*hpage)
-		*hpage = khugepaged_alloc_hugepage(wait);
-
-	if (unlikely(!*hpage))
-		return false;
-
-	return true;
-}
-
-static struct page *
-khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
-{
-	VM_BUG_ON(!*hpage);
-
-	return  *hpage;
-}
-#endif
 
 /*
  * If mmap_lock temporarily dropped, revalidate vma
@@ -1188,8 +1121,10 @@  static void collapse_huge_page(struct mm_struct *mm,
 out_up_write:
 	mmap_write_unlock(mm);
 out_nolock:
-	if (!IS_ERR_OR_NULL(*hpage))
+	if (!IS_ERR_OR_NULL(*hpage)) {
 		mem_cgroup_uncharge(page_folio(*hpage));
+		put_page(*hpage);
+	}
 	trace_mm_collapse_huge_page(mm, isolated, result);
 	return;
 }
@@ -1992,8 +1927,10 @@  static void collapse_file(struct mm_struct *mm,
 	unlock_page(new_page);
 out:
 	VM_BUG_ON(!list_empty(&pagelist));
-	if (!IS_ERR_OR_NULL(*hpage))
+	if (!IS_ERR_OR_NULL(*hpage)) {
 		mem_cgroup_uncharge(page_folio(*hpage));
+		put_page(*hpage);
+	}
 	/* TODO: tracepoints */
 }
 
@@ -2243,7 +2180,7 @@  static void khugepaged_do_scan(void)
 	lru_add_drain_all();
 
 	while (progress < pages) {
-		if (!khugepaged_prealloc_page(&hpage, &wait))
+		if (alloc_fail_should_sleep(&hpage, &wait))
 			break;
 
 		cond_resched();
@@ -2262,9 +2199,6 @@  static void khugepaged_do_scan(void)
 			progress = pages;
 		spin_unlock(&khugepaged_mm_lock);
 	}
-
-	if (!IS_ERR_OR_NULL(hpage))
-		put_page(hpage);
 }
 
 static bool khugepaged_should_wakeup(void)