diff mbox series

[v6,06/15] mm/khugepaged: pipe enum scan_result codes back to callers

Message ID 20220604004004.954674-7-zokeefe@google.com (mailing list archive)
State New
Headers show
Series mm: userspace hugepage collapse | expand

Commit Message

Zach O'Keefe June 4, 2022, 12:39 a.m. UTC
Pipe enum scan_result codes back through return values of functions
downstream of khugepaged_scan_file() and khugepaged_scan_pmd() to
inform callers if the operation was successful, and if not, why.

Since khugepaged_scan_pmd()'s return value already has a specific
meaning (whether mmap_lock was unlocked or not), add a bool* argument
to khugepaged_scan_pmd() to retrieve this information.

Change khugepaged to take action based on the return values of
khugepaged_scan_file() and khugepaged_scan_pmd() instead of acting
deep within the collapsing functions themselves.

Remove dependency on error pointers to communicate to khugepaged that
allocation failed and it should sleep; instead just use the result of
the scan (SCAN_ALLOC_HUGE_PAGE_FAIL if allocation fails).

Signed-off-by: Zach O'Keefe <zokeefe@google.com>
---
 mm/khugepaged.c | 192 ++++++++++++++++++++++++------------------------
 1 file changed, 96 insertions(+), 96 deletions(-)

Comments

Yang Shi June 6, 2022, 10:39 p.m. UTC | #1
On Fri, Jun 3, 2022 at 5:40 PM Zach O'Keefe <zokeefe@google.com> wrote:
>
> Pipe enum scan_result codes back through return values of functions
> downstream of khugepaged_scan_file() and khugepaged_scan_pmd() to
> inform callers if the operation was successful, and if not, why.
>
> Since khugepaged_scan_pmd()'s return value already has a specific
> meaning (whether mmap_lock was unlocked or not), add a bool* argument
> to khugepaged_scan_pmd() to retrieve this information.
>
> Change khugepaged to take action based on the return values of
> khugepaged_scan_file() and khugepaged_scan_pmd() instead of acting
> deep within the collapsing functions themselves.
>
> Remove dependency on error pointers to communicate to khugepaged that
> allocation failed and it should sleep; instead just use the result of
> the scan (SCAN_ALLOC_HUGE_PAGE_FAIL if allocation fails).
>
> Signed-off-by: Zach O'Keefe <zokeefe@google.com>

Reviewed-by: Yang Shi <shy828301@gmail.com>

A couple of minor nits below...

> ---
>  mm/khugepaged.c | 192 ++++++++++++++++++++++++------------------------
>  1 file changed, 96 insertions(+), 96 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index ba722347bebd..03e0da0008f1 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -722,13 +722,13 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>                 result = SCAN_SUCCEED;
>                 trace_mm_collapse_huge_page_isolate(page, none_or_zero,
>                                                     referenced, writable, result);
> -               return 1;
> +               return SCAN_SUCCEED;

You could do "return result" too.

>         }
>  out:
>         release_pte_pages(pte, _pte, compound_pagelist);
>         trace_mm_collapse_huge_page_isolate(page, none_or_zero,
>                                             referenced, writable, result);
> -       return 0;
> +       return result;
>  }
>
>  static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
> @@ -850,14 +850,13 @@ static int khugepaged_find_target_node(struct collapse_control *cc)
>  #endif
>
>  /* Sleep for the first alloc fail, break the loop for the second fail */
> -static bool alloc_fail_should_sleep(struct page **hpage, bool *wait)
> +static bool alloc_fail_should_sleep(int result, bool *wait)
>  {
> -       if (IS_ERR(*hpage)) {
> +       if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) {
>                 if (!*wait)
>                         return true;
>
>                 *wait = false;
> -               *hpage = NULL;
>                 khugepaged_alloc_sleep();
>         }
>         return false;
> @@ -868,7 +867,6 @@ static bool khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
>         *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
>         if (unlikely(!*hpage)) {
>                 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
> -               *hpage = ERR_PTR(-ENOMEM);
>                 return false;
>         }
>
> @@ -1010,17 +1008,17 @@ static int alloc_charge_hpage(struct mm_struct *mm, struct page **hpage,
>         return SCAN_SUCCEED;
>  }
>
> -static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
> -                              struct page **hpage, int referenced,
> -                              int unmapped, struct collapse_control *cc)
> +static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> +                             int referenced, int unmapped,
> +                             struct collapse_control *cc)
>  {
>         LIST_HEAD(compound_pagelist);
>         pmd_t *pmd, _pmd;
>         pte_t *pte;
>         pgtable_t pgtable;
> -       struct page *new_page;
> +       struct page *hpage;
>         spinlock_t *pmd_ptl, *pte_ptl;
> -       int isolated = 0, result = 0;
> +       int result = SCAN_FAIL;
>         struct vm_area_struct *vma;
>         struct mmu_notifier_range range;
>
> @@ -1034,12 +1032,10 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
>          */
>         mmap_read_unlock(mm);
>
> -       result = alloc_charge_hpage(mm, hpage, cc);
> +       result = alloc_charge_hpage(mm, &hpage, cc);
>         if (result != SCAN_SUCCEED)
>                 goto out_nolock;
>
> -       new_page = *hpage;
> -
>         mmap_read_lock(mm);
>         result = hugepage_vma_revalidate(mm, address, &vma);
>         if (result) {
> @@ -1100,11 +1096,11 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
>         mmu_notifier_invalidate_range_end(&range);
>
>         spin_lock(pte_ptl);
> -       isolated = __collapse_huge_page_isolate(vma, address, pte,
> -                       &compound_pagelist);
> +       result =  __collapse_huge_page_isolate(vma, address, pte,
> +                                              &compound_pagelist);
>         spin_unlock(pte_ptl);
>
> -       if (unlikely(!isolated)) {
> +       if (unlikely(result != SCAN_SUCCEED)) {
>                 pte_unmap(pte);
>                 spin_lock(pmd_ptl);
>                 BUG_ON(!pmd_none(*pmd));
> @@ -1116,7 +1112,6 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
>                 pmd_populate(mm, pmd, pmd_pgtable(_pmd));
>                 spin_unlock(pmd_ptl);
>                 anon_vma_unlock_write(vma->anon_vma);
> -               result = SCAN_FAIL;
>                 goto out_up_write;
>         }
>
> @@ -1126,8 +1121,8 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
>          */
>         anon_vma_unlock_write(vma->anon_vma);
>
> -       __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
> -                       &compound_pagelist);
> +       __collapse_huge_page_copy(pte, hpage, vma, address, pte_ptl,
> +                                 &compound_pagelist);
>         pte_unmap(pte);
>         /*
>          * spin_lock() below is not the equivalent of smp_wmb(), but
> @@ -1135,43 +1130,42 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
>          * avoid the copy_huge_page writes to become visible after
>          * the set_pmd_at() write.
>          */
> -       __SetPageUptodate(new_page);
> +       __SetPageUptodate(hpage);
>         pgtable = pmd_pgtable(_pmd);
>
> -       _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
> +       _pmd = mk_huge_pmd(hpage, vma->vm_page_prot);
>         _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
>
>         spin_lock(pmd_ptl);
>         BUG_ON(!pmd_none(*pmd));
> -       page_add_new_anon_rmap(new_page, vma, address);
> -       lru_cache_add_inactive_or_unevictable(new_page, vma);
> +       page_add_new_anon_rmap(hpage, vma, address);
> +       lru_cache_add_inactive_or_unevictable(hpage, vma);
>         pgtable_trans_huge_deposit(mm, pmd, pgtable);
>         set_pmd_at(mm, address, pmd, _pmd);
>         update_mmu_cache_pmd(vma, address, pmd);
>         spin_unlock(pmd_ptl);
>
> -       *hpage = NULL;
> +       hpage = NULL;
>
> -       khugepaged_pages_collapsed++;
>         result = SCAN_SUCCEED;
>  out_up_write:
>         mmap_write_unlock(mm);
>  out_nolock:
> -       if (!IS_ERR_OR_NULL(*hpage)) {
> -               mem_cgroup_uncharge(page_folio(*hpage));
> -               put_page(*hpage);
> +       if (hpage) {
> +               mem_cgroup_uncharge(page_folio(hpage));
> +               put_page(hpage);
>         }
> -       trace_mm_collapse_huge_page(mm, isolated, result);
> -       return;
> +       trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
> +       return result;
>  }
>
>  static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
> -                              unsigned long address, struct page **hpage,
> +                              unsigned long address, bool *mmap_locked,
>                                struct collapse_control *cc)
>  {
>         pmd_t *pmd;
>         pte_t *pte, *_pte;
> -       int ret = 0, result = 0, referenced = 0;
> +       int result = SCAN_FAIL, referenced = 0;
>         int none_or_zero = 0, shared = 0;
>         struct page *page = NULL;
>         unsigned long _address;
> @@ -1306,19 +1300,19 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
>                 result = SCAN_LACK_REFERENCED_PAGE;
>         } else {
>                 result = SCAN_SUCCEED;
> -               ret = 1;
>         }
>  out_unmap:
>         pte_unmap_unlock(pte, ptl);
> -       if (ret) {
> +       if (result == SCAN_SUCCEED) {
>                 /* collapse_huge_page will return with the mmap_lock released */
> -               collapse_huge_page(mm, address, hpage, referenced, unmapped,
> -                                  cc);
> +               *mmap_locked = false;
> +               result = collapse_huge_page(mm, address, referenced,
> +                                           unmapped, cc);

Shall move "*mmap_locked = false" after collapse_huge_page() call? No
functional change, but seems more consistent.

>         }
>  out:
>         trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
>                                      none_or_zero, result, unmapped);
> -       return ret;
> +       return result;
>  }
>
>  static void collect_mm_slot(struct mm_slot *mm_slot)
> @@ -1581,7 +1575,6 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>   * @mm: process address space where collapse happens
>   * @file: file that collapse on
>   * @start: collapse start address
> - * @hpage: new allocated huge page for collapse
>   * @cc: collapse context and scratchpad
>   *
>   * Basic scheme is simple, details are more complex:
> @@ -1599,12 +1592,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>   *    + restore gaps in the page cache;
>   *    + unlock and free huge page;
>   */
> -static void collapse_file(struct mm_struct *mm, struct file *file,
> -                         pgoff_t start, struct page **hpage,
> -                         struct collapse_control *cc)
> +static int collapse_file(struct mm_struct *mm, struct file *file,
> +                        pgoff_t start, struct collapse_control *cc)
>  {
>         struct address_space *mapping = file->f_mapping;
> -       struct page *new_page;
> +       struct page *hpage;
>         pgoff_t index, end = start + HPAGE_PMD_NR;
>         LIST_HEAD(pagelist);
>         XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
> @@ -1615,12 +1607,10 @@ static void collapse_file(struct mm_struct *mm, struct file *file,
>         VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
>         VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
>
> -       result = alloc_charge_hpage(mm, hpage, cc);
> +       result = alloc_charge_hpage(mm, &hpage, cc);
>         if (result != SCAN_SUCCEED)
>                 goto out;
>
> -       new_page = *hpage;
> -
>         /*
>          * Ensure we have slots for all the pages in the range.  This is
>          * almost certainly a no-op because most of the pages must be present
> @@ -1637,14 +1627,14 @@ static void collapse_file(struct mm_struct *mm, struct file *file,
>                 }
>         } while (1);
>
> -       __SetPageLocked(new_page);
> +       __SetPageLocked(hpage);
>         if (is_shmem)
> -               __SetPageSwapBacked(new_page);
> -       new_page->index = start;
> -       new_page->mapping = mapping;
> +               __SetPageSwapBacked(hpage);
> +       hpage->index = start;
> +       hpage->mapping = mapping;
>
>         /*
> -        * At this point the new_page is locked and not up-to-date.
> +        * At this point the hpage is locked and not up-to-date.
>          * It's safe to insert it into the page cache, because nobody would
>          * be able to map it or use it in another way until we unlock it.
>          */
> @@ -1672,7 +1662,7 @@ static void collapse_file(struct mm_struct *mm, struct file *file,
>                                         result = SCAN_FAIL;
>                                         goto xa_locked;
>                                 }
> -                               xas_store(&xas, new_page);
> +                               xas_store(&xas, hpage);
>                                 nr_none++;
>                                 continue;
>                         }
> @@ -1814,19 +1804,19 @@ static void collapse_file(struct mm_struct *mm, struct file *file,
>                 list_add_tail(&page->lru, &pagelist);
>
>                 /* Finally, replace with the new page. */
> -               xas_store(&xas, new_page);
> +               xas_store(&xas, hpage);
>                 continue;
>  out_unlock:
>                 unlock_page(page);
>                 put_page(page);
>                 goto xa_unlocked;
>         }
> -       nr = thp_nr_pages(new_page);
> +       nr = thp_nr_pages(hpage);
>
>         if (is_shmem)
> -               __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr);
> +               __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
>         else {
> -               __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr);
> +               __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);
>                 filemap_nr_thps_inc(mapping);
>                 /*
>                  * Paired with smp_mb() in do_dentry_open() to ensure
> @@ -1837,21 +1827,21 @@ static void collapse_file(struct mm_struct *mm, struct file *file,
>                 smp_mb();
>                 if (inode_is_open_for_write(mapping->host)) {
>                         result = SCAN_FAIL;
> -                       __mod_lruvec_page_state(new_page, NR_FILE_THPS, -nr);
> +                       __mod_lruvec_page_state(hpage, NR_FILE_THPS, -nr);
>                         filemap_nr_thps_dec(mapping);
>                         goto xa_locked;
>                 }
>         }
>
>         if (nr_none) {
> -               __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
> +               __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none);
>                 if (is_shmem)
> -                       __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
> +                       __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
>         }
>
>         /* Join all the small entries into a single multi-index entry */
>         xas_set_order(&xas, start, HPAGE_PMD_ORDER);
> -       xas_store(&xas, new_page);
> +       xas_store(&xas, hpage);
>  xa_locked:
>         xas_unlock_irq(&xas);
>  xa_unlocked:
> @@ -1873,11 +1863,11 @@ static void collapse_file(struct mm_struct *mm, struct file *file,
>                 index = start;
>                 list_for_each_entry_safe(page, tmp, &pagelist, lru) {
>                         while (index < page->index) {
> -                               clear_highpage(new_page + (index % HPAGE_PMD_NR));
> +                               clear_highpage(hpage + (index % HPAGE_PMD_NR));
>                                 index++;
>                         }
> -                       copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
> -                                       page);
> +                       copy_highpage(hpage + (page->index % HPAGE_PMD_NR),
> +                                     page);
>                         list_del(&page->lru);
>                         page->mapping = NULL;
>                         page_ref_unfreeze(page, 1);
> @@ -1888,23 +1878,23 @@ static void collapse_file(struct mm_struct *mm, struct file *file,
>                         index++;
>                 }
>                 while (index < end) {
> -                       clear_highpage(new_page + (index % HPAGE_PMD_NR));
> +                       clear_highpage(hpage + (index % HPAGE_PMD_NR));
>                         index++;
>                 }
>
> -               SetPageUptodate(new_page);
> -               page_ref_add(new_page, HPAGE_PMD_NR - 1);
> +               SetPageUptodate(hpage);
> +               page_ref_add(hpage, HPAGE_PMD_NR - 1);
>                 if (is_shmem)
> -                       set_page_dirty(new_page);
> -               lru_cache_add(new_page);
> +                       set_page_dirty(hpage);
> +               lru_cache_add(hpage);
>
>                 /*
>                  * Remove pte page tables, so we can re-fault the page as huge.
>                  */
>                 retract_page_tables(mapping, start);
> -               *hpage = NULL;
> -
> -               khugepaged_pages_collapsed++;
> +               unlock_page(hpage);
> +               hpage = NULL;
> +               goto out;

Maybe just set hpage to NULL here, then later...

>         } else {
>                 struct page *page;
>
> @@ -1943,22 +1933,22 @@ static void collapse_file(struct mm_struct *mm, struct file *file,
>                 VM_BUG_ON(nr_none);
>                 xas_unlock_irq(&xas);
>
> -               new_page->mapping = NULL;
> +               hpage->mapping = NULL;
>         }
>
> -       unlock_page(new_page);
> +       unlock_page(hpage);

do
if (hpage)
    unlock_page(hpage);

>  out:
>         VM_BUG_ON(!list_empty(&pagelist));
> -       if (!IS_ERR_OR_NULL(*hpage)) {
> -               mem_cgroup_uncharge(page_folio(*hpage));
> -               put_page(*hpage);
> +       if (hpage) {
> +               mem_cgroup_uncharge(page_folio(hpage));
> +               put_page(hpage);
>         }
>         /* TODO: tracepoints */
> +       return result;
>  }
>
> -static void khugepaged_scan_file(struct mm_struct *mm, struct file *file,
> -                                pgoff_t start, struct page **hpage,
> -                                struct collapse_control *cc)
> +static int khugepaged_scan_file(struct mm_struct *mm, struct file *file,
> +                               pgoff_t start, struct collapse_control *cc)
>  {
>         struct page *page = NULL;
>         struct address_space *mapping = file->f_mapping;
> @@ -2031,15 +2021,16 @@ static void khugepaged_scan_file(struct mm_struct *mm, struct file *file,
>                         result = SCAN_EXCEED_NONE_PTE;
>                         count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
>                 } else {
> -                       collapse_file(mm, file, start, hpage, cc);
> +                       result = collapse_file(mm, file, start, cc);
>                 }
>         }
>
>         /* TODO: tracepoints */
> +       return result;
>  }
>  #else
> -static void khugepaged_scan_file(struct mm_struct *mm, struct file *file,
> -                                pgoff_t start, struct collapse_control *cc)
> +static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, pgoff_t start,
> +                               struct collapse_control *cc)
>  {
>         BUILD_BUG();
>  }
> @@ -2049,8 +2040,7 @@ static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
>  }
>  #endif
>
> -static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
> -                                           struct page **hpage,
> +static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
>                                             struct collapse_control *cc)
>         __releases(&khugepaged_mm_lock)
>         __acquires(&khugepaged_mm_lock)
> @@ -2064,6 +2054,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
>
>         VM_BUG_ON(!pages);
>         lockdep_assert_held(&khugepaged_mm_lock);
> +       *result = SCAN_FAIL;
>
>         if (khugepaged_scan.mm_slot)
>                 mm_slot = khugepaged_scan.mm_slot;
> @@ -2117,7 +2108,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
>                         goto skip;
>
>                 while (khugepaged_scan.address < hend) {
> -                       int ret;
> +                       bool mmap_locked = true;
> +
>                         cond_resched();
>                         if (unlikely(khugepaged_test_exit(mm)))
>                                 goto breakouterloop;
> @@ -2134,20 +2126,28 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
>                                                 khugepaged_scan.address);
>
>                                 mmap_read_unlock(mm);
> -                               ret = 1;
> -                               khugepaged_scan_file(mm, file, pgoff, hpage,
> -                                                    cc);
> +                               mmap_locked = false;
> +                               *result = khugepaged_scan_file(mm, file, pgoff,
> +                                                              cc);
>                                 fput(file);
>                         } else {
> -                               ret = khugepaged_scan_pmd(mm, vma,
> -                                               khugepaged_scan.address,
> -                                               hpage, cc);
> +                               *result = khugepaged_scan_pmd(mm, vma,
> +                                                             khugepaged_scan.address,
> +                                                             &mmap_locked, cc);
>                         }
> +                       if (*result == SCAN_SUCCEED)
> +                               ++khugepaged_pages_collapsed;
>                         /* move to next address */
>                         khugepaged_scan.address += HPAGE_PMD_SIZE;
>                         progress += HPAGE_PMD_NR;
> -                       if (ret)
> -                               /* we released mmap_lock so break loop */
> +                       if (!mmap_locked)
> +                               /*
> +                                * We released mmap_lock so break loop.  Note
> +                                * that we drop mmap_lock before all hugepage
> +                                * allocations, so if allocation fails, we are
> +                                * guaranteed to break here and report the
> +                                * correct result back to caller.
> +                                */
>                                 goto breakouterloop_mmap_lock;
>                         if (progress >= pages)
>                                 goto breakouterloop;
> @@ -2199,15 +2199,15 @@ static int khugepaged_wait_event(void)
>
>  static void khugepaged_do_scan(struct collapse_control *cc)
>  {
> -       struct page *hpage = NULL;
>         unsigned int progress = 0, pass_through_head = 0;
>         unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
>         bool wait = true;
> +       int result = SCAN_SUCCEED;
>
>         lru_add_drain_all();
>
>         while (progress < pages) {
> -               if (alloc_fail_should_sleep(&hpage, &wait))
> +               if (alloc_fail_should_sleep(result, &wait))
>                         break;
>
>                 cond_resched();
> @@ -2221,7 +2221,7 @@ static void khugepaged_do_scan(struct collapse_control *cc)
>                 if (khugepaged_has_work() &&
>                     pass_through_head < 2)
>                         progress += khugepaged_scan_mm_slot(pages - progress,
> -                                                           &hpage, cc);
> +                                                           &result, cc);
>                 else
>                         progress = pages;
>                 spin_unlock(&khugepaged_mm_lock);
> --
> 2.36.1.255.ge46751e96f-goog
>
Zach O'Keefe June 7, 2022, 12:17 a.m. UTC | #2
On Mon, Jun 6, 2022 at 3:40 PM Yang Shi <shy828301@gmail.com> wrote:
>
> On Fri, Jun 3, 2022 at 5:40 PM Zach O'Keefe <zokeefe@google.com> wrote:
> >
> > Pipe enum scan_result codes back through return values of functions
> > downstream of khugepaged_scan_file() and khugepaged_scan_pmd() to
> > inform callers if the operation was successful, and if not, why.
> >
> > Since khugepaged_scan_pmd()'s return value already has a specific
> > meaning (whether mmap_lock was unlocked or not), add a bool* argument
> > to khugepaged_scan_pmd() to retrieve this information.
> >
> > Change khugepaged to take action based on the return values of
> > khugepaged_scan_file() and khugepaged_scan_pmd() instead of acting
> > deep within the collapsing functions themselves.
> >
> > Remove dependency on error pointers to communicate to khugepaged that
> > allocation failed and it should sleep; instead just use the result of
> > the scan (SCAN_ALLOC_HUGE_PAGE_FAIL if allocation fails).
> >
> > Signed-off-by: Zach O'Keefe <zokeefe@google.com>
>
> Reviewed-by: Yang Shi <shy828301@gmail.com>

Hey Yang,

Thanks for taking the time to review this series. Very much appreciated!

> A couple of minor nits below...
>
> > ---
> >  mm/khugepaged.c | 192 ++++++++++++++++++++++++------------------------
> >  1 file changed, 96 insertions(+), 96 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index ba722347bebd..03e0da0008f1 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -722,13 +722,13 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >                 result = SCAN_SUCCEED;
> >                 trace_mm_collapse_huge_page_isolate(page, none_or_zero,
> >                                                     referenced, writable, result);
> > -               return 1;
> > +               return SCAN_SUCCEED;
>
> You could do "return result" too.

Thanks for catching. Done.

> >         }
> >  out:
> >         release_pte_pages(pte, _pte, compound_pagelist);
> >         trace_mm_collapse_huge_page_isolate(page, none_or_zero,
> >                                             referenced, writable, result);
> > -       return 0;
> > +       return result;
> >  }
> >
> >  static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
> > @@ -850,14 +850,13 @@ static int khugepaged_find_target_node(struct collapse_control *cc)
> >  #endif
> >
> >  /* Sleep for the first alloc fail, break the loop for the second fail */
> > -static bool alloc_fail_should_sleep(struct page **hpage, bool *wait)
> > +static bool alloc_fail_should_sleep(int result, bool *wait)
> >  {
> > -       if (IS_ERR(*hpage)) {
> > +       if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) {
> >                 if (!*wait)
> >                         return true;
> >
> >                 *wait = false;
> > -               *hpage = NULL;
> >                 khugepaged_alloc_sleep();
> >         }
> >         return false;
> > @@ -868,7 +867,6 @@ static bool khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
> >         *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
> >         if (unlikely(!*hpage)) {
> >                 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
> > -               *hpage = ERR_PTR(-ENOMEM);
> >                 return false;
> >         }
> >
> > @@ -1010,17 +1008,17 @@ static int alloc_charge_hpage(struct mm_struct *mm, struct page **hpage,
> >         return SCAN_SUCCEED;
> >  }
> >
> > -static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > -                              struct page **hpage, int referenced,
> > -                              int unmapped, struct collapse_control *cc)
> > +static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > +                             int referenced, int unmapped,
> > +                             struct collapse_control *cc)
> >  {
> >         LIST_HEAD(compound_pagelist);
> >         pmd_t *pmd, _pmd;
> >         pte_t *pte;
> >         pgtable_t pgtable;
> > -       struct page *new_page;
> > +       struct page *hpage;
> >         spinlock_t *pmd_ptl, *pte_ptl;
> > -       int isolated = 0, result = 0;
> > +       int result = SCAN_FAIL;
> >         struct vm_area_struct *vma;
> >         struct mmu_notifier_range range;
> >
> > @@ -1034,12 +1032,10 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >          */
> >         mmap_read_unlock(mm);
> >
> > -       result = alloc_charge_hpage(mm, hpage, cc);
> > +       result = alloc_charge_hpage(mm, &hpage, cc);
> >         if (result != SCAN_SUCCEED)
> >                 goto out_nolock;
> >
> > -       new_page = *hpage;
> > -
> >         mmap_read_lock(mm);
> >         result = hugepage_vma_revalidate(mm, address, &vma);
> >         if (result) {
> > @@ -1100,11 +1096,11 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >         mmu_notifier_invalidate_range_end(&range);
> >
> >         spin_lock(pte_ptl);
> > -       isolated = __collapse_huge_page_isolate(vma, address, pte,
> > -                       &compound_pagelist);
> > +       result =  __collapse_huge_page_isolate(vma, address, pte,
> > +                                              &compound_pagelist);
> >         spin_unlock(pte_ptl);
> >
> > -       if (unlikely(!isolated)) {
> > +       if (unlikely(result != SCAN_SUCCEED)) {
> >                 pte_unmap(pte);
> >                 spin_lock(pmd_ptl);
> >                 BUG_ON(!pmd_none(*pmd));
> > @@ -1116,7 +1112,6 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >                 pmd_populate(mm, pmd, pmd_pgtable(_pmd));
> >                 spin_unlock(pmd_ptl);
> >                 anon_vma_unlock_write(vma->anon_vma);
> > -               result = SCAN_FAIL;
> >                 goto out_up_write;
> >         }
> >
> > @@ -1126,8 +1121,8 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >          */
> >         anon_vma_unlock_write(vma->anon_vma);
> >
> > -       __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
> > -                       &compound_pagelist);
> > +       __collapse_huge_page_copy(pte, hpage, vma, address, pte_ptl,
> > +                                 &compound_pagelist);
> >         pte_unmap(pte);
> >         /*
> >          * spin_lock() below is not the equivalent of smp_wmb(), but
> > @@ -1135,43 +1130,42 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >          * avoid the copy_huge_page writes to become visible after
> >          * the set_pmd_at() write.
> >          */
> > -       __SetPageUptodate(new_page);
> > +       __SetPageUptodate(hpage);
> >         pgtable = pmd_pgtable(_pmd);
> >
> > -       _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
> > +       _pmd = mk_huge_pmd(hpage, vma->vm_page_prot);
> >         _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
> >
> >         spin_lock(pmd_ptl);
> >         BUG_ON(!pmd_none(*pmd));
> > -       page_add_new_anon_rmap(new_page, vma, address);
> > -       lru_cache_add_inactive_or_unevictable(new_page, vma);
> > +       page_add_new_anon_rmap(hpage, vma, address);
> > +       lru_cache_add_inactive_or_unevictable(hpage, vma);
> >         pgtable_trans_huge_deposit(mm, pmd, pgtable);
> >         set_pmd_at(mm, address, pmd, _pmd);
> >         update_mmu_cache_pmd(vma, address, pmd);
> >         spin_unlock(pmd_ptl);
> >
> > -       *hpage = NULL;
> > +       hpage = NULL;
> >
> > -       khugepaged_pages_collapsed++;
> >         result = SCAN_SUCCEED;
> >  out_up_write:
> >         mmap_write_unlock(mm);
> >  out_nolock:
> > -       if (!IS_ERR_OR_NULL(*hpage)) {
> > -               mem_cgroup_uncharge(page_folio(*hpage));
> > -               put_page(*hpage);
> > +       if (hpage) {
> > +               mem_cgroup_uncharge(page_folio(hpage));
> > +               put_page(hpage);
> >         }
> > -       trace_mm_collapse_huge_page(mm, isolated, result);
> > -       return;
> > +       trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
> > +       return result;
> >  }
> >
> >  static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
> > -                              unsigned long address, struct page **hpage,
> > +                              unsigned long address, bool *mmap_locked,
> >                                struct collapse_control *cc)
> >  {
> >         pmd_t *pmd;
> >         pte_t *pte, *_pte;
> > -       int ret = 0, result = 0, referenced = 0;
> > +       int result = SCAN_FAIL, referenced = 0;
> >         int none_or_zero = 0, shared = 0;
> >         struct page *page = NULL;
> >         unsigned long _address;
> > @@ -1306,19 +1300,19 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
> >                 result = SCAN_LACK_REFERENCED_PAGE;
> >         } else {
> >                 result = SCAN_SUCCEED;
> > -               ret = 1;
> >         }
> >  out_unmap:
> >         pte_unmap_unlock(pte, ptl);
> > -       if (ret) {
> > +       if (result == SCAN_SUCCEED) {
> >                 /* collapse_huge_page will return with the mmap_lock released */
> > -               collapse_huge_page(mm, address, hpage, referenced, unmapped,
> > -                                  cc);
> > +               *mmap_locked = false;
> > +               result = collapse_huge_page(mm, address, referenced,
> > +                                           unmapped, cc);
>
> Shall move "*mmap_locked = false" after collapse_huge_page() call? No
> functional change, but seems more consistent.

Done.

> >         }
> >  out:
> >         trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
> >                                      none_or_zero, result, unmapped);
> > -       return ret;
> > +       return result;
> >  }
> >
> >  static void collect_mm_slot(struct mm_slot *mm_slot)
> > @@ -1581,7 +1575,6 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
> >   * @mm: process address space where collapse happens
> >   * @file: file that collapse on
> >   * @start: collapse start address
> > - * @hpage: new allocated huge page for collapse
> >   * @cc: collapse context and scratchpad
> >   *
> >   * Basic scheme is simple, details are more complex:
> > @@ -1599,12 +1592,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
> >   *    + restore gaps in the page cache;
> >   *    + unlock and free huge page;
> >   */
> > -static void collapse_file(struct mm_struct *mm, struct file *file,
> > -                         pgoff_t start, struct page **hpage,
> > -                         struct collapse_control *cc)
> > +static int collapse_file(struct mm_struct *mm, struct file *file,
> > +                        pgoff_t start, struct collapse_control *cc)
> >  {
> >         struct address_space *mapping = file->f_mapping;
> > -       struct page *new_page;
> > +       struct page *hpage;
> >         pgoff_t index, end = start + HPAGE_PMD_NR;
> >         LIST_HEAD(pagelist);
> >         XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
> > @@ -1615,12 +1607,10 @@ static void collapse_file(struct mm_struct *mm, struct file *file,
> >         VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
> >         VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
> >
> > -       result = alloc_charge_hpage(mm, hpage, cc);
> > +       result = alloc_charge_hpage(mm, &hpage, cc);
> >         if (result != SCAN_SUCCEED)
> >                 goto out;
> >
> > -       new_page = *hpage;
> > -
> >         /*
> >          * Ensure we have slots for all the pages in the range.  This is
> >          * almost certainly a no-op because most of the pages must be present
> > @@ -1637,14 +1627,14 @@ static void collapse_file(struct mm_struct *mm, struct file *file,
> >                 }
> >         } while (1);
> >
> > -       __SetPageLocked(new_page);
> > +       __SetPageLocked(hpage);
> >         if (is_shmem)
> > -               __SetPageSwapBacked(new_page);
> > -       new_page->index = start;
> > -       new_page->mapping = mapping;
> > +               __SetPageSwapBacked(hpage);
> > +       hpage->index = start;
> > +       hpage->mapping = mapping;
> >
> >         /*
> > -        * At this point the new_page is locked and not up-to-date.
> > +        * At this point the hpage is locked and not up-to-date.
> >          * It's safe to insert it into the page cache, because nobody would
> >          * be able to map it or use it in another way until we unlock it.
> >          */
> > @@ -1672,7 +1662,7 @@ static void collapse_file(struct mm_struct *mm, struct file *file,
> >                                         result = SCAN_FAIL;
> >                                         goto xa_locked;
> >                                 }
> > -                               xas_store(&xas, new_page);
> > +                               xas_store(&xas, hpage);
> >                                 nr_none++;
> >                                 continue;
> >                         }
> > @@ -1814,19 +1804,19 @@ static void collapse_file(struct mm_struct *mm, struct file *file,
> >                 list_add_tail(&page->lru, &pagelist);
> >
> >                 /* Finally, replace with the new page. */
> > -               xas_store(&xas, new_page);
> > +               xas_store(&xas, hpage);
> >                 continue;
> >  out_unlock:
> >                 unlock_page(page);
> >                 put_page(page);
> >                 goto xa_unlocked;
> >         }
> > -       nr = thp_nr_pages(new_page);
> > +       nr = thp_nr_pages(hpage);
> >
> >         if (is_shmem)
> > -               __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr);
> > +               __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
> >         else {
> > -               __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr);
> > +               __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);
> >                 filemap_nr_thps_inc(mapping);
> >                 /*
> >                  * Paired with smp_mb() in do_dentry_open() to ensure
> > @@ -1837,21 +1827,21 @@ static void collapse_file(struct mm_struct *mm, struct file *file,
> >                 smp_mb();
> >                 if (inode_is_open_for_write(mapping->host)) {
> >                         result = SCAN_FAIL;
> > -                       __mod_lruvec_page_state(new_page, NR_FILE_THPS, -nr);
> > +                       __mod_lruvec_page_state(hpage, NR_FILE_THPS, -nr);
> >                         filemap_nr_thps_dec(mapping);
> >                         goto xa_locked;
> >                 }
> >         }
> >
> >         if (nr_none) {
> > -               __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
> > +               __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none);
> >                 if (is_shmem)
> > -                       __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
> > +                       __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
> >         }
> >
> >         /* Join all the small entries into a single multi-index entry */
> >         xas_set_order(&xas, start, HPAGE_PMD_ORDER);
> > -       xas_store(&xas, new_page);
> > +       xas_store(&xas, hpage);
> >  xa_locked:
> >         xas_unlock_irq(&xas);
> >  xa_unlocked:
> > @@ -1873,11 +1863,11 @@ static void collapse_file(struct mm_struct *mm, struct file *file,
> >                 index = start;
> >                 list_for_each_entry_safe(page, tmp, &pagelist, lru) {
> >                         while (index < page->index) {
> > -                               clear_highpage(new_page + (index % HPAGE_PMD_NR));
> > +                               clear_highpage(hpage + (index % HPAGE_PMD_NR));
> >                                 index++;
> >                         }
> > -                       copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
> > -                                       page);
> > +                       copy_highpage(hpage + (page->index % HPAGE_PMD_NR),
> > +                                     page);
> >                         list_del(&page->lru);
> >                         page->mapping = NULL;
> >                         page_ref_unfreeze(page, 1);
> > @@ -1888,23 +1878,23 @@ static void collapse_file(struct mm_struct *mm, struct file *file,
> >                         index++;
> >                 }
> >                 while (index < end) {
> > -                       clear_highpage(new_page + (index % HPAGE_PMD_NR));
> > +                       clear_highpage(hpage + (index % HPAGE_PMD_NR));
> >                         index++;
> >                 }
> >
> > -               SetPageUptodate(new_page);
> > -               page_ref_add(new_page, HPAGE_PMD_NR - 1);
> > +               SetPageUptodate(hpage);
> > +               page_ref_add(hpage, HPAGE_PMD_NR - 1);
> >                 if (is_shmem)
> > -                       set_page_dirty(new_page);
> > -               lru_cache_add(new_page);
> > +                       set_page_dirty(hpage);
> > +               lru_cache_add(hpage);
> >
> >                 /*
> >                  * Remove pte page tables, so we can re-fault the page as huge.
> >                  */
> >                 retract_page_tables(mapping, start);
> > -               *hpage = NULL;
> > -
> > -               khugepaged_pages_collapsed++;
> > +               unlock_page(hpage);
> > +               hpage = NULL;
> > +               goto out;
>
> Maybe just set hpage to NULL here, then later...
>
> >         } else {
> >                 struct page *page;
> >
> > @@ -1943,22 +1933,22 @@ static void collapse_file(struct mm_struct *mm, struct file *file,
> >                 VM_BUG_ON(nr_none);
> >                 xas_unlock_irq(&xas);
> >
> > -               new_page->mapping = NULL;
> > +               hpage->mapping = NULL;
> >         }
> >
> > -       unlock_page(new_page);
> > +       unlock_page(hpage);
>
> do
> if (hpage)
>     unlock_page(hpage);

I don't have a strong preference here. Done.


> >  out:
> >         VM_BUG_ON(!list_empty(&pagelist));
> > -       if (!IS_ERR_OR_NULL(*hpage)) {
> > -               mem_cgroup_uncharge(page_folio(*hpage));
> > -               put_page(*hpage);
> > +       if (hpage) {
> > +               mem_cgroup_uncharge(page_folio(hpage));
> > +               put_page(hpage);
> >         }
> >         /* TODO: tracepoints */
> > +       return result;
> >  }
> >
> > -static void khugepaged_scan_file(struct mm_struct *mm, struct file *file,
> > -                                pgoff_t start, struct page **hpage,
> > -                                struct collapse_control *cc)
> > +static int khugepaged_scan_file(struct mm_struct *mm, struct file *file,
> > +                               pgoff_t start, struct collapse_control *cc)
> >  {
> >         struct page *page = NULL;
> >         struct address_space *mapping = file->f_mapping;
> > @@ -2031,15 +2021,16 @@ static void khugepaged_scan_file(struct mm_struct *mm, struct file *file,
> >                         result = SCAN_EXCEED_NONE_PTE;
> >                         count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
> >                 } else {
> > -                       collapse_file(mm, file, start, hpage, cc);
> > +                       result = collapse_file(mm, file, start, cc);
> >                 }
> >         }
> >
> >         /* TODO: tracepoints */
> > +       return result;
> >  }
> >  #else
> > -static void khugepaged_scan_file(struct mm_struct *mm, struct file *file,
> > -                                pgoff_t start, struct collapse_control *cc)
> > +static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, pgoff_t start,
> > +                               struct collapse_control *cc)
> >  {
> >         BUILD_BUG();
> >  }
> > @@ -2049,8 +2040,7 @@ static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
> >  }
> >  #endif
> >
> > -static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
> > -                                           struct page **hpage,
> > +static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> >                                             struct collapse_control *cc)
> >         __releases(&khugepaged_mm_lock)
> >         __acquires(&khugepaged_mm_lock)
> > @@ -2064,6 +2054,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
> >
> >         VM_BUG_ON(!pages);
> >         lockdep_assert_held(&khugepaged_mm_lock);
> > +       *result = SCAN_FAIL;
> >
> >         if (khugepaged_scan.mm_slot)
> >                 mm_slot = khugepaged_scan.mm_slot;
> > @@ -2117,7 +2108,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
> >                         goto skip;
> >
> >                 while (khugepaged_scan.address < hend) {
> > -                       int ret;
> > +                       bool mmap_locked = true;
> > +
> >                         cond_resched();
> >                         if (unlikely(khugepaged_test_exit(mm)))
> >                                 goto breakouterloop;
> > @@ -2134,20 +2126,28 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
> >                                                 khugepaged_scan.address);
> >
> >                                 mmap_read_unlock(mm);
> > -                               ret = 1;
> > -                               khugepaged_scan_file(mm, file, pgoff, hpage,
> > -                                                    cc);
> > +                               mmap_locked = false;
> > +                               *result = khugepaged_scan_file(mm, file, pgoff,
> > +                                                              cc);
> >                                 fput(file);
> >                         } else {
> > -                               ret = khugepaged_scan_pmd(mm, vma,
> > -                                               khugepaged_scan.address,
> > -                                               hpage, cc);
> > +                               *result = khugepaged_scan_pmd(mm, vma,
> > +                                                             khugepaged_scan.address,
> > +                                                             &mmap_locked, cc);
> >                         }
> > +                       if (*result == SCAN_SUCCEED)
> > +                               ++khugepaged_pages_collapsed;
> >                         /* move to next address */
> >                         khugepaged_scan.address += HPAGE_PMD_SIZE;
> >                         progress += HPAGE_PMD_NR;
> > -                       if (ret)
> > -                               /* we released mmap_lock so break loop */
> > +                       if (!mmap_locked)
> > +                               /*
> > +                                * We released mmap_lock so break loop.  Note
> > +                                * that we drop mmap_lock before all hugepage
> > +                                * allocations, so if allocation fails, we are
> > +                                * guaranteed to break here and report the
> > +                                * correct result back to caller.
> > +                                */
> >                                 goto breakouterloop_mmap_lock;
> >                         if (progress >= pages)
> >                                 goto breakouterloop;
> > @@ -2199,15 +2199,15 @@ static int khugepaged_wait_event(void)
> >
> >  static void khugepaged_do_scan(struct collapse_control *cc)
> >  {
> > -       struct page *hpage = NULL;
> >         unsigned int progress = 0, pass_through_head = 0;
> >         unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
> >         bool wait = true;
> > +       int result = SCAN_SUCCEED;
> >
> >         lru_add_drain_all();
> >
> >         while (progress < pages) {
> > -               if (alloc_fail_should_sleep(&hpage, &wait))
> > +               if (alloc_fail_should_sleep(result, &wait))
> >                         break;
> >
> >                 cond_resched();
> > @@ -2221,7 +2221,7 @@ static void khugepaged_do_scan(struct collapse_control *cc)
> >                 if (khugepaged_has_work() &&
> >                     pass_through_head < 2)
> >                         progress += khugepaged_scan_mm_slot(pages - progress,
> > -                                                           &hpage, cc);
> > +                                                           &result, cc);
> >                 else
> >                         progress = pages;
> >                 spin_unlock(&khugepaged_mm_lock);
> > --
> > 2.36.1.255.ge46751e96f-goog
> >
diff mbox series

Patch

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ba722347bebd..03e0da0008f1 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -722,13 +722,13 @@  static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		result = SCAN_SUCCEED;
 		trace_mm_collapse_huge_page_isolate(page, none_or_zero,
 						    referenced, writable, result);
-		return 1;
+		return SCAN_SUCCEED;
 	}
 out:
 	release_pte_pages(pte, _pte, compound_pagelist);
 	trace_mm_collapse_huge_page_isolate(page, none_or_zero,
 					    referenced, writable, result);
-	return 0;
+	return result;
 }
 
 static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
@@ -850,14 +850,13 @@  static int khugepaged_find_target_node(struct collapse_control *cc)
 #endif
 
 /* Sleep for the first alloc fail, break the loop for the second fail */
-static bool alloc_fail_should_sleep(struct page **hpage, bool *wait)
+static bool alloc_fail_should_sleep(int result, bool *wait)
 {
-	if (IS_ERR(*hpage)) {
+	if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) {
 		if (!*wait)
 			return true;
 
 		*wait = false;
-		*hpage = NULL;
 		khugepaged_alloc_sleep();
 	}
 	return false;
@@ -868,7 +867,6 @@  static bool khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
 	*hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
 	if (unlikely(!*hpage)) {
 		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
-		*hpage = ERR_PTR(-ENOMEM);
 		return false;
 	}
 
@@ -1010,17 +1008,17 @@  static int alloc_charge_hpage(struct mm_struct *mm, struct page **hpage,
 	return SCAN_SUCCEED;
 }
 
-static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
-			       struct page **hpage, int referenced,
-			       int unmapped, struct collapse_control *cc)
+static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
+			      int referenced, int unmapped,
+			      struct collapse_control *cc)
 {
 	LIST_HEAD(compound_pagelist);
 	pmd_t *pmd, _pmd;
 	pte_t *pte;
 	pgtable_t pgtable;
-	struct page *new_page;
+	struct page *hpage;
 	spinlock_t *pmd_ptl, *pte_ptl;
-	int isolated = 0, result = 0;
+	int result = SCAN_FAIL;
 	struct vm_area_struct *vma;
 	struct mmu_notifier_range range;
 
@@ -1034,12 +1032,10 @@  static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	 */
 	mmap_read_unlock(mm);
 
-	result = alloc_charge_hpage(mm, hpage, cc);
+	result = alloc_charge_hpage(mm, &hpage, cc);
 	if (result != SCAN_SUCCEED)
 		goto out_nolock;
 
-	new_page = *hpage;
-
 	mmap_read_lock(mm);
 	result = hugepage_vma_revalidate(mm, address, &vma);
 	if (result) {
@@ -1100,11 +1096,11 @@  static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	mmu_notifier_invalidate_range_end(&range);
 
 	spin_lock(pte_ptl);
-	isolated = __collapse_huge_page_isolate(vma, address, pte,
-			&compound_pagelist);
+	result =  __collapse_huge_page_isolate(vma, address, pte,
+					       &compound_pagelist);
 	spin_unlock(pte_ptl);
 
-	if (unlikely(!isolated)) {
+	if (unlikely(result != SCAN_SUCCEED)) {
 		pte_unmap(pte);
 		spin_lock(pmd_ptl);
 		BUG_ON(!pmd_none(*pmd));
@@ -1116,7 +1112,6 @@  static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
 		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
 		spin_unlock(pmd_ptl);
 		anon_vma_unlock_write(vma->anon_vma);
-		result = SCAN_FAIL;
 		goto out_up_write;
 	}
 
@@ -1126,8 +1121,8 @@  static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	 */
 	anon_vma_unlock_write(vma->anon_vma);
 
-	__collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
-			&compound_pagelist);
+	__collapse_huge_page_copy(pte, hpage, vma, address, pte_ptl,
+				  &compound_pagelist);
 	pte_unmap(pte);
 	/*
 	 * spin_lock() below is not the equivalent of smp_wmb(), but
@@ -1135,43 +1130,42 @@  static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	 * avoid the copy_huge_page writes to become visible after
 	 * the set_pmd_at() write.
 	 */
-	__SetPageUptodate(new_page);
+	__SetPageUptodate(hpage);
 	pgtable = pmd_pgtable(_pmd);
 
-	_pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
+	_pmd = mk_huge_pmd(hpage, vma->vm_page_prot);
 	_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
 
 	spin_lock(pmd_ptl);
 	BUG_ON(!pmd_none(*pmd));
-	page_add_new_anon_rmap(new_page, vma, address);
-	lru_cache_add_inactive_or_unevictable(new_page, vma);
+	page_add_new_anon_rmap(hpage, vma, address);
+	lru_cache_add_inactive_or_unevictable(hpage, vma);
 	pgtable_trans_huge_deposit(mm, pmd, pgtable);
 	set_pmd_at(mm, address, pmd, _pmd);
 	update_mmu_cache_pmd(vma, address, pmd);
 	spin_unlock(pmd_ptl);
 
-	*hpage = NULL;
+	hpage = NULL;
 
-	khugepaged_pages_collapsed++;
 	result = SCAN_SUCCEED;
 out_up_write:
 	mmap_write_unlock(mm);
 out_nolock:
-	if (!IS_ERR_OR_NULL(*hpage)) {
-		mem_cgroup_uncharge(page_folio(*hpage));
-		put_page(*hpage);
+	if (hpage) {
+		mem_cgroup_uncharge(page_folio(hpage));
+		put_page(hpage);
 	}
-	trace_mm_collapse_huge_page(mm, isolated, result);
-	return;
+	trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
+	return result;
 }
 
 static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
-			       unsigned long address, struct page **hpage,
+			       unsigned long address, bool *mmap_locked,
 			       struct collapse_control *cc)
 {
 	pmd_t *pmd;
 	pte_t *pte, *_pte;
-	int ret = 0, result = 0, referenced = 0;
+	int result = SCAN_FAIL, referenced = 0;
 	int none_or_zero = 0, shared = 0;
 	struct page *page = NULL;
 	unsigned long _address;
@@ -1306,19 +1300,19 @@  static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
 		result = SCAN_LACK_REFERENCED_PAGE;
 	} else {
 		result = SCAN_SUCCEED;
-		ret = 1;
 	}
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
-	if (ret) {
+	if (result == SCAN_SUCCEED) {
 		/* collapse_huge_page will return with the mmap_lock released */
-		collapse_huge_page(mm, address, hpage, referenced, unmapped,
-				   cc);
+		*mmap_locked = false;
+		result = collapse_huge_page(mm, address, referenced,
+					    unmapped, cc);
 	}
 out:
 	trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
 				     none_or_zero, result, unmapped);
-	return ret;
+	return result;
 }
 
 static void collect_mm_slot(struct mm_slot *mm_slot)
@@ -1581,7 +1575,6 @@  static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
  * @mm: process address space where collapse happens
  * @file: file that collapse on
  * @start: collapse start address
- * @hpage: new allocated huge page for collapse
  * @cc: collapse context and scratchpad
  *
  * Basic scheme is simple, details are more complex:
@@ -1599,12 +1592,11 @@  static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
  *    + restore gaps in the page cache;
  *    + unlock and free huge page;
  */
-static void collapse_file(struct mm_struct *mm, struct file *file,
-			  pgoff_t start, struct page **hpage,
-			  struct collapse_control *cc)
+static int collapse_file(struct mm_struct *mm, struct file *file,
+			 pgoff_t start, struct collapse_control *cc)
 {
 	struct address_space *mapping = file->f_mapping;
-	struct page *new_page;
+	struct page *hpage;
 	pgoff_t index, end = start + HPAGE_PMD_NR;
 	LIST_HEAD(pagelist);
 	XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
@@ -1615,12 +1607,10 @@  static void collapse_file(struct mm_struct *mm, struct file *file,
 	VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
 	VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
 
-	result = alloc_charge_hpage(mm, hpage, cc);
+	result = alloc_charge_hpage(mm, &hpage, cc);
 	if (result != SCAN_SUCCEED)
 		goto out;
 
-	new_page = *hpage;
-
 	/*
 	 * Ensure we have slots for all the pages in the range.  This is
 	 * almost certainly a no-op because most of the pages must be present
@@ -1637,14 +1627,14 @@  static void collapse_file(struct mm_struct *mm, struct file *file,
 		}
 	} while (1);
 
-	__SetPageLocked(new_page);
+	__SetPageLocked(hpage);
 	if (is_shmem)
-		__SetPageSwapBacked(new_page);
-	new_page->index = start;
-	new_page->mapping = mapping;
+		__SetPageSwapBacked(hpage);
+	hpage->index = start;
+	hpage->mapping = mapping;
 
 	/*
-	 * At this point the new_page is locked and not up-to-date.
+	 * At this point the hpage is locked and not up-to-date.
 	 * It's safe to insert it into the page cache, because nobody would
 	 * be able to map it or use it in another way until we unlock it.
 	 */
@@ -1672,7 +1662,7 @@  static void collapse_file(struct mm_struct *mm, struct file *file,
 					result = SCAN_FAIL;
 					goto xa_locked;
 				}
-				xas_store(&xas, new_page);
+				xas_store(&xas, hpage);
 				nr_none++;
 				continue;
 			}
@@ -1814,19 +1804,19 @@  static void collapse_file(struct mm_struct *mm, struct file *file,
 		list_add_tail(&page->lru, &pagelist);
 
 		/* Finally, replace with the new page. */
-		xas_store(&xas, new_page);
+		xas_store(&xas, hpage);
 		continue;
 out_unlock:
 		unlock_page(page);
 		put_page(page);
 		goto xa_unlocked;
 	}
-	nr = thp_nr_pages(new_page);
+	nr = thp_nr_pages(hpage);
 
 	if (is_shmem)
-		__mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr);
+		__mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
 	else {
-		__mod_lruvec_page_state(new_page, NR_FILE_THPS, nr);
+		__mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);
 		filemap_nr_thps_inc(mapping);
 		/*
 		 * Paired with smp_mb() in do_dentry_open() to ensure
@@ -1837,21 +1827,21 @@  static void collapse_file(struct mm_struct *mm, struct file *file,
 		smp_mb();
 		if (inode_is_open_for_write(mapping->host)) {
 			result = SCAN_FAIL;
-			__mod_lruvec_page_state(new_page, NR_FILE_THPS, -nr);
+			__mod_lruvec_page_state(hpage, NR_FILE_THPS, -nr);
 			filemap_nr_thps_dec(mapping);
 			goto xa_locked;
 		}
 	}
 
 	if (nr_none) {
-		__mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
+		__mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none);
 		if (is_shmem)
-			__mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
+			__mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
 	}
 
 	/* Join all the small entries into a single multi-index entry */
 	xas_set_order(&xas, start, HPAGE_PMD_ORDER);
-	xas_store(&xas, new_page);
+	xas_store(&xas, hpage);
 xa_locked:
 	xas_unlock_irq(&xas);
 xa_unlocked:
@@ -1873,11 +1863,11 @@  static void collapse_file(struct mm_struct *mm, struct file *file,
 		index = start;
 		list_for_each_entry_safe(page, tmp, &pagelist, lru) {
 			while (index < page->index) {
-				clear_highpage(new_page + (index % HPAGE_PMD_NR));
+				clear_highpage(hpage + (index % HPAGE_PMD_NR));
 				index++;
 			}
-			copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
-					page);
+			copy_highpage(hpage + (page->index % HPAGE_PMD_NR),
+				      page);
 			list_del(&page->lru);
 			page->mapping = NULL;
 			page_ref_unfreeze(page, 1);
@@ -1888,23 +1878,23 @@  static void collapse_file(struct mm_struct *mm, struct file *file,
 			index++;
 		}
 		while (index < end) {
-			clear_highpage(new_page + (index % HPAGE_PMD_NR));
+			clear_highpage(hpage + (index % HPAGE_PMD_NR));
 			index++;
 		}
 
-		SetPageUptodate(new_page);
-		page_ref_add(new_page, HPAGE_PMD_NR - 1);
+		SetPageUptodate(hpage);
+		page_ref_add(hpage, HPAGE_PMD_NR - 1);
 		if (is_shmem)
-			set_page_dirty(new_page);
-		lru_cache_add(new_page);
+			set_page_dirty(hpage);
+		lru_cache_add(hpage);
 
 		/*
 		 * Remove pte page tables, so we can re-fault the page as huge.
 		 */
 		retract_page_tables(mapping, start);
-		*hpage = NULL;
-
-		khugepaged_pages_collapsed++;
+		unlock_page(hpage);
+		hpage = NULL;
+		goto out;
 	} else {
 		struct page *page;
 
@@ -1943,22 +1933,22 @@  static void collapse_file(struct mm_struct *mm, struct file *file,
 		VM_BUG_ON(nr_none);
 		xas_unlock_irq(&xas);
 
-		new_page->mapping = NULL;
+		hpage->mapping = NULL;
 	}
 
-	unlock_page(new_page);
+	unlock_page(hpage);
 out:
 	VM_BUG_ON(!list_empty(&pagelist));
-	if (!IS_ERR_OR_NULL(*hpage)) {
-		mem_cgroup_uncharge(page_folio(*hpage));
-		put_page(*hpage);
+	if (hpage) {
+		mem_cgroup_uncharge(page_folio(hpage));
+		put_page(hpage);
 	}
 	/* TODO: tracepoints */
+	return result;
 }
 
-static void khugepaged_scan_file(struct mm_struct *mm, struct file *file,
-				 pgoff_t start, struct page **hpage,
-				 struct collapse_control *cc)
+static int khugepaged_scan_file(struct mm_struct *mm, struct file *file,
+				pgoff_t start, struct collapse_control *cc)
 {
 	struct page *page = NULL;
 	struct address_space *mapping = file->f_mapping;
@@ -2031,15 +2021,16 @@  static void khugepaged_scan_file(struct mm_struct *mm, struct file *file,
 			result = SCAN_EXCEED_NONE_PTE;
 			count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
 		} else {
-			collapse_file(mm, file, start, hpage, cc);
+			result = collapse_file(mm, file, start, cc);
 		}
 	}
 
 	/* TODO: tracepoints */
+	return result;
 }
 #else
-static void khugepaged_scan_file(struct mm_struct *mm, struct file *file,
-				 pgoff_t start, struct collapse_control *cc)
+static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, pgoff_t start,
+				struct collapse_control *cc)
 {
 	BUILD_BUG();
 }
@@ -2049,8 +2040,7 @@  static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
 }
 #endif
 
-static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
-					    struct page **hpage,
+static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 					    struct collapse_control *cc)
 	__releases(&khugepaged_mm_lock)
 	__acquires(&khugepaged_mm_lock)
@@ -2064,6 +2054,7 @@  static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
 
 	VM_BUG_ON(!pages);
 	lockdep_assert_held(&khugepaged_mm_lock);
+	*result = SCAN_FAIL;
 
 	if (khugepaged_scan.mm_slot)
 		mm_slot = khugepaged_scan.mm_slot;
@@ -2117,7 +2108,8 @@  static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
 			goto skip;
 
 		while (khugepaged_scan.address < hend) {
-			int ret;
+			bool mmap_locked = true;
+
 			cond_resched();
 			if (unlikely(khugepaged_test_exit(mm)))
 				goto breakouterloop;
@@ -2134,20 +2126,28 @@  static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
 						khugepaged_scan.address);
 
 				mmap_read_unlock(mm);
-				ret = 1;
-				khugepaged_scan_file(mm, file, pgoff, hpage,
-						     cc);
+				mmap_locked = false;
+				*result = khugepaged_scan_file(mm, file, pgoff,
+							       cc);
 				fput(file);
 			} else {
-				ret = khugepaged_scan_pmd(mm, vma,
-						khugepaged_scan.address,
-						hpage, cc);
+				*result = khugepaged_scan_pmd(mm, vma,
+							      khugepaged_scan.address,
+							      &mmap_locked, cc);
 			}
+			if (*result == SCAN_SUCCEED)
+				++khugepaged_pages_collapsed;
 			/* move to next address */
 			khugepaged_scan.address += HPAGE_PMD_SIZE;
 			progress += HPAGE_PMD_NR;
-			if (ret)
-				/* we released mmap_lock so break loop */
+			if (!mmap_locked)
+				/*
+				 * We released mmap_lock so break loop.  Note
+				 * that we drop mmap_lock before all hugepage
+				 * allocations, so if allocation fails, we are
+				 * guaranteed to break here and report the
+				 * correct result back to caller.
+				 */
 				goto breakouterloop_mmap_lock;
 			if (progress >= pages)
 				goto breakouterloop;
@@ -2199,15 +2199,15 @@  static int khugepaged_wait_event(void)
 
 static void khugepaged_do_scan(struct collapse_control *cc)
 {
-	struct page *hpage = NULL;
 	unsigned int progress = 0, pass_through_head = 0;
 	unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
 	bool wait = true;
+	int result = SCAN_SUCCEED;
 
 	lru_add_drain_all();
 
 	while (progress < pages) {
-		if (alloc_fail_should_sleep(&hpage, &wait))
+		if (alloc_fail_should_sleep(result, &wait))
 			break;
 
 		cond_resched();
@@ -2221,7 +2221,7 @@  static void khugepaged_do_scan(struct collapse_control *cc)
 		if (khugepaged_has_work() &&
 		    pass_through_head < 2)
 			progress += khugepaged_scan_mm_slot(pages - progress,
-							    &hpage, cc);
+							    &result, cc);
 		else
 			progress = pages;
 		spin_unlock(&khugepaged_mm_lock);