Message ID | 20220604004004.954674-7-zokeefe@google.com (mailing list archive)
State | New
Series | mm: userspace hugepage collapse
On Fri, Jun 3, 2022 at 5:40 PM Zach O'Keefe <zokeefe@google.com> wrote: > > Pipe enum scan_result codes back through return values of functions > downstream of khugepaged_scan_file() and khugepaged_scan_pmd() to > inform callers if the operation was successful, and if not, why. > > Since khugepaged_scan_pmd()'s return value already has a specific > meaning (whether mmap_lock was unlocked or not), add a bool* argument > to khugepaged_scan_pmd() to retrieve this information. > > Change khugepaged to take action based on the return values of > khugepaged_scan_file() and khugepaged_scan_pmd() instead of acting > deep within the collapsing functions themselves. > > Remove dependency on error pointers to communicate to khugepaged that > allocation failed and it should sleep; instead just use the result of > the scan (SCAN_ALLOC_HUGE_PAGE_FAIL if allocation fails). > > Signed-off-by: Zach O'Keefe <zokeefe@google.com> Reviewed-by: Yang Shi <shy828301@gmail.com> A couple of minor nits below... > --- > mm/khugepaged.c | 192 ++++++++++++++++++++++++------------------------ > 1 file changed, 96 insertions(+), 96 deletions(-) > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c > index ba722347bebd..03e0da0008f1 100644 > --- a/mm/khugepaged.c > +++ b/mm/khugepaged.c > @@ -722,13 +722,13 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, > result = SCAN_SUCCEED; > trace_mm_collapse_huge_page_isolate(page, none_or_zero, > referenced, writable, result); > - return 1; > + return SCAN_SUCCEED; You could do "return result" too. > } > out: > release_pte_pages(pte, _pte, compound_pagelist); > trace_mm_collapse_huge_page_isolate(page, none_or_zero, > referenced, writable, result); > - return 0; > + return result; > } > > static void __collapse_huge_page_copy(pte_t *pte, struct page *page, > @@ -850,14 +850,13 @@ static int khugepaged_find_target_node(struct collapse_control *cc) > #endif > > /* Sleep for the first alloc fail, break the loop for the second fail */ > -static bool alloc_fail_should_sleep(struct page **hpage, bool *wait) > +static bool alloc_fail_should_sleep(int result, bool *wait) > { > - if (IS_ERR(*hpage)) { > + if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) { > if (!*wait) > return true; > > *wait = false; > - *hpage = NULL; > khugepaged_alloc_sleep(); > } > return false; > @@ -868,7 +867,6 @@ static bool khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) > *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER); > if (unlikely(!*hpage)) { > count_vm_event(THP_COLLAPSE_ALLOC_FAILED); > - *hpage = ERR_PTR(-ENOMEM); > return false; > } > > @@ -1010,17 +1008,17 @@ static int alloc_charge_hpage(struct mm_struct *mm, struct page **hpage, > return SCAN_SUCCEED; > } > > -static void collapse_huge_page(struct mm_struct *mm, unsigned long address, > - struct page **hpage, int referenced, > - int unmapped, struct collapse_control *cc) > +static int collapse_huge_page(struct mm_struct *mm, unsigned long address, > + int referenced, int unmapped, > + struct collapse_control *cc) > { > LIST_HEAD(compound_pagelist); > pmd_t *pmd, _pmd; > pte_t *pte; > pgtable_t pgtable; > - struct page *new_page; > + struct page *hpage; > spinlock_t *pmd_ptl, *pte_ptl; > - int isolated = 0, result = 0; > + int result = SCAN_FAIL; > struct vm_area_struct *vma; > struct mmu_notifier_range range; > > @@ -1034,12 +1032,10 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address, > */ > mmap_read_unlock(mm); > > - result = alloc_charge_hpage(mm, hpage, cc); > + result = 
alloc_charge_hpage(mm, &hpage, cc); > if (result != SCAN_SUCCEED) > goto out_nolock; > > - new_page = *hpage; > - > mmap_read_lock(mm); > result = hugepage_vma_revalidate(mm, address, &vma); > if (result) { > @@ -1100,11 +1096,11 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address, > mmu_notifier_invalidate_range_end(&range); > > spin_lock(pte_ptl); > - isolated = __collapse_huge_page_isolate(vma, address, pte, > - &compound_pagelist); > + result = __collapse_huge_page_isolate(vma, address, pte, > + &compound_pagelist); > spin_unlock(pte_ptl); > > - if (unlikely(!isolated)) { > + if (unlikely(result != SCAN_SUCCEED)) { > pte_unmap(pte); > spin_lock(pmd_ptl); > BUG_ON(!pmd_none(*pmd)); > @@ -1116,7 +1112,6 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address, > pmd_populate(mm, pmd, pmd_pgtable(_pmd)); > spin_unlock(pmd_ptl); > anon_vma_unlock_write(vma->anon_vma); > - result = SCAN_FAIL; > goto out_up_write; > } > > @@ -1126,8 +1121,8 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address, > */ > anon_vma_unlock_write(vma->anon_vma); > > - __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl, > - &compound_pagelist); > + __collapse_huge_page_copy(pte, hpage, vma, address, pte_ptl, > + &compound_pagelist); > pte_unmap(pte); > /* > * spin_lock() below is not the equivalent of smp_wmb(), but > @@ -1135,43 +1130,42 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address, > * avoid the copy_huge_page writes to become visible after > * the set_pmd_at() write. > */ > - __SetPageUptodate(new_page); > + __SetPageUptodate(hpage); > pgtable = pmd_pgtable(_pmd); > > - _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); > + _pmd = mk_huge_pmd(hpage, vma->vm_page_prot); > _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); > > spin_lock(pmd_ptl); > BUG_ON(!pmd_none(*pmd)); > - page_add_new_anon_rmap(new_page, vma, address); > - lru_cache_add_inactive_or_unevictable(new_page, vma); > + page_add_new_anon_rmap(hpage, vma, address); > + lru_cache_add_inactive_or_unevictable(hpage, vma); > pgtable_trans_huge_deposit(mm, pmd, pgtable); > set_pmd_at(mm, address, pmd, _pmd); > update_mmu_cache_pmd(vma, address, pmd); > spin_unlock(pmd_ptl); > > - *hpage = NULL; > + hpage = NULL; > > - khugepaged_pages_collapsed++; > result = SCAN_SUCCEED; > out_up_write: > mmap_write_unlock(mm); > out_nolock: > - if (!IS_ERR_OR_NULL(*hpage)) { > - mem_cgroup_uncharge(page_folio(*hpage)); > - put_page(*hpage); > + if (hpage) { > + mem_cgroup_uncharge(page_folio(hpage)); > + put_page(hpage); > } > - trace_mm_collapse_huge_page(mm, isolated, result); > - return; > + trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result); > + return result; > } > > static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, > - unsigned long address, struct page **hpage, > + unsigned long address, bool *mmap_locked, > struct collapse_control *cc) > { > pmd_t *pmd; > pte_t *pte, *_pte; > - int ret = 0, result = 0, referenced = 0; > + int result = SCAN_FAIL, referenced = 0; > int none_or_zero = 0, shared = 0; > struct page *page = NULL; > unsigned long _address; > @@ -1306,19 +1300,19 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, > result = SCAN_LACK_REFERENCED_PAGE; > } else { > result = SCAN_SUCCEED; > - ret = 1; > } > out_unmap: > pte_unmap_unlock(pte, ptl); > - if (ret) { > + if (result == SCAN_SUCCEED) { > /* collapse_huge_page will return with the mmap_lock released */ > - 
collapse_huge_page(mm, address, hpage, referenced, unmapped, > - cc); > + *mmap_locked = false; > + result = collapse_huge_page(mm, address, referenced, > + unmapped, cc); Shall move "*mmap_locked = false" after collapse_huge_page() call? No functional change, but seems more consistent. > } > out: > trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, > none_or_zero, result, unmapped); > - return ret; > + return result; > } > > static void collect_mm_slot(struct mm_slot *mm_slot) > @@ -1581,7 +1575,6 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) > * @mm: process address space where collapse happens > * @file: file that collapse on > * @start: collapse start address > - * @hpage: new allocated huge page for collapse > * @cc: collapse context and scratchpad > * > * Basic scheme is simple, details are more complex: > @@ -1599,12 +1592,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) > * + restore gaps in the page cache; > * + unlock and free huge page; > */ > -static void collapse_file(struct mm_struct *mm, struct file *file, > - pgoff_t start, struct page **hpage, > - struct collapse_control *cc) > +static int collapse_file(struct mm_struct *mm, struct file *file, > + pgoff_t start, struct collapse_control *cc) > { > struct address_space *mapping = file->f_mapping; > - struct page *new_page; > + struct page *hpage; > pgoff_t index, end = start + HPAGE_PMD_NR; > LIST_HEAD(pagelist); > XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); > @@ -1615,12 +1607,10 @@ static void collapse_file(struct mm_struct *mm, struct file *file, > VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); > VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); > > - result = alloc_charge_hpage(mm, hpage, cc); > + result = alloc_charge_hpage(mm, &hpage, cc); > if (result != SCAN_SUCCEED) > goto out; > > - new_page = *hpage; > - > /* > * Ensure we have slots for all the pages in the range. This is > * almost certainly a no-op because most of the pages must be present > @@ -1637,14 +1627,14 @@ static void collapse_file(struct mm_struct *mm, struct file *file, > } > } while (1); > > - __SetPageLocked(new_page); > + __SetPageLocked(hpage); > if (is_shmem) > - __SetPageSwapBacked(new_page); > - new_page->index = start; > - new_page->mapping = mapping; > + __SetPageSwapBacked(hpage); > + hpage->index = start; > + hpage->mapping = mapping; > > /* > - * At this point the new_page is locked and not up-to-date. > + * At this point the hpage is locked and not up-to-date. > * It's safe to insert it into the page cache, because nobody would > * be able to map it or use it in another way until we unlock it. > */ > @@ -1672,7 +1662,7 @@ static void collapse_file(struct mm_struct *mm, struct file *file, > result = SCAN_FAIL; > goto xa_locked; > } > - xas_store(&xas, new_page); > + xas_store(&xas, hpage); > nr_none++; > continue; > } > @@ -1814,19 +1804,19 @@ static void collapse_file(struct mm_struct *mm, struct file *file, > list_add_tail(&page->lru, &pagelist); > > /* Finally, replace with the new page. 
*/ > - xas_store(&xas, new_page); > + xas_store(&xas, hpage); > continue; > out_unlock: > unlock_page(page); > put_page(page); > goto xa_unlocked; > } > - nr = thp_nr_pages(new_page); > + nr = thp_nr_pages(hpage); > > if (is_shmem) > - __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr); > + __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr); > else { > - __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr); > + __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr); > filemap_nr_thps_inc(mapping); > /* > * Paired with smp_mb() in do_dentry_open() to ensure > @@ -1837,21 +1827,21 @@ static void collapse_file(struct mm_struct *mm, struct file *file, > smp_mb(); > if (inode_is_open_for_write(mapping->host)) { > result = SCAN_FAIL; > - __mod_lruvec_page_state(new_page, NR_FILE_THPS, -nr); > + __mod_lruvec_page_state(hpage, NR_FILE_THPS, -nr); > filemap_nr_thps_dec(mapping); > goto xa_locked; > } > } > > if (nr_none) { > - __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none); > + __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none); > if (is_shmem) > - __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none); > + __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none); > } > > /* Join all the small entries into a single multi-index entry */ > xas_set_order(&xas, start, HPAGE_PMD_ORDER); > - xas_store(&xas, new_page); > + xas_store(&xas, hpage); > xa_locked: > xas_unlock_irq(&xas); > xa_unlocked: > @@ -1873,11 +1863,11 @@ static void collapse_file(struct mm_struct *mm, struct file *file, > index = start; > list_for_each_entry_safe(page, tmp, &pagelist, lru) { > while (index < page->index) { > - clear_highpage(new_page + (index % HPAGE_PMD_NR)); > + clear_highpage(hpage + (index % HPAGE_PMD_NR)); > index++; > } > - copy_highpage(new_page + (page->index % HPAGE_PMD_NR), > - page); > + copy_highpage(hpage + (page->index % HPAGE_PMD_NR), > + page); > list_del(&page->lru); > page->mapping = NULL; > page_ref_unfreeze(page, 1); > @@ -1888,23 +1878,23 @@ static void collapse_file(struct mm_struct *mm, struct file *file, > index++; > } > while (index < end) { > - clear_highpage(new_page + (index % HPAGE_PMD_NR)); > + clear_highpage(hpage + (index % HPAGE_PMD_NR)); > index++; > } > > - SetPageUptodate(new_page); > - page_ref_add(new_page, HPAGE_PMD_NR - 1); > + SetPageUptodate(hpage); > + page_ref_add(hpage, HPAGE_PMD_NR - 1); > if (is_shmem) > - set_page_dirty(new_page); > - lru_cache_add(new_page); > + set_page_dirty(hpage); > + lru_cache_add(hpage); > > /* > * Remove pte page tables, so we can re-fault the page as huge. > */ > retract_page_tables(mapping, start); > - *hpage = NULL; > - > - khugepaged_pages_collapsed++; > + unlock_page(hpage); > + hpage = NULL; > + goto out; Maybe just set hpage to NULL here, then later... 
> } else { > struct page *page; > > @@ -1943,22 +1933,22 @@ static void collapse_file(struct mm_struct *mm, struct file *file, > VM_BUG_ON(nr_none); > xas_unlock_irq(&xas); > > - new_page->mapping = NULL; > + hpage->mapping = NULL; > } > > - unlock_page(new_page); > + unlock_page(hpage); do if (hpage) unlock_page(hpage); > out: > VM_BUG_ON(!list_empty(&pagelist)); > - if (!IS_ERR_OR_NULL(*hpage)) { > - mem_cgroup_uncharge(page_folio(*hpage)); > - put_page(*hpage); > + if (hpage) { > + mem_cgroup_uncharge(page_folio(hpage)); > + put_page(hpage); > } > /* TODO: tracepoints */ > + return result; > } > > -static void khugepaged_scan_file(struct mm_struct *mm, struct file *file, > - pgoff_t start, struct page **hpage, > - struct collapse_control *cc) > +static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, > + pgoff_t start, struct collapse_control *cc) > { > struct page *page = NULL; > struct address_space *mapping = file->f_mapping; > @@ -2031,15 +2021,16 @@ static void khugepaged_scan_file(struct mm_struct *mm, struct file *file, > result = SCAN_EXCEED_NONE_PTE; > count_vm_event(THP_SCAN_EXCEED_NONE_PTE); > } else { > - collapse_file(mm, file, start, hpage, cc); > + result = collapse_file(mm, file, start, cc); > } > } > > /* TODO: tracepoints */ > + return result; > } > #else > -static void khugepaged_scan_file(struct mm_struct *mm, struct file *file, > - pgoff_t start, struct collapse_control *cc) > +static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, pgoff_t start, > + struct collapse_control *cc) > { > BUILD_BUG(); > } > @@ -2049,8 +2040,7 @@ static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) > } > #endif > > -static unsigned int khugepaged_scan_mm_slot(unsigned int pages, > - struct page **hpage, > +static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, > struct collapse_control *cc) > __releases(&khugepaged_mm_lock) > __acquires(&khugepaged_mm_lock) > @@ -2064,6 +2054,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, > > VM_BUG_ON(!pages); > lockdep_assert_held(&khugepaged_mm_lock); > + *result = SCAN_FAIL; > > if (khugepaged_scan.mm_slot) > mm_slot = khugepaged_scan.mm_slot; > @@ -2117,7 +2108,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, > goto skip; > > while (khugepaged_scan.address < hend) { > - int ret; > + bool mmap_locked = true; > + > cond_resched(); > if (unlikely(khugepaged_test_exit(mm))) > goto breakouterloop; > @@ -2134,20 +2126,28 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, > khugepaged_scan.address); > > mmap_read_unlock(mm); > - ret = 1; > - khugepaged_scan_file(mm, file, pgoff, hpage, > - cc); > + mmap_locked = false; > + *result = khugepaged_scan_file(mm, file, pgoff, > + cc); > fput(file); > } else { > - ret = khugepaged_scan_pmd(mm, vma, > - khugepaged_scan.address, > - hpage, cc); > + *result = khugepaged_scan_pmd(mm, vma, > + khugepaged_scan.address, > + &mmap_locked, cc); > } > + if (*result == SCAN_SUCCEED) > + ++khugepaged_pages_collapsed; > /* move to next address */ > khugepaged_scan.address += HPAGE_PMD_SIZE; > progress += HPAGE_PMD_NR; > - if (ret) > - /* we released mmap_lock so break loop */ > + if (!mmap_locked) > + /* > + * We released mmap_lock so break loop. Note > + * that we drop mmap_lock before all hugepage > + * allocations, so if allocation fails, we are > + * guaranteed to break here and report the > + * correct result back to caller. 
> + */ > goto breakouterloop_mmap_lock; > if (progress >= pages) > goto breakouterloop; > @@ -2199,15 +2199,15 @@ static int khugepaged_wait_event(void) > > static void khugepaged_do_scan(struct collapse_control *cc) > { > - struct page *hpage = NULL; > unsigned int progress = 0, pass_through_head = 0; > unsigned int pages = READ_ONCE(khugepaged_pages_to_scan); > bool wait = true; > + int result = SCAN_SUCCEED; > > lru_add_drain_all(); > > while (progress < pages) { > - if (alloc_fail_should_sleep(&hpage, &wait)) > + if (alloc_fail_should_sleep(result, &wait)) > break; > > cond_resched(); > @@ -2221,7 +2221,7 @@ static void khugepaged_do_scan(struct collapse_control *cc) > if (khugepaged_has_work() && > pass_through_head < 2) > progress += khugepaged_scan_mm_slot(pages - progress, > - &hpage, cc); > + &result, cc); > else > progress = pages; > spin_unlock(&khugepaged_mm_lock); > -- > 2.36.1.255.ge46751e96f-goog >
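For readers skimming the thread, the snippet below is a minimal userspace sketch of the control-flow shape under discussion: the scan function reports an enum scan_result and clears a bool *mmap_locked out-parameter once the collapse path has dropped the lock, with the clearing placed after the collapse call as suggested above. The trimmed enum, collapse_huge_page_stub(), and scan_pmd_like() are invented stand-ins for illustration, not the kernel functions.

```c
#include <stdbool.h>
#include <stdio.h>

/* Toy stand-ins for the scan_result codes used by the patch. */
enum scan_result {
	SCAN_FAIL,
	SCAN_SUCCEED,
};

/* Stub for collapse_huge_page(); the real one returns with mmap_lock released. */
static enum scan_result collapse_huge_page_stub(void)
{
	return SCAN_SUCCEED;
}

/*
 * Tail of a scan function shaped like khugepaged_scan_pmd() with the
 * suggested ordering: *mmap_locked is cleared after the collapse call.
 */
static enum scan_result scan_pmd_like(bool *mmap_locked)
{
	enum scan_result result = SCAN_SUCCEED;	/* pretend the scan passed */

	if (result == SCAN_SUCCEED) {
		/* collapse returns with the (imaginary) mmap_lock released */
		result = collapse_huge_page_stub();
		*mmap_locked = false;
	}
	return result;
}

int main(void)
{
	bool mmap_locked = true;
	enum scan_result result = scan_pmd_like(&mmap_locked);

	printf("result=%d mmap_locked=%d\n", result, mmap_locked);
	return 0;
}
```

Because the caller only inspects *mmap_locked after the function returns, setting it before or after the collapse call is purely a readability choice, which is why the suggestion is framed as a no-functional-change cleanup.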
On Mon, Jun 6, 2022 at 3:40 PM Yang Shi <shy828301@gmail.com> wrote: > > On Fri, Jun 3, 2022 at 5:40 PM Zach O'Keefe <zokeefe@google.com> wrote: > > > > Pipe enum scan_result codes back through return values of functions > > downstream of khugepaged_scan_file() and khugepaged_scan_pmd() to > > inform callers if the operation was successful, and if not, why. > > > > Since khugepaged_scan_pmd()'s return value already has a specific > > meaning (whether mmap_lock was unlocked or not), add a bool* argument > > to khugepaged_scan_pmd() to retrieve this information. > > > > Change khugepaged to take action based on the return values of > > khugepaged_scan_file() and khugepaged_scan_pmd() instead of acting > > deep within the collapsing functions themselves. > > > > Remove dependency on error pointers to communicate to khugepaged that > > allocation failed and it should sleep; instead just use the result of > > the scan (SCAN_ALLOC_HUGE_PAGE_FAIL if allocation fails). > > > > Signed-off-by: Zach O'Keefe <zokeefe@google.com> > > Reviewed-by: Yang Shi <shy828301@gmail.com> Hey Yang, Thanks for taking the time to review this series. Very much appreciated! > A couple of minor nits below... > > > --- > > mm/khugepaged.c | 192 ++++++++++++++++++++++++------------------------ > > 1 file changed, 96 insertions(+), 96 deletions(-) > > > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c > > index ba722347bebd..03e0da0008f1 100644 > > --- a/mm/khugepaged.c > > +++ b/mm/khugepaged.c > > @@ -722,13 +722,13 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, > > result = SCAN_SUCCEED; > > trace_mm_collapse_huge_page_isolate(page, none_or_zero, > > referenced, writable, result); > > - return 1; > > + return SCAN_SUCCEED; > > You could do "return result" too. Thanks for catching. Done. 
> > } > > out: > > release_pte_pages(pte, _pte, compound_pagelist); > > trace_mm_collapse_huge_page_isolate(page, none_or_zero, > > referenced, writable, result); > > - return 0; > > + return result; > > } > > > > static void __collapse_huge_page_copy(pte_t *pte, struct page *page, > > @@ -850,14 +850,13 @@ static int khugepaged_find_target_node(struct collapse_control *cc) > > #endif > > > > /* Sleep for the first alloc fail, break the loop for the second fail */ > > -static bool alloc_fail_should_sleep(struct page **hpage, bool *wait) > > +static bool alloc_fail_should_sleep(int result, bool *wait) > > { > > - if (IS_ERR(*hpage)) { > > + if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) { > > if (!*wait) > > return true; > > > > *wait = false; > > - *hpage = NULL; > > khugepaged_alloc_sleep(); > > } > > return false; > > @@ -868,7 +867,6 @@ static bool khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) > > *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER); > > if (unlikely(!*hpage)) { > > count_vm_event(THP_COLLAPSE_ALLOC_FAILED); > > - *hpage = ERR_PTR(-ENOMEM); > > return false; > > } > > > > @@ -1010,17 +1008,17 @@ static int alloc_charge_hpage(struct mm_struct *mm, struct page **hpage, > > return SCAN_SUCCEED; > > } > > > > -static void collapse_huge_page(struct mm_struct *mm, unsigned long address, > > - struct page **hpage, int referenced, > > - int unmapped, struct collapse_control *cc) > > +static int collapse_huge_page(struct mm_struct *mm, unsigned long address, > > + int referenced, int unmapped, > > + struct collapse_control *cc) > > { > > LIST_HEAD(compound_pagelist); > > pmd_t *pmd, _pmd; > > pte_t *pte; > > pgtable_t pgtable; > > - struct page *new_page; > > + struct page *hpage; > > spinlock_t *pmd_ptl, *pte_ptl; > > - int isolated = 0, result = 0; > > + int result = SCAN_FAIL; > > struct vm_area_struct *vma; > > struct mmu_notifier_range range; > > > > @@ -1034,12 +1032,10 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address, > > */ > > mmap_read_unlock(mm); > > > > - result = alloc_charge_hpage(mm, hpage, cc); > > + result = alloc_charge_hpage(mm, &hpage, cc); > > if (result != SCAN_SUCCEED) > > goto out_nolock; > > > > - new_page = *hpage; > > - > > mmap_read_lock(mm); > > result = hugepage_vma_revalidate(mm, address, &vma); > > if (result) { > > @@ -1100,11 +1096,11 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address, > > mmu_notifier_invalidate_range_end(&range); > > > > spin_lock(pte_ptl); > > - isolated = __collapse_huge_page_isolate(vma, address, pte, > > - &compound_pagelist); > > + result = __collapse_huge_page_isolate(vma, address, pte, > > + &compound_pagelist); > > spin_unlock(pte_ptl); > > > > - if (unlikely(!isolated)) { > > + if (unlikely(result != SCAN_SUCCEED)) { > > pte_unmap(pte); > > spin_lock(pmd_ptl); > > BUG_ON(!pmd_none(*pmd)); > > @@ -1116,7 +1112,6 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address, > > pmd_populate(mm, pmd, pmd_pgtable(_pmd)); > > spin_unlock(pmd_ptl); > > anon_vma_unlock_write(vma->anon_vma); > > - result = SCAN_FAIL; > > goto out_up_write; > > } > > > > @@ -1126,8 +1121,8 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address, > > */ > > anon_vma_unlock_write(vma->anon_vma); > > > > - __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl, > > - &compound_pagelist); > > + __collapse_huge_page_copy(pte, hpage, vma, address, pte_ptl, > > + &compound_pagelist); > > pte_unmap(pte); > > /* > > * spin_lock() below 
is not the equivalent of smp_wmb(), but > > @@ -1135,43 +1130,42 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address, > > * avoid the copy_huge_page writes to become visible after > > * the set_pmd_at() write. > > */ > > - __SetPageUptodate(new_page); > > + __SetPageUptodate(hpage); > > pgtable = pmd_pgtable(_pmd); > > > > - _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); > > + _pmd = mk_huge_pmd(hpage, vma->vm_page_prot); > > _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); > > > > spin_lock(pmd_ptl); > > BUG_ON(!pmd_none(*pmd)); > > - page_add_new_anon_rmap(new_page, vma, address); > > - lru_cache_add_inactive_or_unevictable(new_page, vma); > > + page_add_new_anon_rmap(hpage, vma, address); > > + lru_cache_add_inactive_or_unevictable(hpage, vma); > > pgtable_trans_huge_deposit(mm, pmd, pgtable); > > set_pmd_at(mm, address, pmd, _pmd); > > update_mmu_cache_pmd(vma, address, pmd); > > spin_unlock(pmd_ptl); > > > > - *hpage = NULL; > > + hpage = NULL; > > > > - khugepaged_pages_collapsed++; > > result = SCAN_SUCCEED; > > out_up_write: > > mmap_write_unlock(mm); > > out_nolock: > > - if (!IS_ERR_OR_NULL(*hpage)) { > > - mem_cgroup_uncharge(page_folio(*hpage)); > > - put_page(*hpage); > > + if (hpage) { > > + mem_cgroup_uncharge(page_folio(hpage)); > > + put_page(hpage); > > } > > - trace_mm_collapse_huge_page(mm, isolated, result); > > - return; > > + trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result); > > + return result; > > } > > > > static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, > > - unsigned long address, struct page **hpage, > > + unsigned long address, bool *mmap_locked, > > struct collapse_control *cc) > > { > > pmd_t *pmd; > > pte_t *pte, *_pte; > > - int ret = 0, result = 0, referenced = 0; > > + int result = SCAN_FAIL, referenced = 0; > > int none_or_zero = 0, shared = 0; > > struct page *page = NULL; > > unsigned long _address; > > @@ -1306,19 +1300,19 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, > > result = SCAN_LACK_REFERENCED_PAGE; > > } else { > > result = SCAN_SUCCEED; > > - ret = 1; > > } > > out_unmap: > > pte_unmap_unlock(pte, ptl); > > - if (ret) { > > + if (result == SCAN_SUCCEED) { > > /* collapse_huge_page will return with the mmap_lock released */ > > - collapse_huge_page(mm, address, hpage, referenced, unmapped, > > - cc); > > + *mmap_locked = false; > > + result = collapse_huge_page(mm, address, referenced, > > + unmapped, cc); > > Shall move "*mmap_locked = false" after collapse_huge_page() call? No > functional change, but seems more consistent. Done. 
> > } > > out: > > trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, > > none_or_zero, result, unmapped); > > - return ret; > > + return result; > > } > > > > static void collect_mm_slot(struct mm_slot *mm_slot) > > @@ -1581,7 +1575,6 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) > > * @mm: process address space where collapse happens > > * @file: file that collapse on > > * @start: collapse start address > > - * @hpage: new allocated huge page for collapse > > * @cc: collapse context and scratchpad > > * > > * Basic scheme is simple, details are more complex: > > @@ -1599,12 +1592,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) > > * + restore gaps in the page cache; > > * + unlock and free huge page; > > */ > > -static void collapse_file(struct mm_struct *mm, struct file *file, > > - pgoff_t start, struct page **hpage, > > - struct collapse_control *cc) > > +static int collapse_file(struct mm_struct *mm, struct file *file, > > + pgoff_t start, struct collapse_control *cc) > > { > > struct address_space *mapping = file->f_mapping; > > - struct page *new_page; > > + struct page *hpage; > > pgoff_t index, end = start + HPAGE_PMD_NR; > > LIST_HEAD(pagelist); > > XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); > > @@ -1615,12 +1607,10 @@ static void collapse_file(struct mm_struct *mm, struct file *file, > > VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); > > VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); > > > > - result = alloc_charge_hpage(mm, hpage, cc); > > + result = alloc_charge_hpage(mm, &hpage, cc); > > if (result != SCAN_SUCCEED) > > goto out; > > > > - new_page = *hpage; > > - > > /* > > * Ensure we have slots for all the pages in the range. This is > > * almost certainly a no-op because most of the pages must be present > > @@ -1637,14 +1627,14 @@ static void collapse_file(struct mm_struct *mm, struct file *file, > > } > > } while (1); > > > > - __SetPageLocked(new_page); > > + __SetPageLocked(hpage); > > if (is_shmem) > > - __SetPageSwapBacked(new_page); > > - new_page->index = start; > > - new_page->mapping = mapping; > > + __SetPageSwapBacked(hpage); > > + hpage->index = start; > > + hpage->mapping = mapping; > > > > /* > > - * At this point the new_page is locked and not up-to-date. > > + * At this point the hpage is locked and not up-to-date. > > * It's safe to insert it into the page cache, because nobody would > > * be able to map it or use it in another way until we unlock it. > > */ > > @@ -1672,7 +1662,7 @@ static void collapse_file(struct mm_struct *mm, struct file *file, > > result = SCAN_FAIL; > > goto xa_locked; > > } > > - xas_store(&xas, new_page); > > + xas_store(&xas, hpage); > > nr_none++; > > continue; > > } > > @@ -1814,19 +1804,19 @@ static void collapse_file(struct mm_struct *mm, struct file *file, > > list_add_tail(&page->lru, &pagelist); > > > > /* Finally, replace with the new page. 
*/ > > - xas_store(&xas, new_page); > > + xas_store(&xas, hpage); > > continue; > > out_unlock: > > unlock_page(page); > > put_page(page); > > goto xa_unlocked; > > } > > - nr = thp_nr_pages(new_page); > > + nr = thp_nr_pages(hpage); > > > > if (is_shmem) > > - __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr); > > + __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr); > > else { > > - __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr); > > + __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr); > > filemap_nr_thps_inc(mapping); > > /* > > * Paired with smp_mb() in do_dentry_open() to ensure > > @@ -1837,21 +1827,21 @@ static void collapse_file(struct mm_struct *mm, struct file *file, > > smp_mb(); > > if (inode_is_open_for_write(mapping->host)) { > > result = SCAN_FAIL; > > - __mod_lruvec_page_state(new_page, NR_FILE_THPS, -nr); > > + __mod_lruvec_page_state(hpage, NR_FILE_THPS, -nr); > > filemap_nr_thps_dec(mapping); > > goto xa_locked; > > } > > } > > > > if (nr_none) { > > - __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none); > > + __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none); > > if (is_shmem) > > - __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none); > > + __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none); > > } > > > > /* Join all the small entries into a single multi-index entry */ > > xas_set_order(&xas, start, HPAGE_PMD_ORDER); > > - xas_store(&xas, new_page); > > + xas_store(&xas, hpage); > > xa_locked: > > xas_unlock_irq(&xas); > > xa_unlocked: > > @@ -1873,11 +1863,11 @@ static void collapse_file(struct mm_struct *mm, struct file *file, > > index = start; > > list_for_each_entry_safe(page, tmp, &pagelist, lru) { > > while (index < page->index) { > > - clear_highpage(new_page + (index % HPAGE_PMD_NR)); > > + clear_highpage(hpage + (index % HPAGE_PMD_NR)); > > index++; > > } > > - copy_highpage(new_page + (page->index % HPAGE_PMD_NR), > > - page); > > + copy_highpage(hpage + (page->index % HPAGE_PMD_NR), > > + page); > > list_del(&page->lru); > > page->mapping = NULL; > > page_ref_unfreeze(page, 1); > > @@ -1888,23 +1878,23 @@ static void collapse_file(struct mm_struct *mm, struct file *file, > > index++; > > } > > while (index < end) { > > - clear_highpage(new_page + (index % HPAGE_PMD_NR)); > > + clear_highpage(hpage + (index % HPAGE_PMD_NR)); > > index++; > > } > > > > - SetPageUptodate(new_page); > > - page_ref_add(new_page, HPAGE_PMD_NR - 1); > > + SetPageUptodate(hpage); > > + page_ref_add(hpage, HPAGE_PMD_NR - 1); > > if (is_shmem) > > - set_page_dirty(new_page); > > - lru_cache_add(new_page); > > + set_page_dirty(hpage); > > + lru_cache_add(hpage); > > > > /* > > * Remove pte page tables, so we can re-fault the page as huge. > > */ > > retract_page_tables(mapping, start); > > - *hpage = NULL; > > - > > - khugepaged_pages_collapsed++; > > + unlock_page(hpage); > > + hpage = NULL; > > + goto out; > > Maybe just set hpage to NULL here, then later... > > > } else { > > struct page *page; > > > > @@ -1943,22 +1933,22 @@ static void collapse_file(struct mm_struct *mm, struct file *file, > > VM_BUG_ON(nr_none); > > xas_unlock_irq(&xas); > > > > - new_page->mapping = NULL; > > + hpage->mapping = NULL; > > } > > > > - unlock_page(new_page); > > + unlock_page(hpage); > > do > if (hpage) > unlock_page(hpage); I don't have a strong preference here. Done. 
> > out: > > VM_BUG_ON(!list_empty(&pagelist)); > > - if (!IS_ERR_OR_NULL(*hpage)) { > > - mem_cgroup_uncharge(page_folio(*hpage)); > > - put_page(*hpage); > > + if (hpage) { > > + mem_cgroup_uncharge(page_folio(hpage)); > > + put_page(hpage); > > } > > /* TODO: tracepoints */ > > + return result; > > } > > > > -static void khugepaged_scan_file(struct mm_struct *mm, struct file *file, > > - pgoff_t start, struct page **hpage, > > - struct collapse_control *cc) > > +static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, > > + pgoff_t start, struct collapse_control *cc) > > { > > struct page *page = NULL; > > struct address_space *mapping = file->f_mapping; > > @@ -2031,15 +2021,16 @@ static void khugepaged_scan_file(struct mm_struct *mm, struct file *file, > > result = SCAN_EXCEED_NONE_PTE; > > count_vm_event(THP_SCAN_EXCEED_NONE_PTE); > > } else { > > - collapse_file(mm, file, start, hpage, cc); > > + result = collapse_file(mm, file, start, cc); > > } > > } > > > > /* TODO: tracepoints */ > > + return result; > > } > > #else > > -static void khugepaged_scan_file(struct mm_struct *mm, struct file *file, > > - pgoff_t start, struct collapse_control *cc) > > +static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, pgoff_t start, > > + struct collapse_control *cc) > > { > > BUILD_BUG(); > > } > > @@ -2049,8 +2040,7 @@ static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) > > } > > #endif > > > > -static unsigned int khugepaged_scan_mm_slot(unsigned int pages, > > - struct page **hpage, > > +static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, > > struct collapse_control *cc) > > __releases(&khugepaged_mm_lock) > > __acquires(&khugepaged_mm_lock) > > @@ -2064,6 +2054,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, > > > > VM_BUG_ON(!pages); > > lockdep_assert_held(&khugepaged_mm_lock); > > + *result = SCAN_FAIL; > > > > if (khugepaged_scan.mm_slot) > > mm_slot = khugepaged_scan.mm_slot; > > @@ -2117,7 +2108,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, > > goto skip; > > > > while (khugepaged_scan.address < hend) { > > - int ret; > > + bool mmap_locked = true; > > + > > cond_resched(); > > if (unlikely(khugepaged_test_exit(mm))) > > goto breakouterloop; > > @@ -2134,20 +2126,28 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, > > khugepaged_scan.address); > > > > mmap_read_unlock(mm); > > - ret = 1; > > - khugepaged_scan_file(mm, file, pgoff, hpage, > > - cc); > > + mmap_locked = false; > > + *result = khugepaged_scan_file(mm, file, pgoff, > > + cc); > > fput(file); > > } else { > > - ret = khugepaged_scan_pmd(mm, vma, > > - khugepaged_scan.address, > > - hpage, cc); > > + *result = khugepaged_scan_pmd(mm, vma, > > + khugepaged_scan.address, > > + &mmap_locked, cc); > > } > > + if (*result == SCAN_SUCCEED) > > + ++khugepaged_pages_collapsed; > > /* move to next address */ > > khugepaged_scan.address += HPAGE_PMD_SIZE; > > progress += HPAGE_PMD_NR; > > - if (ret) > > - /* we released mmap_lock so break loop */ > > + if (!mmap_locked) > > + /* > > + * We released mmap_lock so break loop. Note > > + * that we drop mmap_lock before all hugepage > > + * allocations, so if allocation fails, we are > > + * guaranteed to break here and report the > > + * correct result back to caller. 
> > + */ > > goto breakouterloop_mmap_lock; > > if (progress >= pages) > > goto breakouterloop; > > @@ -2199,15 +2199,15 @@ static int khugepaged_wait_event(void) > > > > static void khugepaged_do_scan(struct collapse_control *cc) > > { > > - struct page *hpage = NULL; > > unsigned int progress = 0, pass_through_head = 0; > > unsigned int pages = READ_ONCE(khugepaged_pages_to_scan); > > bool wait = true; > > + int result = SCAN_SUCCEED; > > > > lru_add_drain_all(); > > > > while (progress < pages) { > > - if (alloc_fail_should_sleep(&hpage, &wait)) > > + if (alloc_fail_should_sleep(result, &wait)) > > break; > > > > cond_resched(); > > @@ -2221,7 +2221,7 @@ static void khugepaged_do_scan(struct collapse_control *cc) > > if (khugepaged_has_work() && > > pass_through_head < 2) > > progress += khugepaged_scan_mm_slot(pages - progress, > > - &hpage, cc); > > + &result, cc); > > else > > progress = pages; > > spin_unlock(&khugepaged_mm_lock); > > -- > > 2.36.1.255.ge46751e96f-goog > >
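As a side note on the allocation-failure handling the patch switches to, here is a plain userspace model of the patched alloc_fail_should_sleep() logic quoted above: sleep on the first SCAN_ALLOC_HUGE_PAGE_FAIL, tell the caller to break the loop on the second. The enum is trimmed to the values needed here and alloc_sleep_stub() stands in for khugepaged_alloc_sleep(); this is a sketch of the logic, not the kernel code itself.

```c
#include <stdbool.h>
#include <stdio.h>

/* Toy subset of the scan_result codes relevant to the sleep logic. */
enum scan_result {
	SCAN_SUCCEED,
	SCAN_FAIL,
	SCAN_ALLOC_HUGE_PAGE_FAIL,
};

/* Stand-in for khugepaged_alloc_sleep(); the real one actually sleeps. */
static void alloc_sleep_stub(void)
{
	printf("sleeping after first allocation failure\n");
}

/*
 * Same shape as the patched helper: sleep on the first allocation failure,
 * ask the caller to break out of the scan loop on the second.
 */
static bool alloc_fail_should_sleep(int result, bool *wait)
{
	if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) {
		if (!*wait)
			return true;
		*wait = false;
		alloc_sleep_stub();
	}
	return false;
}

int main(void)
{
	bool wait = true;

	/* First failure: sleeps, does not break. */
	printf("break=%d\n", alloc_fail_should_sleep(SCAN_ALLOC_HUGE_PAGE_FAIL, &wait));
	/* Second failure: the caller is told to break out of the scan loop. */
	printf("break=%d\n", alloc_fail_should_sleep(SCAN_ALLOC_HUGE_PAGE_FAIL, &wait));
	return 0;
}
```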
diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ba722347bebd..03e0da0008f1 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -722,13 +722,13 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, result = SCAN_SUCCEED; trace_mm_collapse_huge_page_isolate(page, none_or_zero, referenced, writable, result); - return 1; + return SCAN_SUCCEED; } out: release_pte_pages(pte, _pte, compound_pagelist); trace_mm_collapse_huge_page_isolate(page, none_or_zero, referenced, writable, result); - return 0; + return result; } static void __collapse_huge_page_copy(pte_t *pte, struct page *page, @@ -850,14 +850,13 @@ static int khugepaged_find_target_node(struct collapse_control *cc) #endif /* Sleep for the first alloc fail, break the loop for the second fail */ -static bool alloc_fail_should_sleep(struct page **hpage, bool *wait) +static bool alloc_fail_should_sleep(int result, bool *wait) { - if (IS_ERR(*hpage)) { + if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) { if (!*wait) return true; *wait = false; - *hpage = NULL; khugepaged_alloc_sleep(); } return false; @@ -868,7 +867,6 @@ static bool khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER); if (unlikely(!*hpage)) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - *hpage = ERR_PTR(-ENOMEM); return false; } @@ -1010,17 +1008,17 @@ static int alloc_charge_hpage(struct mm_struct *mm, struct page **hpage, return SCAN_SUCCEED; } -static void collapse_huge_page(struct mm_struct *mm, unsigned long address, - struct page **hpage, int referenced, - int unmapped, struct collapse_control *cc) +static int collapse_huge_page(struct mm_struct *mm, unsigned long address, + int referenced, int unmapped, + struct collapse_control *cc) { LIST_HEAD(compound_pagelist); pmd_t *pmd, _pmd; pte_t *pte; pgtable_t pgtable; - struct page *new_page; + struct page *hpage; spinlock_t *pmd_ptl, *pte_ptl; - int isolated = 0, result = 0; + int result = SCAN_FAIL; struct vm_area_struct *vma; struct mmu_notifier_range range; @@ -1034,12 +1032,10 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address, */ mmap_read_unlock(mm); - result = alloc_charge_hpage(mm, hpage, cc); + result = alloc_charge_hpage(mm, &hpage, cc); if (result != SCAN_SUCCEED) goto out_nolock; - new_page = *hpage; - mmap_read_lock(mm); result = hugepage_vma_revalidate(mm, address, &vma); if (result) { @@ -1100,11 +1096,11 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address, mmu_notifier_invalidate_range_end(&range); spin_lock(pte_ptl); - isolated = __collapse_huge_page_isolate(vma, address, pte, - &compound_pagelist); + result = __collapse_huge_page_isolate(vma, address, pte, + &compound_pagelist); spin_unlock(pte_ptl); - if (unlikely(!isolated)) { + if (unlikely(result != SCAN_SUCCEED)) { pte_unmap(pte); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); @@ -1116,7 +1112,6 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address, pmd_populate(mm, pmd, pmd_pgtable(_pmd)); spin_unlock(pmd_ptl); anon_vma_unlock_write(vma->anon_vma); - result = SCAN_FAIL; goto out_up_write; } @@ -1126,8 +1121,8 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address, */ anon_vma_unlock_write(vma->anon_vma); - __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl, - &compound_pagelist); + __collapse_huge_page_copy(pte, hpage, vma, address, pte_ptl, + &compound_pagelist); pte_unmap(pte); /* * spin_lock() below is not the equivalent of smp_wmb(), but @@ -1135,43 
+1130,42 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address, * avoid the copy_huge_page writes to become visible after * the set_pmd_at() write. */ - __SetPageUptodate(new_page); + __SetPageUptodate(hpage); pgtable = pmd_pgtable(_pmd); - _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); + _pmd = mk_huge_pmd(hpage, vma->vm_page_prot); _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); - page_add_new_anon_rmap(new_page, vma, address); - lru_cache_add_inactive_or_unevictable(new_page, vma); + page_add_new_anon_rmap(hpage, vma, address); + lru_cache_add_inactive_or_unevictable(hpage, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); spin_unlock(pmd_ptl); - *hpage = NULL; + hpage = NULL; - khugepaged_pages_collapsed++; result = SCAN_SUCCEED; out_up_write: mmap_write_unlock(mm); out_nolock: - if (!IS_ERR_OR_NULL(*hpage)) { - mem_cgroup_uncharge(page_folio(*hpage)); - put_page(*hpage); + if (hpage) { + mem_cgroup_uncharge(page_folio(hpage)); + put_page(hpage); } - trace_mm_collapse_huge_page(mm, isolated, result); - return; + trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result); + return result; } static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, struct page **hpage, + unsigned long address, bool *mmap_locked, struct collapse_control *cc) { pmd_t *pmd; pte_t *pte, *_pte; - int ret = 0, result = 0, referenced = 0; + int result = SCAN_FAIL, referenced = 0; int none_or_zero = 0, shared = 0; struct page *page = NULL; unsigned long _address; @@ -1306,19 +1300,19 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; - ret = 1; } out_unmap: pte_unmap_unlock(pte, ptl); - if (ret) { + if (result == SCAN_SUCCEED) { /* collapse_huge_page will return with the mmap_lock released */ - collapse_huge_page(mm, address, hpage, referenced, unmapped, - cc); + *mmap_locked = false; + result = collapse_huge_page(mm, address, referenced, + unmapped, cc); } out: trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, none_or_zero, result, unmapped); - return ret; + return result; } static void collect_mm_slot(struct mm_slot *mm_slot) @@ -1581,7 +1575,6 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * @mm: process address space where collapse happens * @file: file that collapse on * @start: collapse start address - * @hpage: new allocated huge page for collapse * @cc: collapse context and scratchpad * * Basic scheme is simple, details are more complex: @@ -1599,12 +1592,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * + restore gaps in the page cache; * + unlock and free huge page; */ -static void collapse_file(struct mm_struct *mm, struct file *file, - pgoff_t start, struct page **hpage, - struct collapse_control *cc) +static int collapse_file(struct mm_struct *mm, struct file *file, + pgoff_t start, struct collapse_control *cc) { struct address_space *mapping = file->f_mapping; - struct page *new_page; + struct page *hpage; pgoff_t index, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); @@ -1615,12 +1607,10 @@ static void collapse_file(struct mm_struct *mm, struct file *file, VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); VM_BUG_ON(start & (HPAGE_PMD_NR - 
1)); - result = alloc_charge_hpage(mm, hpage, cc); + result = alloc_charge_hpage(mm, &hpage, cc); if (result != SCAN_SUCCEED) goto out; - new_page = *hpage; - /* * Ensure we have slots for all the pages in the range. This is * almost certainly a no-op because most of the pages must be present @@ -1637,14 +1627,14 @@ static void collapse_file(struct mm_struct *mm, struct file *file, } } while (1); - __SetPageLocked(new_page); + __SetPageLocked(hpage); if (is_shmem) - __SetPageSwapBacked(new_page); - new_page->index = start; - new_page->mapping = mapping; + __SetPageSwapBacked(hpage); + hpage->index = start; + hpage->mapping = mapping; /* - * At this point the new_page is locked and not up-to-date. + * At this point the hpage is locked and not up-to-date. * It's safe to insert it into the page cache, because nobody would * be able to map it or use it in another way until we unlock it. */ @@ -1672,7 +1662,7 @@ static void collapse_file(struct mm_struct *mm, struct file *file, result = SCAN_FAIL; goto xa_locked; } - xas_store(&xas, new_page); + xas_store(&xas, hpage); nr_none++; continue; } @@ -1814,19 +1804,19 @@ static void collapse_file(struct mm_struct *mm, struct file *file, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. */ - xas_store(&xas, new_page); + xas_store(&xas, hpage); continue; out_unlock: unlock_page(page); put_page(page); goto xa_unlocked; } - nr = thp_nr_pages(new_page); + nr = thp_nr_pages(hpage); if (is_shmem) - __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr); + __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr); else { - __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr); + __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr); filemap_nr_thps_inc(mapping); /* * Paired with smp_mb() in do_dentry_open() to ensure @@ -1837,21 +1827,21 @@ static void collapse_file(struct mm_struct *mm, struct file *file, smp_mb(); if (inode_is_open_for_write(mapping->host)) { result = SCAN_FAIL; - __mod_lruvec_page_state(new_page, NR_FILE_THPS, -nr); + __mod_lruvec_page_state(hpage, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); goto xa_locked; } } if (nr_none) { - __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none); + __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none); if (is_shmem) - __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none); + __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none); } /* Join all the small entries into a single multi-index entry */ xas_set_order(&xas, start, HPAGE_PMD_ORDER); - xas_store(&xas, new_page); + xas_store(&xas, hpage); xa_locked: xas_unlock_irq(&xas); xa_unlocked: @@ -1873,11 +1863,11 @@ static void collapse_file(struct mm_struct *mm, struct file *file, index = start; list_for_each_entry_safe(page, tmp, &pagelist, lru) { while (index < page->index) { - clear_highpage(new_page + (index % HPAGE_PMD_NR)); + clear_highpage(hpage + (index % HPAGE_PMD_NR)); index++; } - copy_highpage(new_page + (page->index % HPAGE_PMD_NR), - page); + copy_highpage(hpage + (page->index % HPAGE_PMD_NR), + page); list_del(&page->lru); page->mapping = NULL; page_ref_unfreeze(page, 1); @@ -1888,23 +1878,23 @@ static void collapse_file(struct mm_struct *mm, struct file *file, index++; } while (index < end) { - clear_highpage(new_page + (index % HPAGE_PMD_NR)); + clear_highpage(hpage + (index % HPAGE_PMD_NR)); index++; } - SetPageUptodate(new_page); - page_ref_add(new_page, HPAGE_PMD_NR - 1); + SetPageUptodate(hpage); + page_ref_add(hpage, HPAGE_PMD_NR - 1); if (is_shmem) - set_page_dirty(new_page); - lru_cache_add(new_page); + 
set_page_dirty(hpage); + lru_cache_add(hpage); /* * Remove pte page tables, so we can re-fault the page as huge. */ retract_page_tables(mapping, start); - *hpage = NULL; - - khugepaged_pages_collapsed++; + unlock_page(hpage); + hpage = NULL; + goto out; } else { struct page *page; @@ -1943,22 +1933,22 @@ static void collapse_file(struct mm_struct *mm, struct file *file, VM_BUG_ON(nr_none); xas_unlock_irq(&xas); - new_page->mapping = NULL; + hpage->mapping = NULL; } - unlock_page(new_page); + unlock_page(hpage); out: VM_BUG_ON(!list_empty(&pagelist)); - if (!IS_ERR_OR_NULL(*hpage)) { - mem_cgroup_uncharge(page_folio(*hpage)); - put_page(*hpage); + if (hpage) { + mem_cgroup_uncharge(page_folio(hpage)); + put_page(hpage); } /* TODO: tracepoints */ + return result; } -static void khugepaged_scan_file(struct mm_struct *mm, struct file *file, - pgoff_t start, struct page **hpage, - struct collapse_control *cc) +static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, + pgoff_t start, struct collapse_control *cc) { struct page *page = NULL; struct address_space *mapping = file->f_mapping; @@ -2031,15 +2021,16 @@ static void khugepaged_scan_file(struct mm_struct *mm, struct file *file, result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { - collapse_file(mm, file, start, hpage, cc); + result = collapse_file(mm, file, start, cc); } } /* TODO: tracepoints */ + return result; } #else -static void khugepaged_scan_file(struct mm_struct *mm, struct file *file, - pgoff_t start, struct collapse_control *cc) +static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, pgoff_t start, + struct collapse_control *cc) { BUILD_BUG(); } @@ -2049,8 +2040,7 @@ static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) } #endif -static unsigned int khugepaged_scan_mm_slot(unsigned int pages, - struct page **hpage, +static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, struct collapse_control *cc) __releases(&khugepaged_mm_lock) __acquires(&khugepaged_mm_lock) @@ -2064,6 +2054,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, VM_BUG_ON(!pages); lockdep_assert_held(&khugepaged_mm_lock); + *result = SCAN_FAIL; if (khugepaged_scan.mm_slot) mm_slot = khugepaged_scan.mm_slot; @@ -2117,7 +2108,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, goto skip; while (khugepaged_scan.address < hend) { - int ret; + bool mmap_locked = true; + cond_resched(); if (unlikely(khugepaged_test_exit(mm))) goto breakouterloop; @@ -2134,20 +2126,28 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, khugepaged_scan.address); mmap_read_unlock(mm); - ret = 1; - khugepaged_scan_file(mm, file, pgoff, hpage, - cc); + mmap_locked = false; + *result = khugepaged_scan_file(mm, file, pgoff, + cc); fput(file); } else { - ret = khugepaged_scan_pmd(mm, vma, - khugepaged_scan.address, - hpage, cc); + *result = khugepaged_scan_pmd(mm, vma, + khugepaged_scan.address, + &mmap_locked, cc); } + if (*result == SCAN_SUCCEED) + ++khugepaged_pages_collapsed; /* move to next address */ khugepaged_scan.address += HPAGE_PMD_SIZE; progress += HPAGE_PMD_NR; - if (ret) - /* we released mmap_lock so break loop */ + if (!mmap_locked) + /* + * We released mmap_lock so break loop. Note + * that we drop mmap_lock before all hugepage + * allocations, so if allocation fails, we are + * guaranteed to break here and report the + * correct result back to caller. 
+ */ goto breakouterloop_mmap_lock; if (progress >= pages) goto breakouterloop; @@ -2199,15 +2199,15 @@ static int khugepaged_wait_event(void) static void khugepaged_do_scan(struct collapse_control *cc) { - struct page *hpage = NULL; unsigned int progress = 0, pass_through_head = 0; unsigned int pages = READ_ONCE(khugepaged_pages_to_scan); bool wait = true; + int result = SCAN_SUCCEED; lru_add_drain_all(); while (progress < pages) { - if (alloc_fail_should_sleep(&hpage, &wait)) + if (alloc_fail_should_sleep(result, &wait)) break; cond_resched(); @@ -2221,7 +2221,7 @@ static void khugepaged_do_scan(struct collapse_control *cc) if (khugepaged_has_work() && pass_through_head < 2) progress += khugepaged_scan_mm_slot(pages - progress, - &hpage, cc); + &result, cc); else progress = pages; spin_unlock(&khugepaged_mm_lock);
Pipe enum scan_result codes back through return values of functions
downstream of khugepaged_scan_file() and khugepaged_scan_pmd() to
inform callers if the operation was successful, and if not, why.

Since khugepaged_scan_pmd()'s return value already has a specific
meaning (whether mmap_lock was unlocked or not), add a bool* argument
to khugepaged_scan_pmd() to retrieve this information.

Change khugepaged to take action based on the return values of
khugepaged_scan_file() and khugepaged_scan_pmd() instead of acting
deep within the collapsing functions themselves.

Remove dependency on error pointers to communicate to khugepaged that
allocation failed and it should sleep; instead just use the result of
the scan (SCAN_ALLOC_HUGE_PAGE_FAIL if allocation fails).

Signed-off-by: Zach O'Keefe <zokeefe@google.com>
---
 mm/khugepaged.c | 192 ++++++++++++++++++++++++------------------------
 1 file changed, 96 insertions(+), 96 deletions(-)
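To summarize the calling convention the commit message describes, the short self-contained sketch below shows how a caller shaped like khugepaged_scan_mm_slot() can act on the returned codes: bump the collapsed-pages counter on SCAN_SUCCEED and break out of the per-address loop once mmap_lock has been dropped. scan_one() and pages_collapsed are illustrative stand-ins, not the kernel symbols.

```c
#include <stdbool.h>
#include <stdio.h>

enum scan_result { SCAN_FAIL, SCAN_SUCCEED, SCAN_ALLOC_HUGE_PAGE_FAIL };

static unsigned long pages_collapsed;	/* plays the role of khugepaged_pages_collapsed */

/* Fake per-address scan: pretend every other address collapses successfully. */
static enum scan_result scan_one(unsigned long addr, bool *mmap_locked)
{
	if (addr & 1) {
		*mmap_locked = false;	/* a real collapse drops mmap_lock */
		return SCAN_SUCCEED;
	}
	return SCAN_FAIL;
}

int main(void)
{
	for (unsigned long addr = 0; addr < 8; addr++) {
		bool mmap_locked = true;
		enum scan_result result = scan_one(addr, &mmap_locked);

		/* Bookkeeping now lives in the caller, not deep in the collapse path. */
		if (result == SCAN_SUCCEED)
			++pages_collapsed;
		if (!mmap_locked)
			break;	/* mmap_lock was released, so stop this pass */
	}
	printf("pages_collapsed=%lu\n", pages_collapsed);
	return 0;
}
```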