Message ID | 20190711012528.176050-5-minchan@kernel.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Introduce MADV_COLD and MADV_PAGEOUT | expand |
On Thu, Jul 11, 2019 at 10:25:28AM +0900, Minchan Kim wrote: > @@ -480,6 +482,198 @@ static long madvise_cold(struct vm_area_struct *vma, > return 0; > } > > +static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr, > + unsigned long end, struct mm_walk *walk) > +{ > + struct mmu_gather *tlb = walk->private; > + struct mm_struct *mm = tlb->mm; > + struct vm_area_struct *vma = walk->vma; > + pte_t *orig_pte, *pte, ptent; > + spinlock_t *ptl; > + LIST_HEAD(page_list); > + struct page *page; > + unsigned long next; > + > + if (fatal_signal_pending(current)) > + return -EINTR; > + > + next = pmd_addr_end(addr, end); > + if (pmd_trans_huge(*pmd)) { > + pmd_t orig_pmd; > + > + tlb_change_page_size(tlb, HPAGE_PMD_SIZE); > + ptl = pmd_trans_huge_lock(pmd, vma); > + if (!ptl) > + return 0; > + > + orig_pmd = *pmd; > + if (is_huge_zero_pmd(orig_pmd)) > + goto huge_unlock; > + > + if (unlikely(!pmd_present(orig_pmd))) { > + VM_BUG_ON(thp_migration_supported() && > + !is_pmd_migration_entry(orig_pmd)); > + goto huge_unlock; > + } > + > + page = pmd_page(orig_pmd); > + if (next - addr != HPAGE_PMD_SIZE) { > + int err; > + > + if (page_mapcount(page) != 1) > + goto huge_unlock; > + get_page(page); > + spin_unlock(ptl); > + lock_page(page); > + err = split_huge_page(page); > + unlock_page(page); > + put_page(page); > + if (!err) > + goto regular_page; > + return 0; > + } > + > + if (isolate_lru_page(page)) > + goto huge_unlock; > + > + if (pmd_young(orig_pmd)) { > + pmdp_invalidate(vma, addr, pmd); > + orig_pmd = pmd_mkold(orig_pmd); > + > + set_pmd_at(mm, addr, pmd, orig_pmd); > + tlb_remove_tlb_entry(tlb, pmd, addr); > + } > + > + ClearPageReferenced(page); > + test_and_clear_page_young(page); > + list_add(&page->lru, &page_list); > +huge_unlock: > + spin_unlock(ptl); > + reclaim_pages(&page_list); > + return 0; > + } > + > + if (pmd_trans_unstable(pmd)) > + return 0; > +regular_page: > + tlb_change_page_size(tlb, PAGE_SIZE); > + orig_pte = pte = 
pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); > + flush_tlb_batched_pending(mm); > + arch_enter_lazy_mmu_mode(); > + for (; addr < end; pte++, addr += PAGE_SIZE) { > + ptent = *pte; > + if (!pte_present(ptent)) > + continue; > + > + page = vm_normal_page(vma, addr, ptent); > + if (!page) > + continue; > + > + /* > + * creating a THP page is expensive so split it only if we > + * are sure it's worth. Split it if we are only owner. > + */ > + if (PageTransCompound(page)) { > + if (page_mapcount(page) != 1) > + break; > + get_page(page); > + if (!trylock_page(page)) { > + put_page(page); > + break; > + } > + pte_unmap_unlock(orig_pte, ptl); > + if (split_huge_page(page)) { > + unlock_page(page); > + put_page(page); > + pte_offset_map_lock(mm, pmd, addr, &ptl); > + break; > + } > + unlock_page(page); > + put_page(page); > + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); > + pte--; > + addr -= PAGE_SIZE; > + continue; > + } > + > + VM_BUG_ON_PAGE(PageTransCompound(page), page); > + > + if (isolate_lru_page(page)) > + continue; > + > + if (pte_young(ptent)) { > + ptent = ptep_get_and_clear_full(mm, addr, pte, > + tlb->fullmm); > + ptent = pte_mkold(ptent); > + set_pte_at(mm, addr, pte, ptent); > + tlb_remove_tlb_entry(tlb, pte, addr); > + } > + ClearPageReferenced(page); > + test_and_clear_page_young(page); > + list_add(&page->lru, &page_list); > + } > + > + arch_leave_lazy_mmu_mode(); > + pte_unmap_unlock(orig_pte, ptl); > + reclaim_pages(&page_list); > + cond_resched(); > + > + return 0; > +} I know you have briefly talked about code sharing already. While I agree that sharing with MADV_FREE is maybe a stretch, I applied these patches and compared the pageout and the cold page table functions, and they are line for line the same EXCEPT for 2-3 lines at the very end, where one reclaims and the other deactivates. It would be good to share here, it shouldn't be hard or result in fragile code. Something like int madvise_cold_or_pageout_range(..., bool pageout)?
Hi Johannes, On Thu, Jul 11, 2019 at 02:42:23PM -0400, Johannes Weiner wrote: > On Thu, Jul 11, 2019 at 10:25:28AM +0900, Minchan Kim wrote: > > @@ -480,6 +482,198 @@ static long madvise_cold(struct vm_area_struct *vma, > > return 0; > > } > > > > +static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr, > > + unsigned long end, struct mm_walk *walk) > > +{ > > + struct mmu_gather *tlb = walk->private; > > + struct mm_struct *mm = tlb->mm; > > + struct vm_area_struct *vma = walk->vma; > > + pte_t *orig_pte, *pte, ptent; > > + spinlock_t *ptl; > > + LIST_HEAD(page_list); > > + struct page *page; > > + unsigned long next; > > + > > + if (fatal_signal_pending(current)) > > + return -EINTR; > > + > > + next = pmd_addr_end(addr, end); > > + if (pmd_trans_huge(*pmd)) { > > + pmd_t orig_pmd; > > + > > + tlb_change_page_size(tlb, HPAGE_PMD_SIZE); > > + ptl = pmd_trans_huge_lock(pmd, vma); > > + if (!ptl) > > + return 0; > > + > > + orig_pmd = *pmd; > > + if (is_huge_zero_pmd(orig_pmd)) > > + goto huge_unlock; > > + > > + if (unlikely(!pmd_present(orig_pmd))) { > > + VM_BUG_ON(thp_migration_supported() && > > + !is_pmd_migration_entry(orig_pmd)); > > + goto huge_unlock; > > + } > > + > > + page = pmd_page(orig_pmd); > > + if (next - addr != HPAGE_PMD_SIZE) { > > + int err; > > + > > + if (page_mapcount(page) != 1) > > + goto huge_unlock; > > + get_page(page); > > + spin_unlock(ptl); > > + lock_page(page); > > + err = split_huge_page(page); > > + unlock_page(page); > > + put_page(page); > > + if (!err) > > + goto regular_page; > > + return 0; > > + } > > + > > + if (isolate_lru_page(page)) > > + goto huge_unlock; > > + > > + if (pmd_young(orig_pmd)) { > > + pmdp_invalidate(vma, addr, pmd); > > + orig_pmd = pmd_mkold(orig_pmd); > > + > > + set_pmd_at(mm, addr, pmd, orig_pmd); > > + tlb_remove_tlb_entry(tlb, pmd, addr); > > + } > > + > > + ClearPageReferenced(page); > > + test_and_clear_page_young(page); > > + list_add(&page->lru, &page_list); > > +huge_unlock: > > 
+ spin_unlock(ptl); > > + reclaim_pages(&page_list); > > + return 0; > > + } > > + > > + if (pmd_trans_unstable(pmd)) > > + return 0; > > +regular_page: > > + tlb_change_page_size(tlb, PAGE_SIZE); > > + orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); > > + flush_tlb_batched_pending(mm); > > + arch_enter_lazy_mmu_mode(); > > + for (; addr < end; pte++, addr += PAGE_SIZE) { > > + ptent = *pte; > > + if (!pte_present(ptent)) > > + continue; > > + > > + page = vm_normal_page(vma, addr, ptent); > > + if (!page) > > + continue; > > + > > + /* > > + * creating a THP page is expensive so split it only if we > > + * are sure it's worth. Split it if we are only owner. > > + */ > > + if (PageTransCompound(page)) { > > + if (page_mapcount(page) != 1) > > + break; > > + get_page(page); > > + if (!trylock_page(page)) { > > + put_page(page); > > + break; > > + } > > + pte_unmap_unlock(orig_pte, ptl); > > + if (split_huge_page(page)) { > > + unlock_page(page); > > + put_page(page); > > + pte_offset_map_lock(mm, pmd, addr, &ptl); > > + break; > > + } > > + unlock_page(page); > > + put_page(page); > > + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); > > + pte--; > > + addr -= PAGE_SIZE; > > + continue; > > + } > > + > > + VM_BUG_ON_PAGE(PageTransCompound(page), page); > > + > > + if (isolate_lru_page(page)) > > + continue; > > + > > + if (pte_young(ptent)) { > > + ptent = ptep_get_and_clear_full(mm, addr, pte, > > + tlb->fullmm); > > + ptent = pte_mkold(ptent); > > + set_pte_at(mm, addr, pte, ptent); > > + tlb_remove_tlb_entry(tlb, pte, addr); > > + } > > + ClearPageReferenced(page); > > + test_and_clear_page_young(page); > > + list_add(&page->lru, &page_list); > > + } > > + > > + arch_leave_lazy_mmu_mode(); > > + pte_unmap_unlock(orig_pte, ptl); > > + reclaim_pages(&page_list); > > + cond_resched(); > > + > > + return 0; > > +} > > I know you have briefly talked about code sharing already. 
> > While I agree that sharing with MADV_FREE is maybe a stretch, I > applied these patches and compared the pageout and the cold page table > functions, and they are line for line the same EXCEPT for 2-3 lines at > the very end, where one reclaims and the other deactivates. It would > be good to share here, it shouldn't be hard or result in fragile code. Fair enough if we leave MADV_FREE. > > Something like int madvise_cold_or_pageout_range(..., bool pageout)? How about this? From 41592f23e876ec21e49dc3c76dc89538e2bb16be Mon Sep 17 00:00:00 2001 From: Minchan Kim <minchan@kernel.org> Date: Fri, 12 Jul 2019 14:05:36 +0900 Subject: [PATCH] mm: factor out common parts between MADV_COLD and MADV_PAGEOUT There are many common parts between MADV_COLD and MADV_PAGEOUT. This patch factor them out to save code duplication. Signed-off-by: Minchan Kim <minchan@kernel.org> --- mm/madvise.c | 201 +++++++++++++-------------------------------------- 1 file changed, 52 insertions(+), 149 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index bc2f0138982e..3d3d14517cc8 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -30,6 +30,11 @@ #include "internal.h" +struct madvise_walk_private { + struct mmu_gather *tlb; + bool pageout; +}; + /* * Any behaviour which results in changes to the vma->vm_flags needs to * take mmap_sem for writing. 
Others, which simply traverse vmas, need @@ -310,16 +315,23 @@ static long madvise_willneed(struct vm_area_struct *vma, return 0; } -static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, struct mm_walk *walk) +static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, + unsigned long addr, unsigned long end, + struct mm_walk *walk) { - struct mmu_gather *tlb = walk->private; + struct madvise_walk_private *private = walk->private; + struct mmu_gather *tlb = private->tlb; + bool pageout = private->pageout; struct mm_struct *mm = tlb->mm; struct vm_area_struct *vma = walk->vma; pte_t *orig_pte, *pte, ptent; spinlock_t *ptl; - struct page *page; unsigned long next; + struct page *page = NULL; + LIST_HEAD(page_list); + + if (fatal_signal_pending(current)) + return -EINTR; next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd)) { @@ -358,6 +370,12 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, return 0; } + if (pageout) { + if (isolate_lru_page(page)) + goto huge_unlock; + list_add(&page->lru, &page_list); + } + if (pmd_young(orig_pmd)) { pmdp_invalidate(vma, addr, pmd); orig_pmd = pmd_mkold(orig_pmd); @@ -366,10 +384,14 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, tlb_remove_pmd_tlb_entry(tlb, pmd, addr); } + ClearPageReferenced(page); test_and_clear_page_young(page); - deactivate_page(page); huge_unlock: spin_unlock(ptl); + if (pageout) + reclaim_pages(&page_list); + else + deactivate_page(page); return 0; } @@ -423,6 +445,12 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, VM_BUG_ON_PAGE(PageTransCompound(page), page); + if (pageout) { + if (isolate_lru_page(page)) + continue; + list_add(&page->lru, &page_list); + } + if (pte_young(ptent)) { ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); @@ -437,12 +465,16 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, * As a side effect, it makes confuse idle-page tracking * because they will miss 
recent referenced history. */ + ClearPageReferenced(page); test_and_clear_page_young(page); - deactivate_page(page); + if (!pageout) + deactivate_page(page); } arch_enter_lazy_mmu_mode(); pte_unmap_unlock(orig_pte, ptl); + if (pageout) + reclaim_pages(&page_list); cond_resched(); return 0; @@ -452,10 +484,15 @@ static void madvise_cold_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { + struct madvise_walk_private walk_private = { + .tlb = tlb, + .pageout = false, + }; + struct mm_walk cold_walk = { - .pmd_entry = madvise_cold_pte_range, + .pmd_entry = madvise_cold_or_pageout_pte_range, .mm = vma->vm_mm, - .private = tlb, + .private = &walk_private, }; tlb_start_vma(tlb, vma); @@ -482,153 +519,19 @@ static long madvise_cold(struct vm_area_struct *vma, return 0; } -static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, struct mm_walk *walk) -{ - struct mmu_gather *tlb = walk->private; - struct mm_struct *mm = tlb->mm; - struct vm_area_struct *vma = walk->vma; - pte_t *orig_pte, *pte, ptent; - spinlock_t *ptl; - LIST_HEAD(page_list); - struct page *page; - unsigned long next; - - if (fatal_signal_pending(current)) - return -EINTR; - - next = pmd_addr_end(addr, end); - if (pmd_trans_huge(*pmd)) { - pmd_t orig_pmd; - - tlb_change_page_size(tlb, HPAGE_PMD_SIZE); - ptl = pmd_trans_huge_lock(pmd, vma); - if (!ptl) - return 0; - - orig_pmd = *pmd; - if (is_huge_zero_pmd(orig_pmd)) - goto huge_unlock; - - if (unlikely(!pmd_present(orig_pmd))) { - VM_BUG_ON(thp_migration_supported() && - !is_pmd_migration_entry(orig_pmd)); - goto huge_unlock; - } - - page = pmd_page(orig_pmd); - if (next - addr != HPAGE_PMD_SIZE) { - int err; - - if (page_mapcount(page) != 1) - goto huge_unlock; - get_page(page); - spin_unlock(ptl); - lock_page(page); - err = split_huge_page(page); - unlock_page(page); - put_page(page); - if (!err) - goto regular_page; - return 0; - } - - if (isolate_lru_page(page)) - 
goto huge_unlock; - - if (pmd_young(orig_pmd)) { - pmdp_invalidate(vma, addr, pmd); - orig_pmd = pmd_mkold(orig_pmd); - - set_pmd_at(mm, addr, pmd, orig_pmd); - tlb_remove_tlb_entry(tlb, pmd, addr); - } - - ClearPageReferenced(page); - test_and_clear_page_young(page); - list_add(&page->lru, &page_list); -huge_unlock: - spin_unlock(ptl); - reclaim_pages(&page_list); - return 0; - } - - if (pmd_trans_unstable(pmd)) - return 0; -regular_page: - tlb_change_page_size(tlb, PAGE_SIZE); - orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - flush_tlb_batched_pending(mm); - arch_enter_lazy_mmu_mode(); - for (; addr < end; pte++, addr += PAGE_SIZE) { - ptent = *pte; - if (!pte_present(ptent)) - continue; - - page = vm_normal_page(vma, addr, ptent); - if (!page) - continue; - - /* - * creating a THP page is expensive so split it only if we - * are sure it's worth. Split it if we are only owner. - */ - if (PageTransCompound(page)) { - if (page_mapcount(page) != 1) - break; - get_page(page); - if (!trylock_page(page)) { - put_page(page); - break; - } - pte_unmap_unlock(orig_pte, ptl); - if (split_huge_page(page)) { - unlock_page(page); - put_page(page); - pte_offset_map_lock(mm, pmd, addr, &ptl); - break; - } - unlock_page(page); - put_page(page); - pte = pte_offset_map_lock(mm, pmd, addr, &ptl); - pte--; - addr -= PAGE_SIZE; - continue; - } - - VM_BUG_ON_PAGE(PageTransCompound(page), page); - - if (isolate_lru_page(page)) - continue; - - if (pte_young(ptent)) { - ptent = ptep_get_and_clear_full(mm, addr, pte, - tlb->fullmm); - ptent = pte_mkold(ptent); - set_pte_at(mm, addr, pte, ptent); - tlb_remove_tlb_entry(tlb, pte, addr); - } - ClearPageReferenced(page); - test_and_clear_page_young(page); - list_add(&page->lru, &page_list); - } - - arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(orig_pte, ptl); - reclaim_pages(&page_list); - cond_resched(); - - return 0; -} - static void madvise_pageout_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned 
long addr, unsigned long end) { + struct madvise_walk_private walk_private = { + .pageout = true, + .tlb = tlb, + }; + struct mm_walk pageout_walk = { - .pmd_entry = madvise_pageout_pte_range, + .pmd_entry = madvise_cold_or_pageout_pte_range, .mm = vma->vm_mm, - .private = tlb, + .private = &walk_private, }; tlb_start_vma(tlb, vma);
On Fri 12-07-19 14:18:28, Minchan Kim wrote: [...] > >From 41592f23e876ec21e49dc3c76dc89538e2bb16be Mon Sep 17 00:00:00 2001 > From: Minchan Kim <minchan@kernel.org> > Date: Fri, 12 Jul 2019 14:05:36 +0900 > Subject: [PATCH] mm: factor out common parts between MADV_COLD and > MADV_PAGEOUT > > There are many common parts between MADV_COLD and MADV_PAGEOUT. > This patch factor them out to save code duplication. This looks better indeed. I still hope that this can get improved even further but let's do that in a follow up patch. > Signed-off-by: Minchan Kim <minchan@kernel.org> Acked-by: Michal Hocko <mhocko@suse.com> > --- > mm/madvise.c | 201 +++++++++++++-------------------------------------- > 1 file changed, 52 insertions(+), 149 deletions(-) > > diff --git a/mm/madvise.c b/mm/madvise.c > index bc2f0138982e..3d3d14517cc8 100644 > --- a/mm/madvise.c > +++ b/mm/madvise.c > @@ -30,6 +30,11 @@ > > #include "internal.h" > > +struct madvise_walk_private { > + struct mmu_gather *tlb; > + bool pageout; > +}; > + > /* > * Any behaviour which results in changes to the vma->vm_flags needs to > * take mmap_sem for writing. 
Others, which simply traverse vmas, need > @@ -310,16 +315,23 @@ static long madvise_willneed(struct vm_area_struct *vma, > return 0; > } > > -static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, > - unsigned long end, struct mm_walk *walk) > +static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, > + unsigned long addr, unsigned long end, > + struct mm_walk *walk) > { > - struct mmu_gather *tlb = walk->private; > + struct madvise_walk_private *private = walk->private; > + struct mmu_gather *tlb = private->tlb; > + bool pageout = private->pageout; > struct mm_struct *mm = tlb->mm; > struct vm_area_struct *vma = walk->vma; > pte_t *orig_pte, *pte, ptent; > spinlock_t *ptl; > - struct page *page; > unsigned long next; > + struct page *page = NULL; > + LIST_HEAD(page_list); > + > + if (fatal_signal_pending(current)) > + return -EINTR; > > next = pmd_addr_end(addr, end); > if (pmd_trans_huge(*pmd)) { > @@ -358,6 +370,12 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, > return 0; > } > > + if (pageout) { > + if (isolate_lru_page(page)) > + goto huge_unlock; > + list_add(&page->lru, &page_list); > + } > + > if (pmd_young(orig_pmd)) { > pmdp_invalidate(vma, addr, pmd); > orig_pmd = pmd_mkold(orig_pmd); > @@ -366,10 +384,14 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, > tlb_remove_pmd_tlb_entry(tlb, pmd, addr); > } > > + ClearPageReferenced(page); > test_and_clear_page_young(page); > - deactivate_page(page); > huge_unlock: > spin_unlock(ptl); > + if (pageout) > + reclaim_pages(&page_list); > + else > + deactivate_page(page); > return 0; > } > > @@ -423,6 +445,12 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, > > VM_BUG_ON_PAGE(PageTransCompound(page), page); > > + if (pageout) { > + if (isolate_lru_page(page)) > + continue; > + list_add(&page->lru, &page_list); > + } > + > if (pte_young(ptent)) { > ptent = ptep_get_and_clear_full(mm, addr, pte, > tlb->fullmm); > @@ -437,12 +465,16 @@ static 
int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, > * As a side effect, it makes confuse idle-page tracking > * because they will miss recent referenced history. > */ > + ClearPageReferenced(page); > test_and_clear_page_young(page); > - deactivate_page(page); > + if (!pageout) > + deactivate_page(page); > } > > arch_enter_lazy_mmu_mode(); > pte_unmap_unlock(orig_pte, ptl); > + if (pageout) > + reclaim_pages(&page_list); > cond_resched(); > > return 0; > @@ -452,10 +484,15 @@ static void madvise_cold_page_range(struct mmu_gather *tlb, > struct vm_area_struct *vma, > unsigned long addr, unsigned long end) > { > + struct madvise_walk_private walk_private = { > + .tlb = tlb, > + .pageout = false, > + }; > + > struct mm_walk cold_walk = { > - .pmd_entry = madvise_cold_pte_range, > + .pmd_entry = madvise_cold_or_pageout_pte_range, > .mm = vma->vm_mm, > - .private = tlb, > + .private = &walk_private, > }; > > tlb_start_vma(tlb, vma); > @@ -482,153 +519,19 @@ static long madvise_cold(struct vm_area_struct *vma, > return 0; > } > > -static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr, > - unsigned long end, struct mm_walk *walk) > -{ > - struct mmu_gather *tlb = walk->private; > - struct mm_struct *mm = tlb->mm; > - struct vm_area_struct *vma = walk->vma; > - pte_t *orig_pte, *pte, ptent; > - spinlock_t *ptl; > - LIST_HEAD(page_list); > - struct page *page; > - unsigned long next; > - > - if (fatal_signal_pending(current)) > - return -EINTR; > - > - next = pmd_addr_end(addr, end); > - if (pmd_trans_huge(*pmd)) { > - pmd_t orig_pmd; > - > - tlb_change_page_size(tlb, HPAGE_PMD_SIZE); > - ptl = pmd_trans_huge_lock(pmd, vma); > - if (!ptl) > - return 0; > - > - orig_pmd = *pmd; > - if (is_huge_zero_pmd(orig_pmd)) > - goto huge_unlock; > - > - if (unlikely(!pmd_present(orig_pmd))) { > - VM_BUG_ON(thp_migration_supported() && > - !is_pmd_migration_entry(orig_pmd)); > - goto huge_unlock; > - } > - > - page = pmd_page(orig_pmd); > - if (next - addr != 
HPAGE_PMD_SIZE) { > - int err; > - > - if (page_mapcount(page) != 1) > - goto huge_unlock; > - get_page(page); > - spin_unlock(ptl); > - lock_page(page); > - err = split_huge_page(page); > - unlock_page(page); > - put_page(page); > - if (!err) > - goto regular_page; > - return 0; > - } > - > - if (isolate_lru_page(page)) > - goto huge_unlock; > - > - if (pmd_young(orig_pmd)) { > - pmdp_invalidate(vma, addr, pmd); > - orig_pmd = pmd_mkold(orig_pmd); > - > - set_pmd_at(mm, addr, pmd, orig_pmd); > - tlb_remove_tlb_entry(tlb, pmd, addr); > - } > - > - ClearPageReferenced(page); > - test_and_clear_page_young(page); > - list_add(&page->lru, &page_list); > -huge_unlock: > - spin_unlock(ptl); > - reclaim_pages(&page_list); > - return 0; > - } > - > - if (pmd_trans_unstable(pmd)) > - return 0; > -regular_page: > - tlb_change_page_size(tlb, PAGE_SIZE); > - orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); > - flush_tlb_batched_pending(mm); > - arch_enter_lazy_mmu_mode(); > - for (; addr < end; pte++, addr += PAGE_SIZE) { > - ptent = *pte; > - if (!pte_present(ptent)) > - continue; > - > - page = vm_normal_page(vma, addr, ptent); > - if (!page) > - continue; > - > - /* > - * creating a THP page is expensive so split it only if we > - * are sure it's worth. Split it if we are only owner. 
> - */ > - if (PageTransCompound(page)) { > - if (page_mapcount(page) != 1) > - break; > - get_page(page); > - if (!trylock_page(page)) { > - put_page(page); > - break; > - } > - pte_unmap_unlock(orig_pte, ptl); > - if (split_huge_page(page)) { > - unlock_page(page); > - put_page(page); > - pte_offset_map_lock(mm, pmd, addr, &ptl); > - break; > - } > - unlock_page(page); > - put_page(page); > - pte = pte_offset_map_lock(mm, pmd, addr, &ptl); > - pte--; > - addr -= PAGE_SIZE; > - continue; > - } > - > - VM_BUG_ON_PAGE(PageTransCompound(page), page); > - > - if (isolate_lru_page(page)) > - continue; > - > - if (pte_young(ptent)) { > - ptent = ptep_get_and_clear_full(mm, addr, pte, > - tlb->fullmm); > - ptent = pte_mkold(ptent); > - set_pte_at(mm, addr, pte, ptent); > - tlb_remove_tlb_entry(tlb, pte, addr); > - } > - ClearPageReferenced(page); > - test_and_clear_page_young(page); > - list_add(&page->lru, &page_list); > - } > - > - arch_leave_lazy_mmu_mode(); > - pte_unmap_unlock(orig_pte, ptl); > - reclaim_pages(&page_list); > - cond_resched(); > - > - return 0; > -} > - > static void madvise_pageout_page_range(struct mmu_gather *tlb, > struct vm_area_struct *vma, > unsigned long addr, unsigned long end) > { > + struct madvise_walk_private walk_private = { > + .pageout = true, > + .tlb = tlb, > + }; > + > struct mm_walk pageout_walk = { > - .pmd_entry = madvise_pageout_pte_range, > + .pmd_entry = madvise_cold_or_pageout_pte_range, > .mm = vma->vm_mm, > - .private = tlb, > + .private = &walk_private, > }; > > tlb_start_vma(tlb, vma); > -- > 2.22.0.410.gd8fdbe21b5-goog
On Fri, Jul 12, 2019 at 02:18:28PM +0900, Minchan Kim wrote: > Hi Johannes, > > On Thu, Jul 11, 2019 at 02:42:23PM -0400, Johannes Weiner wrote: > > On Thu, Jul 11, 2019 at 10:25:28AM +0900, Minchan Kim wrote: > > > @@ -480,6 +482,198 @@ static long madvise_cold(struct vm_area_struct *vma, > > > return 0; > > > } > > > > > > +static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr, > > > + unsigned long end, struct mm_walk *walk) > > > +{ > > > + struct mmu_gather *tlb = walk->private; > > > + struct mm_struct *mm = tlb->mm; > > > + struct vm_area_struct *vma = walk->vma; > > > + pte_t *orig_pte, *pte, ptent; > > > + spinlock_t *ptl; > > > + LIST_HEAD(page_list); > > > + struct page *page; > > > + unsigned long next; > > > + > > > + if (fatal_signal_pending(current)) > > > + return -EINTR; > > > + > > > + next = pmd_addr_end(addr, end); > > > + if (pmd_trans_huge(*pmd)) { > > > + pmd_t orig_pmd; > > > + > > > + tlb_change_page_size(tlb, HPAGE_PMD_SIZE); > > > + ptl = pmd_trans_huge_lock(pmd, vma); > > > + if (!ptl) > > > + return 0; > > > + > > > + orig_pmd = *pmd; > > > + if (is_huge_zero_pmd(orig_pmd)) > > > + goto huge_unlock; > > > + > > > + if (unlikely(!pmd_present(orig_pmd))) { > > > + VM_BUG_ON(thp_migration_supported() && > > > + !is_pmd_migration_entry(orig_pmd)); > > > + goto huge_unlock; > > > + } > > > + > > > + page = pmd_page(orig_pmd); > > > + if (next - addr != HPAGE_PMD_SIZE) { > > > + int err; > > > + > > > + if (page_mapcount(page) != 1) > > > + goto huge_unlock; > > > + get_page(page); > > > + spin_unlock(ptl); > > > + lock_page(page); > > > + err = split_huge_page(page); > > > + unlock_page(page); > > > + put_page(page); > > > + if (!err) > > > + goto regular_page; > > > + return 0; > > > + } > > > + > > > + if (isolate_lru_page(page)) > > > + goto huge_unlock; > > > + > > > + if (pmd_young(orig_pmd)) { > > > + pmdp_invalidate(vma, addr, pmd); > > > + orig_pmd = pmd_mkold(orig_pmd); > > > + > > > + set_pmd_at(mm, addr, pmd, 
orig_pmd); > > > + tlb_remove_tlb_entry(tlb, pmd, addr); > > > + } > > > + > > > + ClearPageReferenced(page); > > > + test_and_clear_page_young(page); > > > + list_add(&page->lru, &page_list); > > > +huge_unlock: > > > + spin_unlock(ptl); > > > + reclaim_pages(&page_list); > > > + return 0; > > > + } > > > + > > > + if (pmd_trans_unstable(pmd)) > > > + return 0; > > > +regular_page: > > > + tlb_change_page_size(tlb, PAGE_SIZE); > > > + orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); > > > + flush_tlb_batched_pending(mm); > > > + arch_enter_lazy_mmu_mode(); > > > + for (; addr < end; pte++, addr += PAGE_SIZE) { > > > + ptent = *pte; > > > + if (!pte_present(ptent)) > > > + continue; > > > + > > > + page = vm_normal_page(vma, addr, ptent); > > > + if (!page) > > > + continue; > > > + > > > + /* > > > + * creating a THP page is expensive so split it only if we > > > + * are sure it's worth. Split it if we are only owner. > > > + */ > > > + if (PageTransCompound(page)) { > > > + if (page_mapcount(page) != 1) > > > + break; > > > + get_page(page); > > > + if (!trylock_page(page)) { > > > + put_page(page); > > > + break; > > > + } > > > + pte_unmap_unlock(orig_pte, ptl); > > > + if (split_huge_page(page)) { > > > + unlock_page(page); > > > + put_page(page); > > > + pte_offset_map_lock(mm, pmd, addr, &ptl); > > > + break; > > > + } > > > + unlock_page(page); > > > + put_page(page); > > > + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); > > > + pte--; > > > + addr -= PAGE_SIZE; > > > + continue; > > > + } > > > + > > > + VM_BUG_ON_PAGE(PageTransCompound(page), page); > > > + > > > + if (isolate_lru_page(page)) > > > + continue; > > > + > > > + if (pte_young(ptent)) { > > > + ptent = ptep_get_and_clear_full(mm, addr, pte, > > > + tlb->fullmm); > > > + ptent = pte_mkold(ptent); > > > + set_pte_at(mm, addr, pte, ptent); > > > + tlb_remove_tlb_entry(tlb, pte, addr); > > > + } > > > + ClearPageReferenced(page); > > > + test_and_clear_page_young(page); > > > 
+ list_add(&page->lru, &page_list); > > > + } > > > + > > > + arch_leave_lazy_mmu_mode(); > > > + pte_unmap_unlock(orig_pte, ptl); > > > + reclaim_pages(&page_list); > > > + cond_resched(); > > > + > > > + return 0; > > > +} > > > > I know you have briefly talked about code sharing already. > > > > While I agree that sharing with MADV_FREE is maybe a stretch, I > > applied these patches and compared the pageout and the cold page table > > functions, and they are line for line the same EXCEPT for 2-3 lines at > > the very end, where one reclaims and the other deactivates. It would > > be good to share here, it shouldn't be hard or result in fragile code. > > Fair enough if we leave MADV_FREE. > > > > > Something like int madvise_cold_or_pageout_range(..., bool pageout)? > > How about this? > > From 41592f23e876ec21e49dc3c76dc89538e2bb16be Mon Sep 17 00:00:00 2001 > From: Minchan Kim <minchan@kernel.org> > Date: Fri, 12 Jul 2019 14:05:36 +0900 > Subject: [PATCH] mm: factor out common parts between MADV_COLD and > MADV_PAGEOUT > > There are many common parts between MADV_COLD and MADV_PAGEOUT. > This patch factor them out to save code duplication. > > Signed-off-by: Minchan Kim <minchan@kernel.org> This looks much better, thanks! > @@ -423,6 +445,12 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, > > VM_BUG_ON_PAGE(PageTransCompound(page), page); > > + if (pageout) { > + if (isolate_lru_page(page)) > + continue; > + list_add(&page->lru, &page_list); > + } > + > if (pte_young(ptent)) { > ptent = ptep_get_and_clear_full(mm, addr, pte, > tlb->fullmm); One thought on the ordering here. When LRU isolation fails, it would still make sense to clear the young bit: we cannot reclaim the page as we wanted to, but the user still provided a clear hint that the page is cold and she won't be touching it for a while. MADV_PAGEOUT is basically MADV_COLD + try_to_reclaim. So IMO isolation should go to the end next to deactivate_page().
On Fri 12-07-19 09:58:09, Johannes Weiner wrote: [...] > > @@ -423,6 +445,12 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, > > > > VM_BUG_ON_PAGE(PageTransCompound(page), page); > > > > + if (pageout) { > > + if (isolate_lru_page(page)) > > + continue; > > + list_add(&page->lru, &page_list); > > + } > > + > > if (pte_young(ptent)) { > > ptent = ptep_get_and_clear_full(mm, addr, pte, > > tlb->fullmm); > > One thought on the ordering here. > > When LRU isolation fails, it would still make sense to clear the young > bit: we cannot reclaim the page as we wanted to, but the user still > provided a clear hint that the page is cold and she won't be touching > it for a while. MADV_PAGEOUT is basically MADV_COLD + try_to_reclaim. > So IMO isolation should go to the end next to deactivate_page(). Makes sense to me.
On Fri, Jul 12, 2019 at 09:58:09AM -0400, Johannes Weiner wrote: > On Fri, Jul 12, 2019 at 02:18:28PM +0900, Minchan Kim wrote: > > Hi Johannes, > > > > On Thu, Jul 11, 2019 at 02:42:23PM -0400, Johannes Weiner wrote: > > > On Thu, Jul 11, 2019 at 10:25:28AM +0900, Minchan Kim wrote: > > > > @@ -480,6 +482,198 @@ static long madvise_cold(struct vm_area_struct *vma, > > > > return 0; > > > > } > > > > > > > > +static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr, > > > > + unsigned long end, struct mm_walk *walk) > > > > +{ > > > > + struct mmu_gather *tlb = walk->private; > > > > + struct mm_struct *mm = tlb->mm; > > > > + struct vm_area_struct *vma = walk->vma; > > > > + pte_t *orig_pte, *pte, ptent; > > > > + spinlock_t *ptl; > > > > + LIST_HEAD(page_list); > > > > + struct page *page; > > > > + unsigned long next; > > > > + > > > > + if (fatal_signal_pending(current)) > > > > + return -EINTR; > > > > + > > > > + next = pmd_addr_end(addr, end); > > > > + if (pmd_trans_huge(*pmd)) { > > > > + pmd_t orig_pmd; > > > > + > > > > + tlb_change_page_size(tlb, HPAGE_PMD_SIZE); > > > > + ptl = pmd_trans_huge_lock(pmd, vma); > > > > + if (!ptl) > > > > + return 0; > > > > + > > > > + orig_pmd = *pmd; > > > > + if (is_huge_zero_pmd(orig_pmd)) > > > > + goto huge_unlock; > > > > + > > > > + if (unlikely(!pmd_present(orig_pmd))) { > > > > + VM_BUG_ON(thp_migration_supported() && > > > > + !is_pmd_migration_entry(orig_pmd)); > > > > + goto huge_unlock; > > > > + } > > > > + > > > > + page = pmd_page(orig_pmd); > > > > + if (next - addr != HPAGE_PMD_SIZE) { > > > > + int err; > > > > + > > > > + if (page_mapcount(page) != 1) > > > > + goto huge_unlock; > > > > + get_page(page); > > > > + spin_unlock(ptl); > > > > + lock_page(page); > > > > + err = split_huge_page(page); > > > > + unlock_page(page); > > > > + put_page(page); > > > > + if (!err) > > > > + goto regular_page; > > > > + return 0; > > > > + } > > > > + > > > > + if (isolate_lru_page(page)) > > > > + 
goto huge_unlock; > > > > + > > > > + if (pmd_young(orig_pmd)) { > > > > + pmdp_invalidate(vma, addr, pmd); > > > > + orig_pmd = pmd_mkold(orig_pmd); > > > > + > > > > + set_pmd_at(mm, addr, pmd, orig_pmd); > > > > + tlb_remove_tlb_entry(tlb, pmd, addr); > > > > + } > > > > + > > > > + ClearPageReferenced(page); > > > > + test_and_clear_page_young(page); > > > > + list_add(&page->lru, &page_list); > > > > +huge_unlock: > > > > + spin_unlock(ptl); > > > > + reclaim_pages(&page_list); > > > > + return 0; > > > > + } > > > > + > > > > + if (pmd_trans_unstable(pmd)) > > > > + return 0; > > > > +regular_page: > > > > + tlb_change_page_size(tlb, PAGE_SIZE); > > > > + orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); > > > > + flush_tlb_batched_pending(mm); > > > > + arch_enter_lazy_mmu_mode(); > > > > + for (; addr < end; pte++, addr += PAGE_SIZE) { > > > > + ptent = *pte; > > > > + if (!pte_present(ptent)) > > > > + continue; > > > > + > > > > + page = vm_normal_page(vma, addr, ptent); > > > > + if (!page) > > > > + continue; > > > > + > > > > + /* > > > > + * creating a THP page is expensive so split it only if we > > > > + * are sure it's worth. Split it if we are only owner. 
> > > > + */ > > > > + if (PageTransCompound(page)) { > > > > + if (page_mapcount(page) != 1) > > > > + break; > > > > + get_page(page); > > > > + if (!trylock_page(page)) { > > > > + put_page(page); > > > > + break; > > > > + } > > > > + pte_unmap_unlock(orig_pte, ptl); > > > > + if (split_huge_page(page)) { > > > > + unlock_page(page); > > > > + put_page(page); > > > > + pte_offset_map_lock(mm, pmd, addr, &ptl); > > > > + break; > > > > + } > > > > + unlock_page(page); > > > > + put_page(page); > > > > + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); > > > > + pte--; > > > > + addr -= PAGE_SIZE; > > > > + continue; > > > > + } > > > > + > > > > + VM_BUG_ON_PAGE(PageTransCompound(page), page); > > > > + > > > > + if (isolate_lru_page(page)) > > > > + continue; > > > > + > > > > + if (pte_young(ptent)) { > > > > + ptent = ptep_get_and_clear_full(mm, addr, pte, > > > > + tlb->fullmm); > > > > + ptent = pte_mkold(ptent); > > > > + set_pte_at(mm, addr, pte, ptent); > > > > + tlb_remove_tlb_entry(tlb, pte, addr); > > > > + } > > > > + ClearPageReferenced(page); > > > > + test_and_clear_page_young(page); > > > > + list_add(&page->lru, &page_list); > > > > + } > > > > + > > > > + arch_leave_lazy_mmu_mode(); > > > > + pte_unmap_unlock(orig_pte, ptl); > > > > + reclaim_pages(&page_list); > > > > + cond_resched(); > > > > + > > > > + return 0; > > > > +} > > > > > > I know you have briefly talked about code sharing already. > > > > > > While I agree that sharing with MADV_FREE is maybe a stretch, I > > > applied these patches and compared the pageout and the cold page table > > > functions, and they are line for line the same EXCEPT for 2-3 lines at > > > the very end, where one reclaims and the other deactivates. It would > > > be good to share here, it shouldn't be hard or result in fragile code. > > > > Fair enough if we leave MADV_FREE. > > > > > > > > Something like int madvise_cold_or_pageout_range(..., bool pageout)? > > > > How about this? 
> > > > From 41592f23e876ec21e49dc3c76dc89538e2bb16be Mon Sep 17 00:00:00 2001 > > From: Minchan Kim <minchan@kernel.org> > > Date: Fri, 12 Jul 2019 14:05:36 +0900 > > Subject: [PATCH] mm: factor out common parts between MADV_COLD and > > MADV_PAGEOUT > > > > There are many common parts between MADV_COLD and MADV_PAGEOUT. > > This patch factor them out to save code duplication. > > > > Signed-off-by: Minchan Kim <minchan@kernel.org> > > This looks much better, thanks! > > > @@ -423,6 +445,12 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, > > > > VM_BUG_ON_PAGE(PageTransCompound(page), page); > > > > + if (pageout) { > > + if (isolate_lru_page(page)) > > + continue; > > + list_add(&page->lru, &page_list); > > + } > > + > > if (pte_young(ptent)) { > > ptent = ptep_get_and_clear_full(mm, addr, pte, > > tlb->fullmm); > > One thought on the ordering here. > > When LRU isolation fails, it would still make sense to clear the young > bit: we cannot reclaim the page as we wanted to, but the user still > provided a clear hint that the page is cold and she won't be touching > it for a while. MADV_PAGEOUT is basically MADV_COLD + try_to_reclaim. > So IMO isolation should go to the end next to deactivate_page(). Sure, I will modify MADV_PAGEOUT patch instead of refactoring one. Thanks for the review, Johannes!
diff --git a/include/linux/swap.h b/include/linux/swap.h index 0ce997edb8bb..063c0c1e112b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -365,6 +365,7 @@ extern int vm_swappiness; extern int remove_mapping(struct address_space *mapping, struct page *page); extern unsigned long vm_total_pages; +extern unsigned long reclaim_pages(struct list_head *page_list); #ifdef CONFIG_NUMA extern int node_reclaim_mode; extern int sysctl_min_unmapped_ratio; diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index ef8a56927b12..c613abdb7284 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -46,6 +46,7 @@ #define MADV_WILLNEED 3 /* will need these pages */ #define MADV_DONTNEED 4 /* don't need these pages */ #define MADV_COLD 5 /* deactivatie these pages */ +#define MADV_PAGEOUT 6 /* reclaim these pages */ /* common parameters: try to keep these consistent across architectures */ #define MADV_FREE 8 /* free pages only if memory pressure */ diff --git a/mm/madvise.c b/mm/madvise.c index bae0055f9724..bc2f0138982e 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -11,6 +11,7 @@ #include <linux/syscalls.h> #include <linux/mempolicy.h> #include <linux/page-isolation.h> +#include <linux/page_idle.h> #include <linux/userfaultfd_k.h> #include <linux/hugetlb.h> #include <linux/falloc.h> @@ -41,6 +42,7 @@ static int madvise_need_mmap_write(int behavior) case MADV_WILLNEED: case MADV_DONTNEED: case MADV_COLD: + case MADV_PAGEOUT: case MADV_FREE: return 0; default: @@ -480,6 +482,198 @@ static long madvise_cold(struct vm_area_struct *vma, return 0; } +static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct mmu_gather *tlb = walk->private; + struct mm_struct *mm = tlb->mm; + struct vm_area_struct *vma = walk->vma; + pte_t *orig_pte, *pte, ptent; + spinlock_t *ptl; + LIST_HEAD(page_list); + struct page *page; + unsigned 
long next; + + if (fatal_signal_pending(current)) + return -EINTR; + + next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*pmd)) { + pmd_t orig_pmd; + + tlb_change_page_size(tlb, HPAGE_PMD_SIZE); + ptl = pmd_trans_huge_lock(pmd, vma); + if (!ptl) + return 0; + + orig_pmd = *pmd; + if (is_huge_zero_pmd(orig_pmd)) + goto huge_unlock; + + if (unlikely(!pmd_present(orig_pmd))) { + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(orig_pmd)); + goto huge_unlock; + } + + page = pmd_page(orig_pmd); + if (next - addr != HPAGE_PMD_SIZE) { + int err; + + if (page_mapcount(page) != 1) + goto huge_unlock; + get_page(page); + spin_unlock(ptl); + lock_page(page); + err = split_huge_page(page); + unlock_page(page); + put_page(page); + if (!err) + goto regular_page; + return 0; + } + + if (isolate_lru_page(page)) + goto huge_unlock; + + if (pmd_young(orig_pmd)) { + pmdp_invalidate(vma, addr, pmd); + orig_pmd = pmd_mkold(orig_pmd); + + set_pmd_at(mm, addr, pmd, orig_pmd); + tlb_remove_tlb_entry(tlb, pmd, addr); + } + + ClearPageReferenced(page); + test_and_clear_page_young(page); + list_add(&page->lru, &page_list); +huge_unlock: + spin_unlock(ptl); + reclaim_pages(&page_list); + return 0; + } + + if (pmd_trans_unstable(pmd)) + return 0; +regular_page: + tlb_change_page_size(tlb, PAGE_SIZE); + orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + flush_tlb_batched_pending(mm); + arch_enter_lazy_mmu_mode(); + for (; addr < end; pte++, addr += PAGE_SIZE) { + ptent = *pte; + if (!pte_present(ptent)) + continue; + + page = vm_normal_page(vma, addr, ptent); + if (!page) + continue; + + /* + * creating a THP page is expensive so split it only if we + * are sure it's worth. Split it if we are only owner. 
+ */ + if (PageTransCompound(page)) { + if (page_mapcount(page) != 1) + break; + get_page(page); + if (!trylock_page(page)) { + put_page(page); + break; + } + pte_unmap_unlock(orig_pte, ptl); + if (split_huge_page(page)) { + unlock_page(page); + put_page(page); + pte_offset_map_lock(mm, pmd, addr, &ptl); + break; + } + unlock_page(page); + put_page(page); + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + pte--; + addr -= PAGE_SIZE; + continue; + } + + VM_BUG_ON_PAGE(PageTransCompound(page), page); + + if (isolate_lru_page(page)) + continue; + + if (pte_young(ptent)) { + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); + ptent = pte_mkold(ptent); + set_pte_at(mm, addr, pte, ptent); + tlb_remove_tlb_entry(tlb, pte, addr); + } + ClearPageReferenced(page); + test_and_clear_page_young(page); + list_add(&page->lru, &page_list); + } + + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(orig_pte, ptl); + reclaim_pages(&page_list); + cond_resched(); + + return 0; +} + +static void madvise_pageout_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + struct mm_walk pageout_walk = { + .pmd_entry = madvise_pageout_pte_range, + .mm = vma->vm_mm, + .private = tlb, + }; + + tlb_start_vma(tlb, vma); + walk_page_range(addr, end, &pageout_walk); + tlb_end_vma(tlb, vma); +} + +static inline bool can_do_pageout(struct vm_area_struct *vma) +{ + if (vma_is_anonymous(vma)) + return true; + if (!vma->vm_file) + return false; + /* + * paging out pagecache only for non-anonymous mappings that correspond + * to the files the calling process could (if tried) open for writing; + * otherwise we'd be including shared non-exclusive mappings, which + * opens a side channel. 
+ */ + return inode_owner_or_capable(file_inode(vma->vm_file)) || + inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0; +} + +static long madvise_pageout(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start_addr, unsigned long end_addr) +{ + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather tlb; + + *prev = vma; + if (!can_madv_lru_vma(vma)) + return -EINVAL; + + if (!can_do_pageout(vma)) + return 0; + + lru_add_drain(); + tlb_gather_mmu(&tlb, mm, start_addr, end_addr); + madvise_pageout_page_range(&tlb, vma, start_addr, end_addr); + tlb_finish_mmu(&tlb, start_addr, end_addr); + + return 0; +} + static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -870,6 +1064,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, return madvise_willneed(vma, prev, start, end); case MADV_COLD: return madvise_cold(vma, prev, start, end); + case MADV_PAGEOUT: + return madvise_pageout(vma, prev, start, end); case MADV_FREE: case MADV_DONTNEED: return madvise_dontneed_free(vma, prev, start, end, behavior); @@ -892,6 +1088,7 @@ madvise_behavior_valid(int behavior) case MADV_DONTNEED: case MADV_FREE: case MADV_COLD: + case MADV_PAGEOUT: #ifdef CONFIG_KSM case MADV_MERGEABLE: case MADV_UNMERGEABLE: diff --git a/mm/vmscan.c b/mm/vmscan.c index ca192b792d4f..bda3c41de767 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2153,6 +2153,61 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_deactivate, nr_rotated, sc->priority, file); } +unsigned long reclaim_pages(struct list_head *page_list) +{ + int nid = -1; + unsigned long nr_reclaimed = 0; + LIST_HEAD(node_page_list); + struct reclaim_stat dummy_stat; + struct page *page; + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .priority = DEF_PRIORITY, + .may_writepage = 1, + .may_unmap = 1, + .may_swap = 1, + }; + + while (!list_empty(page_list)) { + page = lru_to_page(page_list); + if (nid == -1) { + nid = 
page_to_nid(page); + INIT_LIST_HEAD(&node_page_list); + } + + if (nid == page_to_nid(page)) { + list_move(&page->lru, &node_page_list); + continue; + } + + nr_reclaimed += shrink_page_list(&node_page_list, + NODE_DATA(nid), + &sc, 0, + &dummy_stat, false); + while (!list_empty(&node_page_list)) { + page = lru_to_page(&node_page_list); + list_del(&page->lru); + putback_lru_page(page); + } + + nid = -1; + } + + if (!list_empty(&node_page_list)) { + nr_reclaimed += shrink_page_list(&node_page_list, + NODE_DATA(nid), + &sc, 0, + &dummy_stat, false); + while (!list_empty(&node_page_list)) { + page = lru_to_page(&node_page_list); + list_del(&page->lru); + putback_lru_page(page); + } + } + + return nr_reclaimed; +} + /* * The inactive anon list should be small enough that the VM never has * to do too much work.