@@ -1784,7 +1784,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
if (!pte_present(pte))
return NULL;
- page = vm_normal_page(vma, addr, pte);
+ page = vm_normal_lru_page(vma, addr, pte);
if (!page)
return NULL;
@@ -593,8 +593,8 @@ struct vm_operations_struct {
unsigned long addr);
#endif
/*
- * Called by vm_normal_page() for special PTEs to find the
- * page for @addr. This is useful if the default behavior
+ * Called by vm_normal_*_page() for special PTEs to find the
+ * page for @addr. This is useful if the default behavior
* (using pte_page()) would not find the correct page.
*/
struct page *(*find_special_page)(struct vm_area_struct *vma,
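For reference, a minimal sketch of how a driver might wire up this hook (all names below are hypothetical; only the vm_operations_struct field and its signature come from the header above). The hook simply hands vm_normal_*_page() the page backing a special pte:

	/* Hypothetical driver state: one kernel page backing the whole VMA. */
	struct my_mapping {
		struct page *page;
	};

	static struct page *my_find_special_page(struct vm_area_struct *vma,
						 unsigned long addr)
	{
		struct my_mapping *m = vma->vm_private_data;

		/* Return the page for @addr; pte_page() would not find it. */
		return m->page;
	}

	static const struct vm_operations_struct my_vm_ops = {
		.find_special_page = my_find_special_page,
	};
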
@@ -1783,6 +1783,8 @@ extern void user_shm_unlock(size_t, struct ucounts *);
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte);
+struct page *vm_normal_lru_page(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte);
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd);
@@ -2901,6 +2903,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */
#define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */
#define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */
+#define FOLL_LRU 0x100000 /* return only LRU (anon or page cache) */
/*
* FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each
@@ -3227,7 +3230,7 @@ extern long copy_huge_page_from_user(struct page *dst_page,
* @vma: Pointer to the struct vm_area_struct to consider
*
* Whether transhuge page-table entries are considered "special" following
- * the definition in vm_normal_page().
+ * the definition in vm_normal_*_page().
*
* Return: true if transhuge page-table entries should be considered special,
* false otherwise.
@@ -539,8 +539,10 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
pte_unmap_unlock(ptep, ptl);
return NULL;
}
-
- page = vm_normal_page(vma, address, pte);
+ if (flags & (FOLL_MLOCK | FOLL_LRU))
+ page = vm_normal_lru_page(vma, address, pte);
+ else
+ page = vm_normal_page(vma, address, pte);
if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
/*
* Only return device mapping pages in the FOLL_GET or FOLL_PIN
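From the caller's side, FOLL_LRU asks GUP to return only LRU (anon or page cache) pages, so device-coherent pages are filtered out before the rest of the follow_page_pte() handling. A minimal usage sketch (the helper name is hypothetical; the caller is assumed to hold mmap_lock):

	/* Hypothetical caller relying on the FOLL_LRU filtering above. */
	static int scan_one_user_page(struct vm_area_struct *vma,
				      unsigned long addr)
	{
		struct page *page;

		page = follow_page(vma, addr, FOLL_GET | FOLL_LRU);
		if (IS_ERR_OR_NULL(page))
			return -EFAULT;

		/* With FOLL_LRU, only LRU (anon or page cache) pages are
		 * expected here, never device-coherent pages. */
		put_page(page);
		return 0;
	}
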
@@ -824,7 +826,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
*
* Return: the mapped (struct page *), %NULL if no mapping exists, or
* an error pointer if there is a mapping to something not represented
- * by a page descriptor (see also vm_normal_page()).
+ * by a page descriptor (see also vm_normal_*_page()).
*/
static struct page *follow_page_mask(struct vm_area_struct *vma,
unsigned long address, unsigned int flags,
@@ -2966,7 +2966,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
}
/* FOLL_DUMP to ignore special (like zero) pages */
- follflags = FOLL_GET | FOLL_DUMP;
+ follflags = FOLL_GET | FOLL_DUMP | FOLL_LRU;
page = follow_page(vma, addr, follflags);
if (IS_ERR(page))
@@ -627,7 +627,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
result = SCAN_PTE_NON_PRESENT;
goto out;
}
- page = vm_normal_page(vma, address, pteval);
+ page = vm_normal_lru_page(vma, address, pteval);
if (unlikely(!page)) {
result = SCAN_PAGE_NULL;
goto out;
@@ -1286,7 +1286,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
if (pte_write(pteval))
writable = true;
- page = vm_normal_page(vma, _address, pteval);
+ page = vm_normal_lru_page(vma, _address, pteval);
if (unlikely(!page)) {
result = SCAN_PAGE_NULL;
goto out_unmap;
@@ -1494,7 +1494,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
if (!pte_present(*pte))
goto abort;
- page = vm_normal_page(vma, addr, *pte);
+ page = vm_normal_lru_page(vma, addr, *pte);
/*
* Note that uprobe, debugger, or MAP_PRIVATE may change the
@@ -1512,7 +1512,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
if (pte_none(*pte))
continue;
- page = vm_normal_page(vma, addr, *pte);
+ page = vm_normal_lru_page(vma, addr, *pte);
page_remove_rmap(page, false);
}
@@ -474,7 +474,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
do {
cond_resched();
page = follow_page(vma, addr,
- FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
+ FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE | FOLL_LRU);
if (IS_ERR_OR_NULL(page))
break;
if (PageKsm(page))
@@ -559,7 +559,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
if (!vma)
goto out;
- page = follow_page(vma, addr, FOLL_GET);
+ page = follow_page(vma, addr, FOLL_GET | FOLL_LRU);
if (IS_ERR_OR_NULL(page))
goto out;
if (PageAnon(page)) {
@@ -411,7 +411,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
if (!pte_present(ptent))
continue;
- page = vm_normal_page(vma, addr, ptent);
+ page = vm_normal_lru_page(vma, addr, ptent);
if (!page)
continue;
@@ -621,7 +621,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
continue;
}
- page = vm_normal_page(vma, addr, ptent);
+ page = vm_normal_lru_page(vma, addr, ptent);
if (!page)
continue;
@@ -621,6 +621,13 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
if (is_zero_pfn(pfn))
return NULL;
if (pte_devmap(pte))
+/*
+ * NOTE: Technically this should goto the check_pfn label. However,
+ * page->_mapcount is never incremented for device pages mmapped through the
+ * DAX mechanism (e.g. an ext4 filesystem on a pmem device). When such pages
+ * are unmapped, zap_pte_range() calls vm_normal_page(), which returns a valid
+ * page with page_mapcount() == 0 before page_remove_rmap() is called.
+ */
return NULL;
print_bad_pte(vma, addr, pte, NULL);
@@ -661,6 +668,35 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
return pfn_to_page(pfn);
}
+/**
+ * vm_normal_lru_page - get the LRU-handled page associated with a pte.
+ *
+ * @vma: user vma the page belongs to.
+ * @addr: user address the page belongs to.
+ * @pte: page table entry associated with the page.
+ *
+ * This function gets the "struct page" associated with a pte, but only for
+ * pages that can be put on an LRU list and that support NUMA migration, KSM
+ * and THP.
+ *
+ * With the introduction of DEVICE_COHERENT pages, vm_normal_page() can
+ * return device-managed anonymous pages that are not LRU pages.
+ * vm_normal_lru_page() makes sure that only LRU-handled pages are returned.
+ *
+ * Return: the "struct page" for the pte, or NULL if there is no LRU page.
+ */
+struct page *vm_normal_lru_page(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte)
+{
+ struct page *page;
+
+ page = vm_normal_page(vma, addr, pte);
+ if (page && is_zone_device_page(page))
+ return NULL;
+
+ return page;
+}
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd)
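The intended call pattern for the new helper mirrors the madvise, mempolicy and khugepaged hunks elsewhere in this patch; below is a minimal sketch (the pte handler itself is hypothetical, and, as in those callers, the pte is assumed to be mapped with its page table lock held):

	/* Hypothetical pte handler that acts only on LRU-handled pages. */
	static void handle_lru_pte(struct vm_area_struct *vma,
				   unsigned long addr, pte_t ptent)
	{
		struct page *page;

		if (!pte_present(ptent))
			return;

		/* NULL covers "no page" as well as "device-managed page". */
		page = vm_normal_lru_page(vma, addr, ptent);
		if (!page)
			return;

		/* page is anon or page cache, so LRU handling is safe here. */
	}
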
@@ -2168,7 +2204,7 @@ EXPORT_SYMBOL(vmf_insert_pfn);
static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
{
- /* these checks mirror the abort conditions in vm_normal_page */
+ /* these checks mirror the abort conditions in vm_normal_lru_page */
if (vma->vm_flags & VM_MIXEDMAP)
return true;
if (pfn_t_devmap(pfn))
@@ -4364,7 +4400,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
old_pte = ptep_get(vmf->pte);
pte = pte_modify(old_pte, vma->vm_page_prot);
- page = vm_normal_page(vma, vmf->address, pte);
+ page = vm_normal_lru_page(vma, vmf->address, pte);
if (!page)
goto out_map;
@@ -527,11 +527,11 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
for (; addr != end; pte++, addr += PAGE_SIZE) {
if (!pte_present(*pte))
continue;
- page = vm_normal_page(vma, addr, *pte);
+ page = vm_normal_lru_page(vma, addr, *pte);
if (!page)
continue;
/*
- * vm_normal_page() filters out zero pages, but there might
+ * vm_normal_lru_page() filters out zero pages, but there might
* still be PageReserved pages to skip, perhaps in a VDSO.
*/
if (PageReserved(page))
@@ -1614,7 +1614,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
goto out;
/* FOLL_DUMP to ignore special (like zero) pages */
- follflags = FOLL_GET | FOLL_DUMP;
+ follflags = FOLL_GET | FOLL_DUMP | FOLL_LRU;
page = follow_page(vma, addr, follflags);
err = PTR_ERR(page);
@@ -342,7 +342,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
* a non-TPH page already pinned and in the @pvec, and that it belongs to @zone.
*
* The rest of @pvec is filled by subsequent pages within the same pmd and same
- * zone, as long as the pte's are present and vm_normal_page() succeeds. These
+ * zone, as long as the pte's are present and vm_normal_lru_page() succeeds. These
* pages also get pinned.
*
* Returns the address of the next page that should be scanned. This equals
@@ -373,7 +373,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
struct page *page = NULL;
pte++;
if (pte_present(*pte))
- page = vm_normal_page(vma, start, *pte);
+ page = vm_normal_lru_page(vma, start, *pte);
/*
* Break if page could not be obtained or the page's node+zone does not
* match
@@ -439,7 +439,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
* suits munlock very well (and if somehow an abnormal page
* has sneaked into the range, we won't oops here: great).
*/
- page = follow_page(vma, start, FOLL_GET | FOLL_DUMP);
+ page = follow_page(vma, start, FOLL_GET | FOLL_DUMP | FOLL_LRU);
if (page && !IS_ERR(page)) {
if (PageTransTail(page)) {
@@ -88,7 +88,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (pte_protnone(oldpte))
continue;
- page = vm_normal_page(vma, addr, oldpte);
+ page = vm_normal_lru_page(vma, addr, oldpte);
if (!page || PageKsm(page))
continue;