diff mbox series

[v3,02/13] mm: handling Non-LRU pages returned by vm_normal_pages

Message ID 20220524190632.3304-3-alex.sierra@amd.com (mailing list archive)
State New
Headers show
Series Add MEMORY_DEVICE_COHERENT for coherent device memory mapping | expand

Commit Message

Alex Sierra May 24, 2022, 7:06 p.m. UTC
With DEVICE_COHERENT, we'll soon have vm_normal_page() return
device-managed anonymous pages that are not LRU pages. Although they
behave like normal pages for purposes of mapping in CPU page tables and
for COW, they do not support LRU lists, NUMA migration or THP.

We also introduced a FOLL_LRU flag that adds the same behaviour to
follow_page and related APIs, to allow callers to specify that they
expect to put pages on an LRU list.

Signed-off-by: Alex Sierra <alex.sierra@amd.com>
Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 fs/proc/task_mmu.c | 2 +-
 include/linux/mm.h | 3 ++-
 mm/gup.c           | 2 ++
 mm/huge_memory.c   | 2 +-
 mm/khugepaged.c    | 9 ++++++---
 mm/ksm.c           | 6 +++---
 mm/madvise.c       | 4 ++--
 mm/memory.c        | 9 ++++++++-
 mm/mempolicy.c     | 2 +-
 mm/migrate.c       | 4 ++--
 mm/mlock.c         | 2 +-
 mm/mprotect.c      | 2 +-
 12 files changed, 30 insertions(+), 17 deletions(-)

Comments

Alistair Popple May 25, 2022, 4:11 a.m. UTC | #1
Alex Sierra <alex.sierra@amd.com> writes:

> With DEVICE_COHERENT, we'll soon have vm_normal_pages() return
> device-managed anonymous pages that are not LRU pages. Although they
> behave like normal pages for purposes of mapping in CPU page, and for
> COW. They do not support LRU lists, NUMA migration or THP.
>
> We also introduced a FOLL_LRU flag that adds the same behaviour to
> follow_page and related APIs, to allow callers to specify that they
> expect to put pages on an LRU list.

Continuing the follow up from the thread for v2:

>> This means by default GUP can return non-LRU pages. I didn't see
>> anywhere that would be a problem but I didn't check everything. Did you
>> check this or is there some other reason I've missed that makes this not
>> a problem?

> I have double checked all gup and pin_user_pages callers and none of them seem
> to have interaction with LRU APIs.

And actually if I'm understanding things correctly callers of
GUP/PUP/follow_page_pte() should already expect to get non-LRU pages
returned:

    page = vm_normal_page(vma, address, pte);
    if ((flags & FOLL_LRU) && page && is_zone_device_page(page))
        page = NULL;
    if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
        /*
         * Only return device mapping pages in the FOLL_GET or FOLL_PIN
         * case since they are only valid while holding the pgmap
         * reference.
         */
        *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
        if (*pgmap)
            page = pte_page(pte);

Which I think makes FOLL_LRU confusing, because if I understand correctly
even with FOLL_LRU it is still possible for follow_page_pte() to return
a non-LRU page. Could we do something like this to make it consistent:

    if ((flags & FOLL_LRU) && (page && is_zone_device_page(page) ||
        !page && pte_devmap(pte)))

Looking at callers that currently use FOLL_LRU I don't think this would
change any behaviour as they already filter out devmap through various
other means.

>
> Signed-off-by: Alex Sierra <alex.sierra@amd.com>
> Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
> ---
>  fs/proc/task_mmu.c | 2 +-
>  include/linux/mm.h | 3 ++-
>  mm/gup.c           | 2 ++
>  mm/huge_memory.c   | 2 +-
>  mm/khugepaged.c    | 9 ++++++---
>  mm/ksm.c           | 6 +++---
>  mm/madvise.c       | 4 ++--
>  mm/memory.c        | 9 ++++++++-
>  mm/mempolicy.c     | 2 +-
>  mm/migrate.c       | 4 ++--
>  mm/mlock.c         | 2 +-
>  mm/mprotect.c      | 2 +-
>  12 files changed, 30 insertions(+), 17 deletions(-)
>
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index f46060eb91b5..5d620733f173 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -1785,7 +1785,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
>  		return NULL;
>
>  	page = vm_normal_page(vma, addr, pte);
> -	if (!page)
> +	if (!page || is_zone_device_page(page))
>  		return NULL;
>
>  	if (PageReserved(page))
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 9f44254af8ce..d7f253a0c41e 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -601,7 +601,7 @@ struct vm_operations_struct {
>  #endif
>  	/*
>  	 * Called by vm_normal_page() for special PTEs to find the
> -	 * page for @addr.  This is useful if the default behavior
> +	 * page for @addr. This is useful if the default behavior
>  	 * (using pte_page()) would not find the correct page.
>  	 */
>  	struct page *(*find_special_page)(struct vm_area_struct *vma,
> @@ -2929,6 +2929,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
>  #define FOLL_NUMA	0x200	/* force NUMA hinting page fault */
>  #define FOLL_MIGRATION	0x400	/* wait for page to replace migration entry */
>  #define FOLL_TRIED	0x800	/* a retry, previous pass started an IO */
> +#define FOLL_LRU        0x1000  /* return only LRU (anon or page cache) */
>  #define FOLL_REMOTE	0x2000	/* we are working on non-current tsk/mm */
>  #define FOLL_COW	0x4000	/* internal GUP flag */
>  #define FOLL_ANON	0x8000	/* don't do file mappings */
> diff --git a/mm/gup.c b/mm/gup.c
> index 501bc150792c..c9cbac06bcc5 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -479,6 +479,8 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
>  	}
>
>  	page = vm_normal_page(vma, address, pte);
> +	if ((flags & FOLL_LRU) && page && is_zone_device_page(page))
> +		page = NULL;
>  	if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
>  		/*
>  		 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 910a138e9859..eed80696c5fd 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2856,7 +2856,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
>  		}
>
>  		/* FOLL_DUMP to ignore special (like zero) pages */
> -		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
> +		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
>
>  		if (IS_ERR(page))
>  			continue;
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index a4e5eaf3eb01..8bf4126b6b9c 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -627,7 +627,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  			goto out;
>  		}
>  		page = vm_normal_page(vma, address, pteval);
> -		if (unlikely(!page)) {
> +		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
>  			result = SCAN_PAGE_NULL;
>  			goto out;
>  		}
> @@ -1276,7 +1276,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>  			writable = true;
>
>  		page = vm_normal_page(vma, _address, pteval);
> -		if (unlikely(!page)) {
> +		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
>  			result = SCAN_PAGE_NULL;
>  			goto out_unmap;
>  		}
> @@ -1484,7 +1484,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
>  			goto abort;
>
>  		page = vm_normal_page(vma, addr, *pte);
> -
> +		if (WARN_ON_ONCE(page && is_zone_device_page(page)))
> +			page = NULL;
>  		/*
>  		 * Note that uprobe, debugger, or MAP_PRIVATE may change the
>  		 * page table, but the new page will not be a subpage of hpage.
> @@ -1502,6 +1503,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
>  		if (pte_none(*pte))
>  			continue;
>  		page = vm_normal_page(vma, addr, *pte);
> +		if (WARN_ON_ONCE(page && is_zone_device_page(page)))
> +			goto abort;
>  		page_remove_rmap(page, vma, false);
>  	}
>
> diff --git a/mm/ksm.c b/mm/ksm.c
> index 063a48eeb5ee..f16056efca21 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -474,7 +474,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
>  	do {
>  		cond_resched();
>  		page = follow_page(vma, addr,
> -				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
> +				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE | FOLL_LRU);
>  		if (IS_ERR_OR_NULL(page))
>  			break;
>  		if (PageKsm(page))
> @@ -559,7 +559,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
>  	if (!vma)
>  		goto out;
>
> -	page = follow_page(vma, addr, FOLL_GET);
> +	page = follow_page(vma, addr, FOLL_GET | FOLL_LRU);
>  	if (IS_ERR_OR_NULL(page))
>  		goto out;
>  	if (PageAnon(page)) {
> @@ -2288,7 +2288,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
>  		while (ksm_scan.address < vma->vm_end) {
>  			if (ksm_test_exit(mm))
>  				break;
> -			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
> +			*page = follow_page(vma, ksm_scan.address, FOLL_GET | FOLL_LRU);
>  			if (IS_ERR_OR_NULL(*page)) {
>  				ksm_scan.address += PAGE_SIZE;
>  				cond_resched();
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 1873616a37d2..e9c24c834e98 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -413,7 +413,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>  			continue;
>
>  		page = vm_normal_page(vma, addr, ptent);
> -		if (!page)
> +		if (!page || is_zone_device_page(page))
>  			continue;
>
>  		/*
> @@ -628,7 +628,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
>  		}
>
>  		page = vm_normal_page(vma, addr, ptent);
> -		if (!page)
> +		if (!page || is_zone_device_page(page))
>  			continue;
>
>  		/*
> diff --git a/mm/memory.c b/mm/memory.c
> index 76e3af9639d9..571a26805ee1 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -621,6 +621,13 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
>  		if (is_zero_pfn(pfn))
>  			return NULL;
>  		if (pte_devmap(pte))
> +/*
> + * NOTE: New users of ZONE_DEVICE will not set pte_devmap() and will have
> + * refcounts incremented on their struct pages when they are inserted into
> + * PTEs, thus they are safe to return here. Legacy ZONE_DEVICE pages that set
> + * pte_devmap() do not have refcounts. Example of legacy ZONE_DEVICE is
> + * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers.
> + */
>  			return NULL;
>
>  		print_bad_pte(vma, addr, pte, NULL);
> @@ -4422,7 +4429,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>  	pte = pte_modify(old_pte, vma->vm_page_prot);
>
>  	page = vm_normal_page(vma, vmf->address, pte);
> -	if (!page)
> +	if (!page || is_zone_device_page(page))
>  		goto out_map;
>
>  	/* TODO: handle PTE-mapped THP */
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 8c74107a2b15..e32edbecb0cd 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -528,7 +528,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
>  		if (!pte_present(*pte))
>  			continue;
>  		page = vm_normal_page(vma, addr, *pte);
> -		if (!page)
> +		if (!page || is_zone_device_page(page))
>  			continue;
>  		/*
>  		 * vm_normal_page() filters out zero pages, but there might
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 6c31ee1e1c9b..c5d50e96ecd7 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -1611,7 +1611,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
>  		goto out;
>
>  	/* FOLL_DUMP to ignore special (like zero) pages */
> -	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
> +	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
>
>  	err = PTR_ERR(page);
>  	if (IS_ERR(page))
> @@ -1802,7 +1802,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
>  			goto set_status;
>
>  		/* FOLL_DUMP to ignore special (like zero) pages */
> -		page = follow_page(vma, addr, FOLL_DUMP);
> +		page = follow_page(vma, addr, FOLL_DUMP | FOLL_LRU);
>
>  		err = PTR_ERR(page);
>  		if (IS_ERR(page))
> diff --git a/mm/mlock.c b/mm/mlock.c
> index 716caf851043..b14e929084cc 100644
> --- a/mm/mlock.c
> +++ b/mm/mlock.c
> @@ -333,7 +333,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
>  		if (!pte_present(*pte))
>  			continue;
>  		page = vm_normal_page(vma, addr, *pte);
> -		if (!page)
> +		if (!page || is_zone_device_page(page))
>  			continue;
>  		if (PageTransCompound(page))
>  			continue;
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index b69ce7a7b2b7..a6f3587ea29a 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -91,7 +91,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
>  					continue;
>
>  				page = vm_normal_page(vma, addr, oldpte);
> -				if (!page || PageKsm(page))
> +				if (!page || is_zone_device_page(page) || PageKsm(page))
>  					continue;
>
>  				/* Also skip shared copy-on-write pages */
Alex Sierra May 26, 2022, 4:07 p.m. UTC | #2
On 5/24/2022 11:11 PM, Alistair Popple wrote:
> Alex Sierra <alex.sierra@amd.com> writes:
>
>> With DEVICE_COHERENT, we'll soon have vm_normal_pages() return
>> device-managed anonymous pages that are not LRU pages. Although they
>> behave like normal pages for purposes of mapping in CPU page, and for
>> COW. They do not support LRU lists, NUMA migration or THP.
>>
>> We also introduced a FOLL_LRU flag that adds the same behaviour to
>> follow_page and related APIs, to allow callers to specify that they
>> expect to put pages on an LRU list.
> Continuing the follow up from the thread for v2:
>
>>> This means by default GUP can return non-LRU pages. I didn't see
>>> anywhere that would be a problem but I didn't check everything. Did you
>>> check this or is there some other reason I've missed that makes this not
>>> a problem?
>> I have double checked all gup and pin_user_pages callers and none of them seem
>> to have interaction with LRU APIs.
> And actually if I'm understanding things correctly callers of
> GUP/PUP/follow_page_pte() should already expect to get non-LRU pages
> returned:
>
>      page = vm_normal_page(vma, address, pte);
>      if ((flags & FOLL_LRU) && page && is_zone_device_page(page))
>          page = NULL;
>      if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
>          /*
>           * Only return device mapping pages in the FOLL_GET or FOLL_PIN
>           * case since they are only valid while holding the pgmap
>           * reference.
>           */
>          *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
>          if (*pgmap)
>              page = pte_page(pte);
>
> Which I think makes FOLL_LRU confusing, because if understand correctly
> even with FOLL_LRU it is still possible for follow_page_pte() to return
> a non-LRU page. Could we do something like this to make it consistent:
>
>      if ((flags & FOLL_LRU) && (page && is_zone_device_page(page) ||
>          !page && pte_devmap(pte)))

Hi Alistair,
Not sure if this suggestion is a replacement for the first or the second
condition in the code snippet above. We know that device coherent pages
will not have pte_devmap() set. So we could do the following:

  if ((flags & FOLL_LRU) && page && is_zone_device_page(page))
-	page = NULL;
+	goto no_page;

Regards,
Alex Sierra

>
> Looking at callers that currently use FOLL_LRU I don't think this would
> change any behaviour as they already filter out devmap through various
> other means.
>
>> Signed-off-by: Alex Sierra <alex.sierra@amd.com>
>> Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
>> ---
>>   fs/proc/task_mmu.c | 2 +-
>>   include/linux/mm.h | 3 ++-
>>   mm/gup.c           | 2 ++
>>   mm/huge_memory.c   | 2 +-
>>   mm/khugepaged.c    | 9 ++++++---
>>   mm/ksm.c           | 6 +++---
>>   mm/madvise.c       | 4 ++--
>>   mm/memory.c        | 9 ++++++++-
>>   mm/mempolicy.c     | 2 +-
>>   mm/migrate.c       | 4 ++--
>>   mm/mlock.c         | 2 +-
>>   mm/mprotect.c      | 2 +-
>>   12 files changed, 30 insertions(+), 17 deletions(-)
>>
>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>> index f46060eb91b5..5d620733f173 100644
>> --- a/fs/proc/task_mmu.c
>> +++ b/fs/proc/task_mmu.c
>> @@ -1785,7 +1785,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
>>   		return NULL;
>>
>>   	page = vm_normal_page(vma, addr, pte);
>> -	if (!page)
>> +	if (!page || is_zone_device_page(page))
>>   		return NULL;
>>
>>   	if (PageReserved(page))
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index 9f44254af8ce..d7f253a0c41e 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -601,7 +601,7 @@ struct vm_operations_struct {
>>   #endif
>>   	/*
>>   	 * Called by vm_normal_page() for special PTEs to find the
>> -	 * page for @addr.  This is useful if the default behavior
>> +	 * page for @addr. This is useful if the default behavior
>>   	 * (using pte_page()) would not find the correct page.
>>   	 */
>>   	struct page *(*find_special_page)(struct vm_area_struct *vma,
>> @@ -2929,6 +2929,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
>>   #define FOLL_NUMA	0x200	/* force NUMA hinting page fault */
>>   #define FOLL_MIGRATION	0x400	/* wait for page to replace migration entry */
>>   #define FOLL_TRIED	0x800	/* a retry, previous pass started an IO */
>> +#define FOLL_LRU        0x1000  /* return only LRU (anon or page cache) */
>>   #define FOLL_REMOTE	0x2000	/* we are working on non-current tsk/mm */
>>   #define FOLL_COW	0x4000	/* internal GUP flag */
>>   #define FOLL_ANON	0x8000	/* don't do file mappings */
>> diff --git a/mm/gup.c b/mm/gup.c
>> index 501bc150792c..c9cbac06bcc5 100644
>> --- a/mm/gup.c
>> +++ b/mm/gup.c
>> @@ -479,6 +479,8 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
>>   	}
>>
>>   	page = vm_normal_page(vma, address, pte);
>> +	if ((flags & FOLL_LRU) && page && is_zone_device_page(page))
>> +		page = NULL;
>>   	if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
>>   		/*
>>   		 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 910a138e9859..eed80696c5fd 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -2856,7 +2856,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
>>   		}
>>
>>   		/* FOLL_DUMP to ignore special (like zero) pages */
>> -		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
>> +		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
>>
>>   		if (IS_ERR(page))
>>   			continue;
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index a4e5eaf3eb01..8bf4126b6b9c 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -627,7 +627,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>>   			goto out;
>>   		}
>>   		page = vm_normal_page(vma, address, pteval);
>> -		if (unlikely(!page)) {
>> +		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
>>   			result = SCAN_PAGE_NULL;
>>   			goto out;
>>   		}
>> @@ -1276,7 +1276,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>>   			writable = true;
>>
>>   		page = vm_normal_page(vma, _address, pteval);
>> -		if (unlikely(!page)) {
>> +		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
>>   			result = SCAN_PAGE_NULL;
>>   			goto out_unmap;
>>   		}
>> @@ -1484,7 +1484,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
>>   			goto abort;
>>
>>   		page = vm_normal_page(vma, addr, *pte);
>> -
>> +		if (WARN_ON_ONCE(page && is_zone_device_page(page)))
>> +			page = NULL;
>>   		/*
>>   		 * Note that uprobe, debugger, or MAP_PRIVATE may change the
>>   		 * page table, but the new page will not be a subpage of hpage.
>> @@ -1502,6 +1503,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
>>   		if (pte_none(*pte))
>>   			continue;
>>   		page = vm_normal_page(vma, addr, *pte);
>> +		if (WARN_ON_ONCE(page && is_zone_device_page(page)))
>> +			goto abort;
>>   		page_remove_rmap(page, vma, false);
>>   	}
>>
>> diff --git a/mm/ksm.c b/mm/ksm.c
>> index 063a48eeb5ee..f16056efca21 100644
>> --- a/mm/ksm.c
>> +++ b/mm/ksm.c
>> @@ -474,7 +474,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
>>   	do {
>>   		cond_resched();
>>   		page = follow_page(vma, addr,
>> -				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
>> +				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE | FOLL_LRU);
>>   		if (IS_ERR_OR_NULL(page))
>>   			break;
>>   		if (PageKsm(page))
>> @@ -559,7 +559,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
>>   	if (!vma)
>>   		goto out;
>>
>> -	page = follow_page(vma, addr, FOLL_GET);
>> +	page = follow_page(vma, addr, FOLL_GET | FOLL_LRU);
>>   	if (IS_ERR_OR_NULL(page))
>>   		goto out;
>>   	if (PageAnon(page)) {
>> @@ -2288,7 +2288,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
>>   		while (ksm_scan.address < vma->vm_end) {
>>   			if (ksm_test_exit(mm))
>>   				break;
>> -			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
>> +			*page = follow_page(vma, ksm_scan.address, FOLL_GET | FOLL_LRU);
>>   			if (IS_ERR_OR_NULL(*page)) {
>>   				ksm_scan.address += PAGE_SIZE;
>>   				cond_resched();
>> diff --git a/mm/madvise.c b/mm/madvise.c
>> index 1873616a37d2..e9c24c834e98 100644
>> --- a/mm/madvise.c
>> +++ b/mm/madvise.c
>> @@ -413,7 +413,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>>   			continue;
>>
>>   		page = vm_normal_page(vma, addr, ptent);
>> -		if (!page)
>> +		if (!page || is_zone_device_page(page))
>>   			continue;
>>
>>   		/*
>> @@ -628,7 +628,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
>>   		}
>>
>>   		page = vm_normal_page(vma, addr, ptent);
>> -		if (!page)
>> +		if (!page || is_zone_device_page(page))
>>   			continue;
>>
>>   		/*
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 76e3af9639d9..571a26805ee1 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -621,6 +621,13 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
>>   		if (is_zero_pfn(pfn))
>>   			return NULL;
>>   		if (pte_devmap(pte))
>> +/*
>> + * NOTE: New uers of ZONE_DEVICE will not set pte_devmap() and will have
>> + * refcounts incremented on their struct pages when they are inserted into
>> + * PTEs, thus they are safe to return here. Legacy ZONE_DEVICE pages that set
>> + * pte_devmap() do not have refcounts. Example of legacy ZONE_DEVICE is
>> + * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers.
>> + */
>>   			return NULL;
>>
>>   		print_bad_pte(vma, addr, pte, NULL);
>> @@ -4422,7 +4429,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>>   	pte = pte_modify(old_pte, vma->vm_page_prot);
>>
>>   	page = vm_normal_page(vma, vmf->address, pte);
>> -	if (!page)
>> +	if (!page || is_zone_device_page(page))
>>   		goto out_map;
>>
>>   	/* TODO: handle PTE-mapped THP */
>> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
>> index 8c74107a2b15..e32edbecb0cd 100644
>> --- a/mm/mempolicy.c
>> +++ b/mm/mempolicy.c
>> @@ -528,7 +528,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
>>   		if (!pte_present(*pte))
>>   			continue;
>>   		page = vm_normal_page(vma, addr, *pte);
>> -		if (!page)
>> +		if (!page || is_zone_device_page(page))
>>   			continue;
>>   		/*
>>   		 * vm_normal_page() filters out zero pages, but there might
>> diff --git a/mm/migrate.c b/mm/migrate.c
>> index 6c31ee1e1c9b..c5d50e96ecd7 100644
>> --- a/mm/migrate.c
>> +++ b/mm/migrate.c
>> @@ -1611,7 +1611,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
>>   		goto out;
>>
>>   	/* FOLL_DUMP to ignore special (like zero) pages */
>> -	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
>> +	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
>>
>>   	err = PTR_ERR(page);
>>   	if (IS_ERR(page))
>> @@ -1802,7 +1802,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
>>   			goto set_status;
>>
>>   		/* FOLL_DUMP to ignore special (like zero) pages */
>> -		page = follow_page(vma, addr, FOLL_DUMP);
>> +		page = follow_page(vma, addr, FOLL_DUMP | FOLL_LRU);
>>
>>   		err = PTR_ERR(page);
>>   		if (IS_ERR(page))
>> diff --git a/mm/mlock.c b/mm/mlock.c
>> index 716caf851043..b14e929084cc 100644
>> --- a/mm/mlock.c
>> +++ b/mm/mlock.c
>> @@ -333,7 +333,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
>>   		if (!pte_present(*pte))
>>   			continue;
>>   		page = vm_normal_page(vma, addr, *pte);
>> -		if (!page)
>> +		if (!page || is_zone_device_page(page))
>>   			continue;
>>   		if (PageTransCompound(page))
>>   			continue;
>> diff --git a/mm/mprotect.c b/mm/mprotect.c
>> index b69ce7a7b2b7..a6f3587ea29a 100644
>> --- a/mm/mprotect.c
>> +++ b/mm/mprotect.c
>> @@ -91,7 +91,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
>>   					continue;
>>
>>   				page = vm_normal_page(vma, addr, oldpte);
>> -				if (!page || PageKsm(page))
>> +				if (!page || is_zone_device_page(page) || PageKsm(page))
>>   					continue;
>>
>>   				/* Also skip shared copy-on-write pages */
Alistair Popple May 27, 2022, 1:28 a.m. UTC | #3
"Sierra Guiza, Alejandro (Alex)" <alex.sierra@amd.com> writes:

> On 5/24/2022 11:11 PM, Alistair Popple wrote:
>> Alex Sierra <alex.sierra@amd.com> writes:
>>
>>> With DEVICE_COHERENT, we'll soon have vm_normal_pages() return
>>> device-managed anonymous pages that are not LRU pages. Although they
>>> behave like normal pages for purposes of mapping in CPU page, and for
>>> COW. They do not support LRU lists, NUMA migration or THP.
>>>
>>> We also introduced a FOLL_LRU flag that adds the same behaviour to
>>> follow_page and related APIs, to allow callers to specify that they
>>> expect to put pages on an LRU list.
>> Continuing the follow up from the thread for v2:
>>
>>>> This means by default GUP can return non-LRU pages. I didn't see
>>>> anywhere that would be a problem but I didn't check everything. Did you
>>>> check this or is there some other reason I've missed that makes this not
>>>> a problem?
>>> I have double checked all gup and pin_user_pages callers and none of them seem
>>> to have interaction with LRU APIs.
>> And actually if I'm understanding things correctly callers of
>> GUP/PUP/follow_page_pte() should already expect to get non-LRU pages
>> returned:
>>
>>      page = vm_normal_page(vma, address, pte);
>>      if ((flags & FOLL_LRU) && page && is_zone_device_page(page))
>>          page = NULL;
>>      if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
>>          /*
>>           * Only return device mapping pages in the FOLL_GET or FOLL_PIN
>>           * case since they are only valid while holding the pgmap
>>           * reference.
>>           */
>>          *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
>>          if (*pgmap)
>>              page = pte_page(pte);
>>
>> Which I think makes FOLL_LRU confusing, because if understand correctly
>> even with FOLL_LRU it is still possible for follow_page_pte() to return
>> a non-LRU page. Could we do something like this to make it consistent:
>>
>>      if ((flags & FOLL_LRU) && (page && is_zone_device_page(page) ||
>>          !page && pte_devmap(pte)))
>
> Hi Alistair,
> Not sure if this suggestion is a replacement for the first or the second
> condition in the snip code above. We know device coherent type will not
> be set with devmap. So we could do the following:

Sorry, I must not have been clear enough. My understanding is if the
following condition is true:

>>      if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {

Then follow_page_pte() could return a non-LRU page even when FOLL_LRU is
specified (because I think a devmap page is a non-LRU page). That seems
confusing, so for consistency I was suggesting we should not return
devmap pages for FOLL_LRU.

To be clear I don't think there is an actual problem here atm, but the
inconsistency could easily lead to one in future.

>  if ((flags & FOLL_LRU) && page && is_zone_device_page(page))
> -	page = NULL;
> +	goto no_page;
>
> Regards,
> Alex Sierra
>
>>
>> Looking at callers that currently use FOLL_LRU I don't think this would
>> change any behaviour as they already filter out devmap through various
>> other means.
>>
>>> Signed-off-by: Alex Sierra <alex.sierra@amd.com>
>>> Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>> ---
>>>   fs/proc/task_mmu.c | 2 +-
>>>   include/linux/mm.h | 3 ++-
>>>   mm/gup.c           | 2 ++
>>>   mm/huge_memory.c   | 2 +-
>>>   mm/khugepaged.c    | 9 ++++++---
>>>   mm/ksm.c           | 6 +++---
>>>   mm/madvise.c       | 4 ++--
>>>   mm/memory.c        | 9 ++++++++-
>>>   mm/mempolicy.c     | 2 +-
>>>   mm/migrate.c       | 4 ++--
>>>   mm/mlock.c         | 2 +-
>>>   mm/mprotect.c      | 2 +-
>>>   12 files changed, 30 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>>> index f46060eb91b5..5d620733f173 100644
>>> --- a/fs/proc/task_mmu.c
>>> +++ b/fs/proc/task_mmu.c
>>> @@ -1785,7 +1785,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
>>>   		return NULL;
>>>
>>>   	page = vm_normal_page(vma, addr, pte);
>>> -	if (!page)
>>> +	if (!page || is_zone_device_page(page))
>>>   		return NULL;
>>>
>>>   	if (PageReserved(page))
>>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>>> index 9f44254af8ce..d7f253a0c41e 100644
>>> --- a/include/linux/mm.h
>>> +++ b/include/linux/mm.h
>>> @@ -601,7 +601,7 @@ struct vm_operations_struct {
>>>   #endif
>>>   	/*
>>>   	 * Called by vm_normal_page() for special PTEs to find the
>>> -	 * page for @addr.  This is useful if the default behavior
>>> +	 * page for @addr. This is useful if the default behavior
>>>   	 * (using pte_page()) would not find the correct page.
>>>   	 */
>>>   	struct page *(*find_special_page)(struct vm_area_struct *vma,
>>> @@ -2929,6 +2929,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
>>>   #define FOLL_NUMA	0x200	/* force NUMA hinting page fault */
>>>   #define FOLL_MIGRATION	0x400	/* wait for page to replace migration entry */
>>>   #define FOLL_TRIED	0x800	/* a retry, previous pass started an IO */
>>> +#define FOLL_LRU        0x1000  /* return only LRU (anon or page cache) */
>>>   #define FOLL_REMOTE	0x2000	/* we are working on non-current tsk/mm */
>>>   #define FOLL_COW	0x4000	/* internal GUP flag */
>>>   #define FOLL_ANON	0x8000	/* don't do file mappings */
>>> diff --git a/mm/gup.c b/mm/gup.c
>>> index 501bc150792c..c9cbac06bcc5 100644
>>> --- a/mm/gup.c
>>> +++ b/mm/gup.c
>>> @@ -479,6 +479,8 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
>>>   	}
>>>
>>>   	page = vm_normal_page(vma, address, pte);
>>> +	if ((flags & FOLL_LRU) && page && is_zone_device_page(page))
>>> +		page = NULL;
>>>   	if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
>>>   		/*
>>>   		 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index 910a138e9859..eed80696c5fd 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -2856,7 +2856,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
>>>   		}
>>>
>>>   		/* FOLL_DUMP to ignore special (like zero) pages */
>>> -		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
>>> +		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
>>>
>>>   		if (IS_ERR(page))
>>>   			continue;
>>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>>> index a4e5eaf3eb01..8bf4126b6b9c 100644
>>> --- a/mm/khugepaged.c
>>> +++ b/mm/khugepaged.c
>>> @@ -627,7 +627,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>>>   			goto out;
>>>   		}
>>>   		page = vm_normal_page(vma, address, pteval);
>>> -		if (unlikely(!page)) {
>>> +		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
>>>   			result = SCAN_PAGE_NULL;
>>>   			goto out;
>>>   		}
>>> @@ -1276,7 +1276,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>>>   			writable = true;
>>>
>>>   		page = vm_normal_page(vma, _address, pteval);
>>> -		if (unlikely(!page)) {
>>> +		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
>>>   			result = SCAN_PAGE_NULL;
>>>   			goto out_unmap;
>>>   		}
>>> @@ -1484,7 +1484,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
>>>   			goto abort;
>>>
>>>   		page = vm_normal_page(vma, addr, *pte);
>>> -
>>> +		if (WARN_ON_ONCE(page && is_zone_device_page(page)))
>>> +			page = NULL;
>>>   		/*
>>>   		 * Note that uprobe, debugger, or MAP_PRIVATE may change the
>>>   		 * page table, but the new page will not be a subpage of hpage.
>>> @@ -1502,6 +1503,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
>>>   		if (pte_none(*pte))
>>>   			continue;
>>>   		page = vm_normal_page(vma, addr, *pte);
>>> +		if (WARN_ON_ONCE(page && is_zone_device_page(page)))
>>> +			goto abort;
>>>   		page_remove_rmap(page, vma, false);
>>>   	}
>>>
>>> diff --git a/mm/ksm.c b/mm/ksm.c
>>> index 063a48eeb5ee..f16056efca21 100644
>>> --- a/mm/ksm.c
>>> +++ b/mm/ksm.c
>>> @@ -474,7 +474,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
>>>   	do {
>>>   		cond_resched();
>>>   		page = follow_page(vma, addr,
>>> -				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
>>> +				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE | FOLL_LRU);
>>>   		if (IS_ERR_OR_NULL(page))
>>>   			break;
>>>   		if (PageKsm(page))
>>> @@ -559,7 +559,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
>>>   	if (!vma)
>>>   		goto out;
>>>
>>> -	page = follow_page(vma, addr, FOLL_GET);
>>> +	page = follow_page(vma, addr, FOLL_GET | FOLL_LRU);
>>>   	if (IS_ERR_OR_NULL(page))
>>>   		goto out;
>>>   	if (PageAnon(page)) {
>>> @@ -2288,7 +2288,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
>>>   		while (ksm_scan.address < vma->vm_end) {
>>>   			if (ksm_test_exit(mm))
>>>   				break;
>>> -			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
>>> +			*page = follow_page(vma, ksm_scan.address, FOLL_GET | FOLL_LRU);
>>>   			if (IS_ERR_OR_NULL(*page)) {
>>>   				ksm_scan.address += PAGE_SIZE;
>>>   				cond_resched();
>>> diff --git a/mm/madvise.c b/mm/madvise.c
>>> index 1873616a37d2..e9c24c834e98 100644
>>> --- a/mm/madvise.c
>>> +++ b/mm/madvise.c
>>> @@ -413,7 +413,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>>>   			continue;
>>>
>>>   		page = vm_normal_page(vma, addr, ptent);
>>> -		if (!page)
>>> +		if (!page || is_zone_device_page(page))
>>>   			continue;
>>>
>>>   		/*
>>> @@ -628,7 +628,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
>>>   		}
>>>
>>>   		page = vm_normal_page(vma, addr, ptent);
>>> -		if (!page)
>>> +		if (!page || is_zone_device_page(page))
>>>   			continue;
>>>
>>>   		/*
>>> diff --git a/mm/memory.c b/mm/memory.c
>>> index 76e3af9639d9..571a26805ee1 100644
>>> --- a/mm/memory.c
>>> +++ b/mm/memory.c
>>> @@ -621,6 +621,13 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
>>>   		if (is_zero_pfn(pfn))
>>>   			return NULL;
>>>   		if (pte_devmap(pte))
>>> +/*
>>> + * NOTE: New users of ZONE_DEVICE will not set pte_devmap() and will have
>>> + * refcounts incremented on their struct pages when they are inserted into
>>> + * PTEs, thus they are safe to return here. Legacy ZONE_DEVICE pages that set
>>> + * pte_devmap() do not have refcounts. Example of legacy ZONE_DEVICE is
>>> + * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers.
>>> + */
>>>   			return NULL;
>>>
>>>   		print_bad_pte(vma, addr, pte, NULL);
>>> @@ -4422,7 +4429,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>>>   	pte = pte_modify(old_pte, vma->vm_page_prot);
>>>
>>>   	page = vm_normal_page(vma, vmf->address, pte);
>>> -	if (!page)
>>> +	if (!page || is_zone_device_page(page))
>>>   		goto out_map;
>>>
>>>   	/* TODO: handle PTE-mapped THP */
>>> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
>>> index 8c74107a2b15..e32edbecb0cd 100644
>>> --- a/mm/mempolicy.c
>>> +++ b/mm/mempolicy.c
>>> @@ -528,7 +528,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
>>>   		if (!pte_present(*pte))
>>>   			continue;
>>>   		page = vm_normal_page(vma, addr, *pte);
>>> -		if (!page)
>>> +		if (!page || is_zone_device_page(page))
>>>   			continue;
>>>   		/*
>>>   		 * vm_normal_page() filters out zero pages, but there might
>>> diff --git a/mm/migrate.c b/mm/migrate.c
>>> index 6c31ee1e1c9b..c5d50e96ecd7 100644
>>> --- a/mm/migrate.c
>>> +++ b/mm/migrate.c
>>> @@ -1611,7 +1611,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
>>>   		goto out;
>>>
>>>   	/* FOLL_DUMP to ignore special (like zero) pages */
>>> -	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
>>> +	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
>>>
>>>   	err = PTR_ERR(page);
>>>   	if (IS_ERR(page))
>>> @@ -1802,7 +1802,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
>>>   			goto set_status;
>>>
>>>   		/* FOLL_DUMP to ignore special (like zero) pages */
>>> -		page = follow_page(vma, addr, FOLL_DUMP);
>>> +		page = follow_page(vma, addr, FOLL_DUMP | FOLL_LRU);
>>>
>>>   		err = PTR_ERR(page);
>>>   		if (IS_ERR(page))
>>> diff --git a/mm/mlock.c b/mm/mlock.c
>>> index 716caf851043..b14e929084cc 100644
>>> --- a/mm/mlock.c
>>> +++ b/mm/mlock.c
>>> @@ -333,7 +333,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
>>>   		if (!pte_present(*pte))
>>>   			continue;
>>>   		page = vm_normal_page(vma, addr, *pte);
>>> -		if (!page)
>>> +		if (!page || is_zone_device_page(page))
>>>   			continue;
>>>   		if (PageTransCompound(page))
>>>   			continue;
>>> diff --git a/mm/mprotect.c b/mm/mprotect.c
>>> index b69ce7a7b2b7..a6f3587ea29a 100644
>>> --- a/mm/mprotect.c
>>> +++ b/mm/mprotect.c
>>> @@ -91,7 +91,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
>>>   					continue;
>>>
>>>   				page = vm_normal_page(vma, addr, oldpte);
>>> -				if (!page || PageKsm(page))
>>> +				if (!page || is_zone_device_page(page) || PageKsm(page))
>>>   					continue;
>>>
>>>   				/* Also skip shared copy-on-write pages */
Felix Kuehling May 27, 2022, 6:28 a.m. UTC | #4
Am 2022-05-25 um 00:11 schrieb Alistair Popple:
> Alex Sierra <alex.sierra@amd.com> writes:
>
>> With DEVICE_COHERENT, we'll soon have vm_normal_page() return
>> device-managed anonymous pages that are not LRU pages. Although they
>> behave like normal pages for purposes of mapping in CPU page tables and
>> for COW, they do not support LRU lists, NUMA migration or THP.
>>
>> We also introduced a FOLL_LRU flag that adds the same behaviour to
>> follow_page and related APIs, to allow callers to specify that they
>> expect to put pages on an LRU list.
> Continuing the follow up from the thread for v2:
>
>>> This means by default GUP can return non-LRU pages. I didn't see
>>> anywhere that would be a problem but I didn't check everything. Did you
>>> check this or is there some other reason I've missed that makes this not
>>> a problem?
>> I have double checked all gup and pin_user_pages callers and none of them seem
>> to have interaction with LRU APIs.
> And actually if I'm understanding things correctly callers of
> GUP/PUP/follow_page_pte() should already expect to get non-LRU pages
> returned:
>
>      page = vm_normal_page(vma, address, pte);
>      if ((flags & FOLL_LRU) && page && is_zone_device_page(page))
>          page = NULL;
>      if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
>          /*
>           * Only return device mapping pages in the FOLL_GET or FOLL_PIN
>           * case since they are only valid while holding the pgmap
>           * reference.
>           */
>          *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
>          if (*pgmap)
>              page = pte_page(pte);
>
> Which I think makes FOLL_LRU confusing, because if I understand correctly
> even with FOLL_LRU it is still possible for follow_page_pte() to return
> a non-LRU page. Could we do something like this to make it consistent:
>
>      if ((flags & FOLL_LRU) && (page && is_zone_device_page(page) ||
>          !page && pte_devmap(pte)))

This alone won't help if it still goes into the if (!page && 
pte_devmap(pte) ...) afterwards. I think what you're suggesting is:

+	if ((flags & FOLL_LRU) && (page && is_zone_device_page(page) ||
+				   !page && pte_devmap(pte)))
+		page = NULL;
-	if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
+	else if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {

Is that what you meant?

Regards,
   Felix


>
> Looking at callers that currently use FOLL_LRU I don't think this would
> change any behaviour as they already filter out devmap through various
> other means.
>
>> Signed-off-by: Alex Sierra <alex.sierra@amd.com>
>> Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
>> ---
>>   fs/proc/task_mmu.c | 2 +-
>>   include/linux/mm.h | 3 ++-
>>   mm/gup.c           | 2 ++
>>   mm/huge_memory.c   | 2 +-
>>   mm/khugepaged.c    | 9 ++++++---
>>   mm/ksm.c           | 6 +++---
>>   mm/madvise.c       | 4 ++--
>>   mm/memory.c        | 9 ++++++++-
>>   mm/mempolicy.c     | 2 +-
>>   mm/migrate.c       | 4 ++--
>>   mm/mlock.c         | 2 +-
>>   mm/mprotect.c      | 2 +-
>>   12 files changed, 30 insertions(+), 17 deletions(-)
>>
>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>> index f46060eb91b5..5d620733f173 100644
>> --- a/fs/proc/task_mmu.c
>> +++ b/fs/proc/task_mmu.c
>> @@ -1785,7 +1785,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
>>   		return NULL;
>>
>>   	page = vm_normal_page(vma, addr, pte);
>> -	if (!page)
>> +	if (!page || is_zone_device_page(page))
>>   		return NULL;
>>
>>   	if (PageReserved(page))
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index 9f44254af8ce..d7f253a0c41e 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -601,7 +601,7 @@ struct vm_operations_struct {
>>   #endif
>>   	/*
>>   	 * Called by vm_normal_page() for special PTEs to find the
>> -	 * page for @addr.  This is useful if the default behavior
>> +	 * page for @addr. This is useful if the default behavior
>>   	 * (using pte_page()) would not find the correct page.
>>   	 */
>>   	struct page *(*find_special_page)(struct vm_area_struct *vma,
>> @@ -2929,6 +2929,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
>>   #define FOLL_NUMA	0x200	/* force NUMA hinting page fault */
>>   #define FOLL_MIGRATION	0x400	/* wait for page to replace migration entry */
>>   #define FOLL_TRIED	0x800	/* a retry, previous pass started an IO */
>> +#define FOLL_LRU        0x1000  /* return only LRU (anon or page cache) */
>>   #define FOLL_REMOTE	0x2000	/* we are working on non-current tsk/mm */
>>   #define FOLL_COW	0x4000	/* internal GUP flag */
>>   #define FOLL_ANON	0x8000	/* don't do file mappings */
>> diff --git a/mm/gup.c b/mm/gup.c
>> index 501bc150792c..c9cbac06bcc5 100644
>> --- a/mm/gup.c
>> +++ b/mm/gup.c
>> @@ -479,6 +479,8 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
>>   	}
>>
>>   	page = vm_normal_page(vma, address, pte);
>> +	if ((flags & FOLL_LRU) && page && is_zone_device_page(page))
>> +		page = NULL;
>>   	if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
>>   		/*
>>   		 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 910a138e9859..eed80696c5fd 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -2856,7 +2856,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
>>   		}
>>
>>   		/* FOLL_DUMP to ignore special (like zero) pages */
>> -		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
>> +		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
>>
>>   		if (IS_ERR(page))
>>   			continue;
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index a4e5eaf3eb01..8bf4126b6b9c 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -627,7 +627,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>>   			goto out;
>>   		}
>>   		page = vm_normal_page(vma, address, pteval);
>> -		if (unlikely(!page)) {
>> +		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
>>   			result = SCAN_PAGE_NULL;
>>   			goto out;
>>   		}
>> @@ -1276,7 +1276,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>>   			writable = true;
>>
>>   		page = vm_normal_page(vma, _address, pteval);
>> -		if (unlikely(!page)) {
>> +		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
>>   			result = SCAN_PAGE_NULL;
>>   			goto out_unmap;
>>   		}
>> @@ -1484,7 +1484,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
>>   			goto abort;
>>
>>   		page = vm_normal_page(vma, addr, *pte);
>> -
>> +		if (WARN_ON_ONCE(page && is_zone_device_page(page)))
>> +			page = NULL;
>>   		/*
>>   		 * Note that uprobe, debugger, or MAP_PRIVATE may change the
>>   		 * page table, but the new page will not be a subpage of hpage.
>> @@ -1502,6 +1503,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
>>   		if (pte_none(*pte))
>>   			continue;
>>   		page = vm_normal_page(vma, addr, *pte);
>> +		if (WARN_ON_ONCE(page && is_zone_device_page(page)))
>> +			goto abort;
>>   		page_remove_rmap(page, vma, false);
>>   	}
>>
>> diff --git a/mm/ksm.c b/mm/ksm.c
>> index 063a48eeb5ee..f16056efca21 100644
>> --- a/mm/ksm.c
>> +++ b/mm/ksm.c
>> @@ -474,7 +474,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
>>   	do {
>>   		cond_resched();
>>   		page = follow_page(vma, addr,
>> -				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
>> +				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE | FOLL_LRU);
>>   		if (IS_ERR_OR_NULL(page))
>>   			break;
>>   		if (PageKsm(page))
>> @@ -559,7 +559,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
>>   	if (!vma)
>>   		goto out;
>>
>> -	page = follow_page(vma, addr, FOLL_GET);
>> +	page = follow_page(vma, addr, FOLL_GET | FOLL_LRU);
>>   	if (IS_ERR_OR_NULL(page))
>>   		goto out;
>>   	if (PageAnon(page)) {
>> @@ -2288,7 +2288,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
>>   		while (ksm_scan.address < vma->vm_end) {
>>   			if (ksm_test_exit(mm))
>>   				break;
>> -			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
>> +			*page = follow_page(vma, ksm_scan.address, FOLL_GET | FOLL_LRU);
>>   			if (IS_ERR_OR_NULL(*page)) {
>>   				ksm_scan.address += PAGE_SIZE;
>>   				cond_resched();
>> diff --git a/mm/madvise.c b/mm/madvise.c
>> index 1873616a37d2..e9c24c834e98 100644
>> --- a/mm/madvise.c
>> +++ b/mm/madvise.c
>> @@ -413,7 +413,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>>   			continue;
>>
>>   		page = vm_normal_page(vma, addr, ptent);
>> -		if (!page)
>> +		if (!page || is_zone_device_page(page))
>>   			continue;
>>
>>   		/*
>> @@ -628,7 +628,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
>>   		}
>>
>>   		page = vm_normal_page(vma, addr, ptent);
>> -		if (!page)
>> +		if (!page || is_zone_device_page(page))
>>   			continue;
>>
>>   		/*
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 76e3af9639d9..571a26805ee1 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -621,6 +621,13 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
>>   		if (is_zero_pfn(pfn))
>>   			return NULL;
>>   		if (pte_devmap(pte))
>> +/*
>> + * NOTE: New users of ZONE_DEVICE will not set pte_devmap() and will have
>> + * refcounts incremented on their struct pages when they are inserted into
>> + * PTEs, thus they are safe to return here. Legacy ZONE_DEVICE pages that set
>> + * pte_devmap() do not have refcounts. Example of legacy ZONE_DEVICE is
>> + * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers.
>> + */
>>   			return NULL;
>>
>>   		print_bad_pte(vma, addr, pte, NULL);
>> @@ -4422,7 +4429,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>>   	pte = pte_modify(old_pte, vma->vm_page_prot);
>>
>>   	page = vm_normal_page(vma, vmf->address, pte);
>> -	if (!page)
>> +	if (!page || is_zone_device_page(page))
>>   		goto out_map;
>>
>>   	/* TODO: handle PTE-mapped THP */
>> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
>> index 8c74107a2b15..e32edbecb0cd 100644
>> --- a/mm/mempolicy.c
>> +++ b/mm/mempolicy.c
>> @@ -528,7 +528,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
>>   		if (!pte_present(*pte))
>>   			continue;
>>   		page = vm_normal_page(vma, addr, *pte);
>> -		if (!page)
>> +		if (!page || is_zone_device_page(page))
>>   			continue;
>>   		/*
>>   		 * vm_normal_page() filters out zero pages, but there might
>> diff --git a/mm/migrate.c b/mm/migrate.c
>> index 6c31ee1e1c9b..c5d50e96ecd7 100644
>> --- a/mm/migrate.c
>> +++ b/mm/migrate.c
>> @@ -1611,7 +1611,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
>>   		goto out;
>>
>>   	/* FOLL_DUMP to ignore special (like zero) pages */
>> -	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
>> +	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
>>
>>   	err = PTR_ERR(page);
>>   	if (IS_ERR(page))
>> @@ -1802,7 +1802,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
>>   			goto set_status;
>>
>>   		/* FOLL_DUMP to ignore special (like zero) pages */
>> -		page = follow_page(vma, addr, FOLL_DUMP);
>> +		page = follow_page(vma, addr, FOLL_DUMP | FOLL_LRU);
>>
>>   		err = PTR_ERR(page);
>>   		if (IS_ERR(page))
>> diff --git a/mm/mlock.c b/mm/mlock.c
>> index 716caf851043..b14e929084cc 100644
>> --- a/mm/mlock.c
>> +++ b/mm/mlock.c
>> @@ -333,7 +333,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
>>   		if (!pte_present(*pte))
>>   			continue;
>>   		page = vm_normal_page(vma, addr, *pte);
>> -		if (!page)
>> +		if (!page || is_zone_device_page(page))
>>   			continue;
>>   		if (PageTransCompound(page))
>>   			continue;
>> diff --git a/mm/mprotect.c b/mm/mprotect.c
>> index b69ce7a7b2b7..a6f3587ea29a 100644
>> --- a/mm/mprotect.c
>> +++ b/mm/mprotect.c
>> @@ -91,7 +91,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
>>   					continue;
>>
>>   				page = vm_normal_page(vma, addr, oldpte);
>> -				if (!page || PageKsm(page))
>> +				if (!page || is_zone_device_page(page) || PageKsm(page))
>>   					continue;
>>
>>   				/* Also skip shared copy-on-write pages */
Alistair Popple May 27, 2022, 8:02 a.m. UTC | #5
Felix Kuehling <felix.kuehling@amd.com> writes:

> Am 2022-05-25 um 00:11 schrieb Alistair Popple:
>> Alex Sierra <alex.sierra@amd.com> writes:
>>
>>> With DEVICE_COHERENT, we'll soon have vm_normal_page() return
>>> device-managed anonymous pages that are not LRU pages. Although they
>>> behave like normal pages for purposes of mapping in CPU page tables and
>>> for COW, they do not support LRU lists, NUMA migration or THP.
>>>
>>> We also introduced a FOLL_LRU flag that adds the same behaviour to
>>> follow_page and related APIs, to allow callers to specify that they
>>> expect to put pages on an LRU list.
>> Continuing the follow up from the thread for v2:
>>
>>>> This means by default GUP can return non-LRU pages. I didn't see
>>>> anywhere that would be a problem but I didn't check everything. Did you
>>>> check this or is there some other reason I've missed that makes this not
>>>> a problem?
>>> I have double checked all gup and pin_user_pages callers and none of them seem
>>> to have interaction with LRU APIs.
>> And actually if I'm understanding things correctly callers of
>> GUP/PUP/follow_page_pte() should already expect to get non-LRU pages
>> returned:
>>
>>      page = vm_normal_page(vma, address, pte);
>>      if ((flags & FOLL_LRU) && page && is_zone_device_page(page))
>>          page = NULL;
>>      if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
>>          /*
>>           * Only return device mapping pages in the FOLL_GET or FOLL_PIN
>>           * case since they are only valid while holding the pgmap
>>           * reference.
>>           */
>>          *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
>>          if (*pgmap)
>>              page = pte_page(pte);
>>
>> Which I think makes FOLL_LRU confusing, because if I understand correctly
>> even with FOLL_LRU it is still possible for follow_page_pte() to return
>> a non-LRU page. Could we do something like this to make it consistent:
>>
>>      if ((flags & FOLL_LRU) && (page && is_zone_device_page(page) ||
>>          !page && pte_devmap(pte)))
>
> This alone won't help if it still goes into the if (!page && pte_devmap(pte)
> ...) afterwards. I think what you're suggesting is:
>
> +	if ((flags & FOLL_LRU) && (page && is_zone_device_page(page) ||
> +				   !page && pte_devmap(pte)))
> +		page = NULL;
> -	if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
> +	else if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
>
> Is that what you meant?

Oh, my bad. Yes, that is what I meant, although as Alex pointed out we
should goto no_page as well. However, we also need to fix up the return
code, because returning NULL will cause GUP to try to fault the page in
when it possibly already exists. So I think something like this should
work:

      page = vm_normal_page(vma, address, pte);
      if ((flags & FOLL_LRU) && (page && is_zone_device_page(page) ||
          !page && pte_devmap(pte))) {
          page = ERR_PTR(-EEXIST);
          goto out;
      } else if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
          /*
           * Only return device mapping pages in the FOLL_GET or FOLL_PIN
           * case since they are only valid while holding the pgmap
           * reference.
           */
          *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
          if (*pgmap)
              page = pte_page(pte);

> Regards,
>   Felix
>
>
>>
>> Looking at callers that currently use FOLL_LRU I don't think this would
>> change any behaviour as they already filter out devmap through various
>> other means.
>>
>>> Signed-off-by: Alex Sierra <alex.sierra@amd.com>
>>> Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>> ---
>>>   fs/proc/task_mmu.c | 2 +-
>>>   include/linux/mm.h | 3 ++-
>>>   mm/gup.c           | 2 ++
>>>   mm/huge_memory.c   | 2 +-
>>>   mm/khugepaged.c    | 9 ++++++---
>>>   mm/ksm.c           | 6 +++---
>>>   mm/madvise.c       | 4 ++--
>>>   mm/memory.c        | 9 ++++++++-
>>>   mm/mempolicy.c     | 2 +-
>>>   mm/migrate.c       | 4 ++--
>>>   mm/mlock.c         | 2 +-
>>>   mm/mprotect.c      | 2 +-
>>>   12 files changed, 30 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>>> index f46060eb91b5..5d620733f173 100644
>>> --- a/fs/proc/task_mmu.c
>>> +++ b/fs/proc/task_mmu.c
>>> @@ -1785,7 +1785,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
>>>   		return NULL;
>>>
>>>   	page = vm_normal_page(vma, addr, pte);
>>> -	if (!page)
>>> +	if (!page || is_zone_device_page(page))
>>>   		return NULL;
>>>
>>>   	if (PageReserved(page))
>>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>>> index 9f44254af8ce..d7f253a0c41e 100644
>>> --- a/include/linux/mm.h
>>> +++ b/include/linux/mm.h
>>> @@ -601,7 +601,7 @@ struct vm_operations_struct {
>>>   #endif
>>>   	/*
>>>   	 * Called by vm_normal_page() for special PTEs to find the
>>> -	 * page for @addr.  This is useful if the default behavior
>>> +	 * page for @addr. This is useful if the default behavior
>>>   	 * (using pte_page()) would not find the correct page.
>>>   	 */
>>>   	struct page *(*find_special_page)(struct vm_area_struct *vma,
>>> @@ -2929,6 +2929,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
>>>   #define FOLL_NUMA	0x200	/* force NUMA hinting page fault */
>>>   #define FOLL_MIGRATION	0x400	/* wait for page to replace migration entry */
>>>   #define FOLL_TRIED	0x800	/* a retry, previous pass started an IO */
>>> +#define FOLL_LRU        0x1000  /* return only LRU (anon or page cache) */
>>>   #define FOLL_REMOTE	0x2000	/* we are working on non-current tsk/mm */
>>>   #define FOLL_COW	0x4000	/* internal GUP flag */
>>>   #define FOLL_ANON	0x8000	/* don't do file mappings */
>>> diff --git a/mm/gup.c b/mm/gup.c
>>> index 501bc150792c..c9cbac06bcc5 100644
>>> --- a/mm/gup.c
>>> +++ b/mm/gup.c
>>> @@ -479,6 +479,8 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
>>>   	}
>>>
>>>   	page = vm_normal_page(vma, address, pte);
>>> +	if ((flags & FOLL_LRU) && page && is_zone_device_page(page))
>>> +		page = NULL;
>>>   	if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
>>>   		/*
>>>   		 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index 910a138e9859..eed80696c5fd 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -2856,7 +2856,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
>>>   		}
>>>
>>>   		/* FOLL_DUMP to ignore special (like zero) pages */
>>> -		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
>>> +		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
>>>
>>>   		if (IS_ERR(page))
>>>   			continue;
>>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>>> index a4e5eaf3eb01..8bf4126b6b9c 100644
>>> --- a/mm/khugepaged.c
>>> +++ b/mm/khugepaged.c
>>> @@ -627,7 +627,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>>>   			goto out;
>>>   		}
>>>   		page = vm_normal_page(vma, address, pteval);
>>> -		if (unlikely(!page)) {
>>> +		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
>>>   			result = SCAN_PAGE_NULL;
>>>   			goto out;
>>>   		}
>>> @@ -1276,7 +1276,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>>>   			writable = true;
>>>
>>>   		page = vm_normal_page(vma, _address, pteval);
>>> -		if (unlikely(!page)) {
>>> +		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
>>>   			result = SCAN_PAGE_NULL;
>>>   			goto out_unmap;
>>>   		}
>>> @@ -1484,7 +1484,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
>>>   			goto abort;
>>>
>>>   		page = vm_normal_page(vma, addr, *pte);
>>> -
>>> +		if (WARN_ON_ONCE(page && is_zone_device_page(page)))
>>> +			page = NULL;
>>>   		/*
>>>   		 * Note that uprobe, debugger, or MAP_PRIVATE may change the
>>>   		 * page table, but the new page will not be a subpage of hpage.
>>> @@ -1502,6 +1503,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
>>>   		if (pte_none(*pte))
>>>   			continue;
>>>   		page = vm_normal_page(vma, addr, *pte);
>>> +		if (WARN_ON_ONCE(page && is_zone_device_page(page)))
>>> +			goto abort;
>>>   		page_remove_rmap(page, vma, false);
>>>   	}
>>>
>>> diff --git a/mm/ksm.c b/mm/ksm.c
>>> index 063a48eeb5ee..f16056efca21 100644
>>> --- a/mm/ksm.c
>>> +++ b/mm/ksm.c
>>> @@ -474,7 +474,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
>>>   	do {
>>>   		cond_resched();
>>>   		page = follow_page(vma, addr,
>>> -				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
>>> +				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE | FOLL_LRU);
>>>   		if (IS_ERR_OR_NULL(page))
>>>   			break;
>>>   		if (PageKsm(page))
>>> @@ -559,7 +559,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
>>>   	if (!vma)
>>>   		goto out;
>>>
>>> -	page = follow_page(vma, addr, FOLL_GET);
>>> +	page = follow_page(vma, addr, FOLL_GET | FOLL_LRU);
>>>   	if (IS_ERR_OR_NULL(page))
>>>   		goto out;
>>>   	if (PageAnon(page)) {
>>> @@ -2288,7 +2288,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
>>>   		while (ksm_scan.address < vma->vm_end) {
>>>   			if (ksm_test_exit(mm))
>>>   				break;
>>> -			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
>>> +			*page = follow_page(vma, ksm_scan.address, FOLL_GET | FOLL_LRU);
>>>   			if (IS_ERR_OR_NULL(*page)) {
>>>   				ksm_scan.address += PAGE_SIZE;
>>>   				cond_resched();
>>> diff --git a/mm/madvise.c b/mm/madvise.c
>>> index 1873616a37d2..e9c24c834e98 100644
>>> --- a/mm/madvise.c
>>> +++ b/mm/madvise.c
>>> @@ -413,7 +413,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>>>   			continue;
>>>
>>>   		page = vm_normal_page(vma, addr, ptent);
>>> -		if (!page)
>>> +		if (!page || is_zone_device_page(page))
>>>   			continue;
>>>
>>>   		/*
>>> @@ -628,7 +628,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
>>>   		}
>>>
>>>   		page = vm_normal_page(vma, addr, ptent);
>>> -		if (!page)
>>> +		if (!page || is_zone_device_page(page))
>>>   			continue;
>>>
>>>   		/*
>>> diff --git a/mm/memory.c b/mm/memory.c
>>> index 76e3af9639d9..571a26805ee1 100644
>>> --- a/mm/memory.c
>>> +++ b/mm/memory.c
>>> @@ -621,6 +621,13 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
>>>   		if (is_zero_pfn(pfn))
>>>   			return NULL;
>>>   		if (pte_devmap(pte))
>>> +/*
>>> + * NOTE: New users of ZONE_DEVICE will not set pte_devmap() and will have
>>> + * refcounts incremented on their struct pages when they are inserted into
>>> + * PTEs, thus they are safe to return here. Legacy ZONE_DEVICE pages that set
>>> + * pte_devmap() do not have refcounts. Example of legacy ZONE_DEVICE is
>>> + * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers.
>>> + */
>>>   			return NULL;
>>>
>>>   		print_bad_pte(vma, addr, pte, NULL);
>>> @@ -4422,7 +4429,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>>>   	pte = pte_modify(old_pte, vma->vm_page_prot);
>>>
>>>   	page = vm_normal_page(vma, vmf->address, pte);
>>> -	if (!page)
>>> +	if (!page || is_zone_device_page(page))
>>>   		goto out_map;
>>>
>>>   	/* TODO: handle PTE-mapped THP */
>>> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
>>> index 8c74107a2b15..e32edbecb0cd 100644
>>> --- a/mm/mempolicy.c
>>> +++ b/mm/mempolicy.c
>>> @@ -528,7 +528,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
>>>   		if (!pte_present(*pte))
>>>   			continue;
>>>   		page = vm_normal_page(vma, addr, *pte);
>>> -		if (!page)
>>> +		if (!page || is_zone_device_page(page))
>>>   			continue;
>>>   		/*
>>>   		 * vm_normal_page() filters out zero pages, but there might
>>> diff --git a/mm/migrate.c b/mm/migrate.c
>>> index 6c31ee1e1c9b..c5d50e96ecd7 100644
>>> --- a/mm/migrate.c
>>> +++ b/mm/migrate.c
>>> @@ -1611,7 +1611,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
>>>   		goto out;
>>>
>>>   	/* FOLL_DUMP to ignore special (like zero) pages */
>>> -	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
>>> +	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
>>>
>>>   	err = PTR_ERR(page);
>>>   	if (IS_ERR(page))
>>> @@ -1802,7 +1802,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
>>>   			goto set_status;
>>>
>>>   		/* FOLL_DUMP to ignore special (like zero) pages */
>>> -		page = follow_page(vma, addr, FOLL_DUMP);
>>> +		page = follow_page(vma, addr, FOLL_DUMP | FOLL_LRU);
>>>
>>>   		err = PTR_ERR(page);
>>>   		if (IS_ERR(page))
>>> diff --git a/mm/mlock.c b/mm/mlock.c
>>> index 716caf851043..b14e929084cc 100644
>>> --- a/mm/mlock.c
>>> +++ b/mm/mlock.c
>>> @@ -333,7 +333,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
>>>   		if (!pte_present(*pte))
>>>   			continue;
>>>   		page = vm_normal_page(vma, addr, *pte);
>>> -		if (!page)
>>> +		if (!page || is_zone_device_page(page))
>>>   			continue;
>>>   		if (PageTransCompound(page))
>>>   			continue;
>>> diff --git a/mm/mprotect.c b/mm/mprotect.c
>>> index b69ce7a7b2b7..a6f3587ea29a 100644
>>> --- a/mm/mprotect.c
>>> +++ b/mm/mprotect.c
>>> @@ -91,7 +91,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
>>>   					continue;
>>>
>>>   				page = vm_normal_page(vma, addr, oldpte);
>>> -				if (!page || PageKsm(page))
>>> +				if (!page || is_zone_device_page(page) || PageKsm(page))
>>>   					continue;
>>>
>>>   				/* Also skip shared copy-on-write pages */
diff mbox series

Patch

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f46060eb91b5..5d620733f173 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1785,7 +1785,7 @@  static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
 		return NULL;
 
 	page = vm_normal_page(vma, addr, pte);
-	if (!page)
+	if (!page || is_zone_device_page(page))
 		return NULL;
 
 	if (PageReserved(page))
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9f44254af8ce..d7f253a0c41e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -601,7 +601,7 @@  struct vm_operations_struct {
 #endif
 	/*
 	 * Called by vm_normal_page() for special PTEs to find the
-	 * page for @addr.  This is useful if the default behavior
+	 * page for @addr. This is useful if the default behavior
 	 * (using pte_page()) would not find the correct page.
 	 */
 	struct page *(*find_special_page)(struct vm_area_struct *vma,
@@ -2929,6 +2929,7 @@  struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 #define FOLL_NUMA	0x200	/* force NUMA hinting page fault */
 #define FOLL_MIGRATION	0x400	/* wait for page to replace migration entry */
 #define FOLL_TRIED	0x800	/* a retry, previous pass started an IO */
+#define FOLL_LRU        0x1000  /* return only LRU (anon or page cache) */
 #define FOLL_REMOTE	0x2000	/* we are working on non-current tsk/mm */
 #define FOLL_COW	0x4000	/* internal GUP flag */
 #define FOLL_ANON	0x8000	/* don't do file mappings */
diff --git a/mm/gup.c b/mm/gup.c
index 501bc150792c..c9cbac06bcc5 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -479,6 +479,8 @@  static struct page *follow_page_pte(struct vm_area_struct *vma,
 	}
 
 	page = vm_normal_page(vma, address, pte);
+	if ((flags & FOLL_LRU) && page && is_zone_device_page(page))
+		page = NULL;
 	if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
 		/*
 		 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 910a138e9859..eed80696c5fd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2856,7 +2856,7 @@  static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
 		}
 
 		/* FOLL_DUMP to ignore special (like zero) pages */
-		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
+		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
 
 		if (IS_ERR(page))
 			continue;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a4e5eaf3eb01..8bf4126b6b9c 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -627,7 +627,7 @@  static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 			goto out;
 		}
 		page = vm_normal_page(vma, address, pteval);
-		if (unlikely(!page)) {
+		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
 			result = SCAN_PAGE_NULL;
 			goto out;
 		}
@@ -1276,7 +1276,7 @@  static int khugepaged_scan_pmd(struct mm_struct *mm,
 			writable = true;
 
 		page = vm_normal_page(vma, _address, pteval);
-		if (unlikely(!page)) {
+		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
 			result = SCAN_PAGE_NULL;
 			goto out_unmap;
 		}
@@ -1484,7 +1484,8 @@  void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 			goto abort;
 
 		page = vm_normal_page(vma, addr, *pte);
-
+		if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+			page = NULL;
 		/*
 		 * Note that uprobe, debugger, or MAP_PRIVATE may change the
 		 * page table, but the new page will not be a subpage of hpage.
@@ -1502,6 +1503,8 @@  void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 		if (pte_none(*pte))
 			continue;
 		page = vm_normal_page(vma, addr, *pte);
+		if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+			goto abort;
 		page_remove_rmap(page, vma, false);
 	}
 
diff --git a/mm/ksm.c b/mm/ksm.c
index 063a48eeb5ee..f16056efca21 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -474,7 +474,7 @@  static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
 	do {
 		cond_resched();
 		page = follow_page(vma, addr,
-				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
+				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE | FOLL_LRU);
 		if (IS_ERR_OR_NULL(page))
 			break;
 		if (PageKsm(page))
@@ -559,7 +559,7 @@  static struct page *get_mergeable_page(struct rmap_item *rmap_item)
 	if (!vma)
 		goto out;
 
-	page = follow_page(vma, addr, FOLL_GET);
+	page = follow_page(vma, addr, FOLL_GET | FOLL_LRU);
 	if (IS_ERR_OR_NULL(page))
 		goto out;
 	if (PageAnon(page)) {
@@ -2288,7 +2288,7 @@  static struct rmap_item *scan_get_next_rmap_item(struct page **page)
 		while (ksm_scan.address < vma->vm_end) {
 			if (ksm_test_exit(mm))
 				break;
-			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
+			*page = follow_page(vma, ksm_scan.address, FOLL_GET | FOLL_LRU);
 			if (IS_ERR_OR_NULL(*page)) {
 				ksm_scan.address += PAGE_SIZE;
 				cond_resched();
diff --git a/mm/madvise.c b/mm/madvise.c
index 1873616a37d2..e9c24c834e98 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -413,7 +413,7 @@  static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 			continue;
 
 		page = vm_normal_page(vma, addr, ptent);
-		if (!page)
+		if (!page || is_zone_device_page(page))
 			continue;
 
 		/*
@@ -628,7 +628,7 @@  static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 		}
 
 		page = vm_normal_page(vma, addr, ptent);
-		if (!page)
+		if (!page || is_zone_device_page(page))
 			continue;
 
 		/*
diff --git a/mm/memory.c b/mm/memory.c
index 76e3af9639d9..571a26805ee1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -621,6 +621,13 @@  struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 		if (is_zero_pfn(pfn))
 			return NULL;
 		if (pte_devmap(pte))
+/*
+ * NOTE: New users of ZONE_DEVICE will not set pte_devmap() and will have
+ * refcounts incremented on their struct pages when they are inserted into
+ * PTEs, thus they are safe to return here. Legacy ZONE_DEVICE pages that set
+ * pte_devmap() do not have refcounts. An example of legacy ZONE_DEVICE is
+ * the MEMORY_DEVICE_FS_DAX type in the pmem or virtio_fs drivers.
+ */
 			return NULL;
 
 		print_bad_pte(vma, addr, pte, NULL);
@@ -4422,7 +4429,7 @@  static vm_fault_t do_numa_page(struct vm_fault *vmf)
 	pte = pte_modify(old_pte, vma->vm_page_prot);
 
 	page = vm_normal_page(vma, vmf->address, pte);
-	if (!page)
+	if (!page || is_zone_device_page(page))
 		goto out_map;
 
 	/* TODO: handle PTE-mapped THP */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8c74107a2b15..e32edbecb0cd 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -528,7 +528,7 @@  static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 		if (!pte_present(*pte))
 			continue;
 		page = vm_normal_page(vma, addr, *pte);
-		if (!page)
+		if (!page || is_zone_device_page(page))
 			continue;
 		/*
 		 * vm_normal_page() filters out zero pages, but there might
diff --git a/mm/migrate.c b/mm/migrate.c
index 6c31ee1e1c9b..c5d50e96ecd7 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1611,7 +1611,7 @@  static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
 		goto out;
 
 	/* FOLL_DUMP to ignore special (like zero) pages */
-	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
+	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
 
 	err = PTR_ERR(page);
 	if (IS_ERR(page))
@@ -1802,7 +1802,7 @@  static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
 			goto set_status;
 
 		/* FOLL_DUMP to ignore special (like zero) pages */
-		page = follow_page(vma, addr, FOLL_DUMP);
+		page = follow_page(vma, addr, FOLL_DUMP | FOLL_LRU);
 
 		err = PTR_ERR(page);
 		if (IS_ERR(page))
diff --git a/mm/mlock.c b/mm/mlock.c
index 716caf851043..b14e929084cc 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -333,7 +333,7 @@  static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
 		if (!pte_present(*pte))
 			continue;
 		page = vm_normal_page(vma, addr, *pte);
-		if (!page)
+		if (!page || is_zone_device_page(page))
 			continue;
 		if (PageTransCompound(page))
 			continue;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index b69ce7a7b2b7..a6f3587ea29a 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -91,7 +91,7 @@  static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 					continue;
 
 				page = vm_normal_page(vma, addr, oldpte);
-				if (!page || PageKsm(page))
+				if (!page || is_zone_device_page(page) || PageKsm(page))
 					continue;
 
 				/* Also skip shared copy-on-write pages */