[v5,2/2] mm: fix the handling Non-LRU pages returned by follow_page

Message ID 20220815070240.470469-3-haiyue.wang@intel.com (mailing list archive)
State Superseded
Series fix follow_page related issues

Commit Message

Haiyue Wang Aug. 15, 2022, 7:02 a.m. UTC
When follow_page() returns a Non-LRU page, the current handling jumps out
directly without calling put_page() to drop the reference count, even though
follow_page() was called with the 'FOLL_GET' flag and therefore took a
reference via get_page(). Fix the zone device page checks by handling the
page reference count correctly before returning.

And as David pointed out in review, "device pages are never PageKsm pages".
Drop the zone device page check from break_ksm().

Fixes: 3218f8712d6b ("mm: handling Non-LRU pages returned by vm_normal_pages")
Signed-off-by: Haiyue Wang <haiyue.wang@intel.com>
---
 mm/huge_memory.c |  4 ++--
 mm/ksm.c         | 12 +++++++++---
 mm/migrate.c     | 10 +++++++---
 3 files changed, 18 insertions(+), 8 deletions(-)
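
In short: with FOLL_GET, every successful follow_page() call takes a
reference on the returned page via get_page(), so every early-exit path must
pair it with put_page(). A minimal sketch of the corrected pattern (the
surrounding scan loop is illustrative, not taken verbatim from any one call
site):

	page = follow_page(vma, addr, FOLL_GET);
	if (IS_ERR_OR_NULL(page))	/* error or no page: no reference was taken */
		continue;

	if (is_zone_device_page(page)) {
		/* FOLL_GET pinned the page, so it must be released */
		put_page(page);
		continue;
	}

	/* ... work on the LRU page ... */
	put_page(page);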

Comments

Huang, Ying Aug. 15, 2022, 7:50 a.m. UTC | #1
Haiyue Wang <haiyue.wang@intel.com> writes:

> When follow_page() returns a Non-LRU page, the current handling jumps out
> directly without calling put_page() to drop the reference count, even though
> follow_page() was called with the 'FOLL_GET' flag and therefore took a
> reference via get_page(). Fix the zone device page checks by handling the
> page reference count correctly before returning.
>
> And as David pointed out in review, "device pages are never PageKsm pages".
> Drop the zone device page check from break_ksm().
>
> Fixes: 3218f8712d6b ("mm: handling Non-LRU pages returned by vm_normal_pages")
> Signed-off-by: Haiyue Wang <haiyue.wang@intel.com>

LGTM, Thanks!

Reviewed-by: "Huang, Ying" <ying.huang@intel.com>

> ---
>  mm/huge_memory.c |  4 ++--
>  mm/ksm.c         | 12 +++++++++---
>  mm/migrate.c     | 10 +++++++---
>  3 files changed, 18 insertions(+), 8 deletions(-)
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 8a7c1b344abe..b2ba17c3dcd7 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2963,10 +2963,10 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
>  		/* FOLL_DUMP to ignore special (like zero) pages */
>  		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
>  
> -		if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
> +		if (IS_ERR_OR_NULL(page))
>  			continue;
>  
> -		if (!is_transparent_hugepage(page))
> +		if (is_zone_device_page(page) || !is_transparent_hugepage(page))
>  			goto next;
>  
>  		total++;
> diff --git a/mm/ksm.c b/mm/ksm.c
> index 42ab153335a2..e26f57fc1f0e 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -475,7 +475,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
>  		cond_resched();
>  		page = follow_page(vma, addr,
>  				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
> -		if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
> +		if (IS_ERR_OR_NULL(page))
>  			break;
>  		if (PageKsm(page))
>  			ret = handle_mm_fault(vma, addr,
> @@ -560,12 +560,15 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
>  		goto out;
>  
>  	page = follow_page(vma, addr, FOLL_GET);
> -	if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
> +	if (IS_ERR_OR_NULL(page))
>  		goto out;
> +	if (is_zone_device_page(page))
> +		goto out_putpage;
>  	if (PageAnon(page)) {
>  		flush_anon_page(vma, page, addr);
>  		flush_dcache_page(page);
>  	} else {
> +out_putpage:
>  		put_page(page);
>  out:
>  		page = NULL;
> @@ -2308,11 +2311,13 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
>  			if (ksm_test_exit(mm))
>  				break;
>  			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
> -			if (IS_ERR_OR_NULL(*page) || is_zone_device_page(*page)) {
> +			if (IS_ERR_OR_NULL(*page)) {
>  				ksm_scan.address += PAGE_SIZE;
>  				cond_resched();
>  				continue;
>  			}
> +			if (is_zone_device_page(*page))
> +				goto next_page;
>  			if (PageAnon(*page)) {
>  				flush_anon_page(vma, *page, ksm_scan.address);
>  				flush_dcache_page(*page);
> @@ -2327,6 +2332,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
>  				mmap_read_unlock(mm);
>  				return rmap_item;
>  			}
> +next_page:
>  			put_page(*page);
>  			ksm_scan.address += PAGE_SIZE;
>  			cond_resched();
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 581dfaad9257..fee12cd2f294 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -1672,9 +1672,12 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
>  		goto out;
>  
>  	err = -ENOENT;
> -	if (!page || is_zone_device_page(page))
> +	if (!page)
>  		goto out;
>  
> +	if (is_zone_device_page(page))
> +		goto out_putpage;
> +
>  	err = 0;
>  	if (page_to_nid(page) == node)
>  		goto out_putpage;
> @@ -1868,8 +1871,9 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
>  		if (IS_ERR(page))
>  			goto set_status;
>  
> -		if (page && !is_zone_device_page(page)) {
> -			err = page_to_nid(page);
> +		if (page) {
> +			err = !is_zone_device_page(page) ? page_to_nid(page)
> +							 : -ENOENT;
>  			if (foll_flags & FOLL_GET)
>  				put_page(page);
>  		} else {
Felix Kuehling Aug. 15, 2022, 2:28 p.m. UTC | #2
On 2022-08-15 at 03:02, Haiyue Wang wrote:
> When follow_page() returns a Non-LRU page, the current handling jumps out
> directly without calling put_page() to drop the reference count, even though
> follow_page() was called with the 'FOLL_GET' flag and therefore took a
> reference via get_page(). Fix the zone device page checks by handling the
> page reference count correctly before returning.
>
> And as David pointed out in review, "device pages are never PageKsm pages".
> Drop the zone device page check from break_ksm().
>
> Fixes: 3218f8712d6b ("mm: handling Non-LRU pages returned by vm_normal_pages")
> Signed-off-by: Haiyue Wang <haiyue.wang@intel.com>

Thank you for catching this. The patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> ---
>   mm/huge_memory.c |  4 ++--
>   mm/ksm.c         | 12 +++++++++---
>   mm/migrate.c     | 10 +++++++---
>   3 files changed, 18 insertions(+), 8 deletions(-)
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 8a7c1b344abe..b2ba17c3dcd7 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2963,10 +2963,10 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
>   		/* FOLL_DUMP to ignore special (like zero) pages */
>   		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
>   
> -		if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
> +		if (IS_ERR_OR_NULL(page))
>   			continue;
>   
> -		if (!is_transparent_hugepage(page))
> +		if (is_zone_device_page(page) || !is_transparent_hugepage(page))
>   			goto next;
>   
>   		total++;
> diff --git a/mm/ksm.c b/mm/ksm.c
> index 42ab153335a2..e26f57fc1f0e 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -475,7 +475,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
>   		cond_resched();
>   		page = follow_page(vma, addr,
>   				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
> -		if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
> +		if (IS_ERR_OR_NULL(page))
>   			break;
>   		if (PageKsm(page))
>   			ret = handle_mm_fault(vma, addr,
> @@ -560,12 +560,15 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
>   		goto out;
>   
>   	page = follow_page(vma, addr, FOLL_GET);
> -	if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
> +	if (IS_ERR_OR_NULL(page))
>   		goto out;
> +	if (is_zone_device_page(page))
> +		goto out_putpage;
>   	if (PageAnon(page)) {
>   		flush_anon_page(vma, page, addr);
>   		flush_dcache_page(page);
>   	} else {
> +out_putpage:
>   		put_page(page);
>   out:
>   		page = NULL;
> @@ -2308,11 +2311,13 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
>   			if (ksm_test_exit(mm))
>   				break;
>   			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
> -			if (IS_ERR_OR_NULL(*page) || is_zone_device_page(*page)) {
> +			if (IS_ERR_OR_NULL(*page)) {
>   				ksm_scan.address += PAGE_SIZE;
>   				cond_resched();
>   				continue;
>   			}
> +			if (is_zone_device_page(*page))
> +				goto next_page;
>   			if (PageAnon(*page)) {
>   				flush_anon_page(vma, *page, ksm_scan.address);
>   				flush_dcache_page(*page);
> @@ -2327,6 +2332,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
>   				mmap_read_unlock(mm);
>   				return rmap_item;
>   			}
> +next_page:
>   			put_page(*page);
>   			ksm_scan.address += PAGE_SIZE;
>   			cond_resched();
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 581dfaad9257..fee12cd2f294 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -1672,9 +1672,12 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
>   		goto out;
>   
>   	err = -ENOENT;
> -	if (!page || is_zone_device_page(page))
> +	if (!page)
>   		goto out;
>   
> +	if (is_zone_device_page(page))
> +		goto out_putpage;
> +
>   	err = 0;
>   	if (page_to_nid(page) == node)
>   		goto out_putpage;
> @@ -1868,8 +1871,9 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
>   		if (IS_ERR(page))
>   			goto set_status;
>   
> -		if (page && !is_zone_device_page(page)) {
> -			err = page_to_nid(page);
> +		if (page) {
> +			err = !is_zone_device_page(page) ? page_to_nid(page)
> +							 : -ENOENT;
>   			if (foll_flags & FOLL_GET)
>   				put_page(page);
>   		} else {
Alistair Popple Aug. 16, 2022, midnight UTC | #3
Haiyue Wang <haiyue.wang@intel.com> writes:

> When follow_page() returns a Non-LRU page, the current handling jumps out
> directly without calling put_page() to drop the reference count, even though
> follow_page() was called with the 'FOLL_GET' flag and therefore took a
> reference via get_page(). Fix the zone device page checks by handling the
> page reference count correctly before returning.
>
> And as David pointed out in review, "device pages are never PageKsm pages".
> Drop the zone device page check from break_ksm().
>
> Fixes: 3218f8712d6b ("mm: handling Non-LRU pages returned by vm_normal_pages")
> Signed-off-by: Haiyue Wang <haiyue.wang@intel.com>
> ---
>  mm/huge_memory.c |  4 ++--
>  mm/ksm.c         | 12 +++++++++---
>  mm/migrate.c     | 10 +++++++---
>  3 files changed, 18 insertions(+), 8 deletions(-)
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 8a7c1b344abe..b2ba17c3dcd7 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2963,10 +2963,10 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
>  		/* FOLL_DUMP to ignore special (like zero) pages */
>  		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
>
> -		if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
> +		if (IS_ERR_OR_NULL(page))
>  			continue;
>
> -		if (!is_transparent_hugepage(page))
> +		if (is_zone_device_page(page) || !is_transparent_hugepage(page))
>  			goto next;
>
>  		total++;
> diff --git a/mm/ksm.c b/mm/ksm.c
> index 42ab153335a2..e26f57fc1f0e 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -475,7 +475,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
>  		cond_resched();
>  		page = follow_page(vma, addr,
>  				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
> -		if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
> +		if (IS_ERR_OR_NULL(page))
>  			break;
>  		if (PageKsm(page))
>  			ret = handle_mm_fault(vma, addr,
> @@ -560,12 +560,15 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
>  		goto out;
>
>  	page = follow_page(vma, addr, FOLL_GET);
> -	if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
> +	if (IS_ERR_OR_NULL(page))
>  		goto out;
> +	if (is_zone_device_page(page))

Same as for break_ksm() I think we should be able to drop the
is_zone_device_page() check here because scan_get_next_rmap_item()
already filters out zone device pages.
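
(By that I mean the check this patch itself adds to the scan loop:

			if (is_zone_device_page(*page))
				goto next_page;

which already screens out device pages before they reach this path.)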

> +		goto out_putpage;
>  	if (PageAnon(page)) {
>  		flush_anon_page(vma, page, addr);
>  		flush_dcache_page(page);
>  	} else {
> +out_putpage:
>  		put_page(page);
>  out:
>  		page = NULL;
> @@ -2308,11 +2311,13 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
>  			if (ksm_test_exit(mm))
>  				break;
>  			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
> -			if (IS_ERR_OR_NULL(*page) || is_zone_device_page(*page)) {
> +			if (IS_ERR_OR_NULL(*page)) {
>  				ksm_scan.address += PAGE_SIZE;
>  				cond_resched();
>  				continue;
>  			}
> +			if (is_zone_device_page(*page))
> +				goto next_page;
>  			if (PageAnon(*page)) {
>  				flush_anon_page(vma, *page, ksm_scan.address);
>  				flush_dcache_page(*page);
> @@ -2327,6 +2332,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
>  				mmap_read_unlock(mm);
>  				return rmap_item;
>  			}
> +next_page:
>  			put_page(*page);
>  			ksm_scan.address += PAGE_SIZE;
>  			cond_resched();
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 581dfaad9257..fee12cd2f294 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -1672,9 +1672,12 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
>  		goto out;
>
>  	err = -ENOENT;
> -	if (!page || is_zone_device_page(page))
> +	if (!page)
>  		goto out;
>
> +	if (is_zone_device_page(page))
> +		goto out_putpage;
> +
>  	err = 0;
>  	if (page_to_nid(page) == node)
>  		goto out_putpage;
> @@ -1868,8 +1871,9 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
>  		if (IS_ERR(page))
>  			goto set_status;
>
> -		if (page && !is_zone_device_page(page)) {
> -			err = page_to_nid(page);
> +		if (page) {
> +			err = !is_zone_device_page(page) ? page_to_nid(page)
> +							 : -ENOENT;

Can we remove the multiple layers of conditionals here? Something like
this is cleaner and easier to understand IMHO:

-               if (page && !is_zone_device_page(page)) {
-                       err = page_to_nid(page);
-                       if (foll_flags & FOLL_GET)
-                               put_page(page);
-               } else {
+               if (!page) {
                        err = -ENOENT;
+                       goto set_status;
                }
+
+               if (is_zone_device_page(page))
+                       err = -ENOENT;
+               else
+                       err = page_to_nid(page);
+
+               if (foll_flags & FOLL_GET)
+                       put_page(page);

Thanks.

 - Alistair

>  			if (foll_flags & FOLL_GET)
>  				put_page(page);
>  		} else {
Haiyue Wang Aug. 16, 2022, 1:12 a.m. UTC | #4
> -----Original Message-----
> From: Alistair Popple <apopple@nvidia.com>
> Sent: Tuesday, August 16, 2022 08:01
> To: Wang, Haiyue <haiyue.wang@intel.com>
> Cc: linux-mm@kvack.org; linux-kernel@vger.kernel.org; akpm@linux-foundation.org; david@redhat.com;
> linmiaohe@huawei.com; Huang, Ying <ying.huang@intel.com>; songmuchun@bytedance.com;
> naoya.horiguchi@linux.dev; alex.sierra@amd.com; Felix Kuehling <Felix.Kuehling@amd.com>
> Subject: Re: [PATCH v5 2/2] mm: fix the handling Non-LRU pages returned by follow_page
> 
> 
> Haiyue Wang <haiyue.wang@intel.com> writes:
> 
> > When follow_page() returns a Non-LRU page, the current handling jumps out
> > directly without calling put_page() to drop the reference count, even though
> > follow_page() was called with the 'FOLL_GET' flag and therefore took a
> > reference via get_page(). Fix the zone device page checks by handling the
> > page reference count correctly before returning.
> >
> > And as David pointed out in review, "device pages are never PageKsm pages".
> > Drop the zone device page check from break_ksm().
> >
> > Fixes: 3218f8712d6b ("mm: handling Non-LRU pages returned by vm_normal_pages")
> > Signed-off-by: Haiyue Wang <haiyue.wang@intel.com>
> > ---
> >  mm/huge_memory.c |  4 ++--
> >  mm/ksm.c         | 12 +++++++++---
> >  mm/migrate.c     | 10 +++++++---
> >  3 files changed, 18 insertions(+), 8 deletions(-)
> >
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 8a7c1b344abe..b2ba17c3dcd7 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -2963,10 +2963,10 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
> >  		/* FOLL_DUMP to ignore special (like zero) pages */
> >  		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
> >
> > -		if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
> > +		if (IS_ERR_OR_NULL(page))
> >  			continue;
> >
> > -		if (!is_transparent_hugepage(page))
> > +		if (is_zone_device_page(page) || !is_transparent_hugepage(page))
> >  			goto next;
> >
> >  		total++;
> > diff --git a/mm/ksm.c b/mm/ksm.c
> > index 42ab153335a2..e26f57fc1f0e 100644
> > --- a/mm/ksm.c
> > +++ b/mm/ksm.c
> > @@ -475,7 +475,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
> >  		cond_resched();
> >  		page = follow_page(vma, addr,
> >  				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
> > -		if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
> > +		if (IS_ERR_OR_NULL(page))
> >  			break;
> >  		if (PageKsm(page))
> >  			ret = handle_mm_fault(vma, addr,
> > @@ -560,12 +560,15 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
> >  		goto out;
> >
> >  	page = follow_page(vma, addr, FOLL_GET);
> > -	if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
> > +	if (IS_ERR_OR_NULL(page))
> >  		goto out;
> > +	if (is_zone_device_page(page))
> 
> Same as for break_ksm() I think we should be able to drop the
> is_zone_device_page() check here because scan_get_next_rmap_item()
> already filters out zone device pages.
> 

The 'page' for scan_get_next_rmap_item() comes from the scan loop's 'vma', which skips any vma that is NOT MERGEABLE:
	for (; vma; vma = vma->vm_next) {
		if (!(vma->vm_flags & VM_MERGEABLE))
			continue;

The 'page' for get_mergeable_page() comes from a 'vma' that is looked up independently as MERGEABLE by 'find_mergeable_vma()'.

So the two pages may be different, and unstable_tree_search_insert() shows the logic:

 'page' vs 'tree_page':

		tree_page = get_mergeable_page(tree_rmap_item);
		if (!tree_page)
			return NULL;

		/*
		 * Don't substitute a ksm page for a forked page.
		 */
		if (page == tree_page) {
			put_page(tree_page);
			return NULL;
		}

		ret = memcmp_pages(page, tree_page);
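
And find_mergeable_vma() only validates the vma itself, it never inspects the
page mapped there, so a zone device page can still be returned here. Roughly
(a simplified sketch, not the exact mm/ksm.c source):

	static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
							 unsigned long addr)
	{
		struct vm_area_struct *vma;

		if (ksm_test_exit(mm))
			return NULL;
		vma = vma_lookup(mm, addr);
		if (!vma || !(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
			return NULL;
		return vma;
	}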


> > +		goto out_putpage;
> >  	if (PageAnon(page)) {
> >  		flush_anon_page(vma, page, addr);
> >  		flush_dcache_page(page);
> >  	} else {
> > +out_putpage:
> >  		put_page(page);
> >  out:
> >  		page = NULL;
> > @@ -2308,11 +2311,13 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
> >  			if (ksm_test_exit(mm))
> >  				break;
> >  			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
> > -			if (IS_ERR_OR_NULL(*page) || is_zone_device_page(*page)) {
> > +			if (IS_ERR_OR_NULL(*page)) {
> >  				ksm_scan.address += PAGE_SIZE;
> >  				cond_resched();
> >  				continue;
> >  			}
> > +			if (is_zone_device_page(*page))
> > +				goto next_page;
> >  			if (PageAnon(*page)) {
> >  				flush_anon_page(vma, *page, ksm_scan.address);
> >  				flush_dcache_page(*page);
> > @@ -2327,6 +2332,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
> >  				mmap_read_unlock(mm);
> >  				return rmap_item;
> >  			}
> > +next_page:
> >  			put_page(*page);
> >  			ksm_scan.address += PAGE_SIZE;
> >  			cond_resched();
> > diff --git a/mm/migrate.c b/mm/migrate.c
> > index 581dfaad9257..fee12cd2f294 100644
> > --- a/mm/migrate.c
> > +++ b/mm/migrate.c
> > @@ -1672,9 +1672,12 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
> >  		goto out;
> >
> >  	err = -ENOENT;
> > -	if (!page || is_zone_device_page(page))
> > +	if (!page)
> >  		goto out;
> >
> > +	if (is_zone_device_page(page))
> > +		goto out_putpage;
> > +
> >  	err = 0;
> >  	if (page_to_nid(page) == node)
> >  		goto out_putpage;
> > @@ -1868,8 +1871,9 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
> >  		if (IS_ERR(page))
> >  			goto set_status;
> >
> > -		if (page && !is_zone_device_page(page)) {
> > -			err = page_to_nid(page);
> > +		if (page) {
> > +			err = !is_zone_device_page(page) ? page_to_nid(page)
> > +							 : -ENOENT;
> 
> Can we remove the multiple layers of conditionals here? Something like
> this is cleaner and easier to understand IMHO:

OK, I will try it in a new patch.

> 
> -               if (page && !is_zone_device_page(page)) {
> -                       err = page_to_nid(page);
> -                       if (foll_flags & FOLL_GET)
> -                               put_page(page);
> -               } else {
> +               if (!page) {
>                         err = -ENOENT;
> +                       goto set_status;
>                 }
> +
> +               if (is_zone_device_page(page))
> +                       err = -ENOENT;
> +               else
> +                       err = page_to_nid(page);
> +
> +               if (foll_flags & FOLL_GET)
> +                       put_page(page);
> 
> Thanks.
> 
>  - Alistair
> 
> >  			if (foll_flags & FOLL_GET)
> >  				put_page(page);
> >  		} else {
Alistair Popple Aug. 16, 2022, 2:45 a.m. UTC | #5
"Wang, Haiyue" <haiyue.wang@intel.com> writes:

>> -----Original Message-----
>> From: Alistair Popple <apopple@nvidia.com>
>> Sent: Tuesday, August 16, 2022 08:01
>> To: Wang, Haiyue <haiyue.wang@intel.com>
>> Cc: linux-mm@kvack.org; linux-kernel@vger.kernel.org; akpm@linux-foundation.org; david@redhat.com;
>> linmiaohe@huawei.com; Huang, Ying <ying.huang@intel.com>; songmuchun@bytedance.com;
>> naoya.horiguchi@linux.dev; alex.sierra@amd.com; Felix Kuehling <Felix.Kuehling@amd.com>
>> Subject: Re: [PATCH v5 2/2] mm: fix the handling Non-LRU pages returned by follow_page
>>
>>
>> Haiyue Wang <haiyue.wang@intel.com> writes:
>>
>> > When follow_page() returns a Non-LRU page, the current handling jumps out
>> > directly without calling put_page() to drop the reference count, even though
>> > follow_page() was called with the 'FOLL_GET' flag and therefore took a
>> > reference via get_page(). Fix the zone device page checks by handling the
>> > page reference count correctly before returning.
>> >
>> > And as David pointed out in review, "device pages are never PageKsm pages".
>> > Drop the zone device page check from break_ksm().
>> >
>> > Fixes: 3218f8712d6b ("mm: handling Non-LRU pages returned by vm_normal_pages")
>> > Signed-off-by: Haiyue Wang <haiyue.wang@intel.com>
>> > ---
>> >  mm/huge_memory.c |  4 ++--
>> >  mm/ksm.c         | 12 +++++++++---
>> >  mm/migrate.c     | 10 +++++++---
>> >  3 files changed, 18 insertions(+), 8 deletions(-)
>> >
>> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> > index 8a7c1b344abe..b2ba17c3dcd7 100644
>> > --- a/mm/huge_memory.c
>> > +++ b/mm/huge_memory.c
>> > @@ -2963,10 +2963,10 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
>> >  		/* FOLL_DUMP to ignore special (like zero) pages */
>> >  		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
>> >
>> > -		if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
>> > +		if (IS_ERR_OR_NULL(page))
>> >  			continue;
>> >
>> > -		if (!is_transparent_hugepage(page))
>> > +		if (is_zone_device_page(page) || !is_transparent_hugepage(page))
>> >  			goto next;
>> >
>> >  		total++;
>> > diff --git a/mm/ksm.c b/mm/ksm.c
>> > index 42ab153335a2..e26f57fc1f0e 100644
>> > --- a/mm/ksm.c
>> > +++ b/mm/ksm.c
>> > @@ -475,7 +475,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
>> >  		cond_resched();
>> >  		page = follow_page(vma, addr,
>> >  				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
>> > -		if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
>> > +		if (IS_ERR_OR_NULL(page))
>> >  			break;
>> >  		if (PageKsm(page))
>> >  			ret = handle_mm_fault(vma, addr,
>> > @@ -560,12 +560,15 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
>> >  		goto out;
>> >
>> >  	page = follow_page(vma, addr, FOLL_GET);
>> > -	if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
>> > +	if (IS_ERR_OR_NULL(page))
>> >  		goto out;
>> > +	if (is_zone_device_page(page))
>>
>> Same as for break_ksm() I think we should be able to drop the
>> is_zone_device_page() check here because scan_get_next_rmap_item()
>> already filters out zone device pages.
>>
>
> The 'page' for scan_get_next_rmap_item() comes from the scan loop's 'vma', which skips any vma that is NOT MERGEABLE:
> 	for (; vma; vma = vma->vm_next) {
> 		if (!(vma->vm_flags & VM_MERGEABLE))
> 			continue;
>
> The 'page' for get_mergeable_page() comes from a 'vma' that is looked up independently as MERGEABLE by 'find_mergeable_vma()'.

Oh, ok. I'm actually not too familiar with KSM, but I think I follow, so if
you think we need to keep the check, by all means do so.

> So the two pages may be different, and unstable_tree_search_insert() shows the logic:
>
>  'page' vs 'tree_page':
>
> 		tree_page = get_mergeable_page(tree_rmap_item);
> 		if (!tree_page)
> 			return NULL;
>
> 		/*
> 		 * Don't substitute a ksm page for a forked page.
> 		 */
> 		if (page == tree_page) {
> 			put_page(tree_page);
> 			return NULL;
> 		}
>
> 		ret = memcmp_pages(page, tree_page);
>
>
>> > +		goto out_putpage;
>> >  	if (PageAnon(page)) {
>> >  		flush_anon_page(vma, page, addr);
>> >  		flush_dcache_page(page);
>> >  	} else {
>> > +out_putpage:
>> >  		put_page(page);
>> >  out:
>> >  		page = NULL;
>> > @@ -2308,11 +2311,13 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
>> >  			if (ksm_test_exit(mm))
>> >  				break;
>> >  			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
>> > -			if (IS_ERR_OR_NULL(*page) || is_zone_device_page(*page)) {
>> > +			if (IS_ERR_OR_NULL(*page)) {
>> >  				ksm_scan.address += PAGE_SIZE;
>> >  				cond_resched();
>> >  				continue;
>> >  			}
>> > +			if (is_zone_device_page(*page))
>> > +				goto next_page;
>> >  			if (PageAnon(*page)) {
>> >  				flush_anon_page(vma, *page, ksm_scan.address);
>> >  				flush_dcache_page(*page);
>> > @@ -2327,6 +2332,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
>> >  				mmap_read_unlock(mm);
>> >  				return rmap_item;
>> >  			}
>> > +next_page:
>> >  			put_page(*page);
>> >  			ksm_scan.address += PAGE_SIZE;
>> >  			cond_resched();
>> > diff --git a/mm/migrate.c b/mm/migrate.c
>> > index 581dfaad9257..fee12cd2f294 100644
>> > --- a/mm/migrate.c
>> > +++ b/mm/migrate.c
>> > @@ -1672,9 +1672,12 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
>> >  		goto out;
>> >
>> >  	err = -ENOENT;
>> > -	if (!page || is_zone_device_page(page))
>> > +	if (!page)
>> >  		goto out;
>> >
>> > +	if (is_zone_device_page(page))
>> > +		goto out_putpage;
>> > +
>> >  	err = 0;
>> >  	if (page_to_nid(page) == node)
>> >  		goto out_putpage;
>> > @@ -1868,8 +1871,9 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
>> >  		if (IS_ERR(page))
>> >  			goto set_status;
>> >
>> > -		if (page && !is_zone_device_page(page)) {
>> > -			err = page_to_nid(page);
>> > +		if (page) {
>> > +			err = !is_zone_device_page(page) ? page_to_nid(page)
>> > +							 : -ENOENT;
>>
>> Can we remove the multiple layers of conditionals here? Something like
>> this is cleaner and easier to understand IMHO:
>
> OK, I will try it in a new patch.

Thanks.

>>
>> -               if (page && !is_zone_device_page(page)) {
>> -                       err = page_to_nid(page);
>> -                       if (foll_flags & FOLL_GET)
>> -                               put_page(page);
>> -               } else {
>> +               if (!page) {
>>                         err = -ENOENT;
>> +                       goto set_status;
>>                 }
>> +
>> +               if (is_zone_device_page(page))
>> +                       err = -ENOENT;
>> +               else
>> +                       err = page_to_nid(page);
>> +
>> +               if (foll_flags & FOLL_GET)
>> +                       put_page(page);
>>
>> Thanks.
>>
>>  - Alistair
>>
>> >  			if (foll_flags & FOLL_GET)
>> >  				put_page(page);
>> >  		} else {

Patch

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8a7c1b344abe..b2ba17c3dcd7 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2963,10 +2963,10 @@  static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
 		/* FOLL_DUMP to ignore special (like zero) pages */
 		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
 
-		if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
+		if (IS_ERR_OR_NULL(page))
 			continue;
 
-		if (!is_transparent_hugepage(page))
+		if (is_zone_device_page(page) || !is_transparent_hugepage(page))
 			goto next;
 
 		total++;
diff --git a/mm/ksm.c b/mm/ksm.c
index 42ab153335a2..e26f57fc1f0e 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -475,7 +475,7 @@  static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
 		cond_resched();
 		page = follow_page(vma, addr,
 				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
-		if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
+		if (IS_ERR_OR_NULL(page))
 			break;
 		if (PageKsm(page))
 			ret = handle_mm_fault(vma, addr,
@@ -560,12 +560,15 @@  static struct page *get_mergeable_page(struct rmap_item *rmap_item)
 		goto out;
 
 	page = follow_page(vma, addr, FOLL_GET);
-	if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
+	if (IS_ERR_OR_NULL(page))
 		goto out;
+	if (is_zone_device_page(page))
+		goto out_putpage;
 	if (PageAnon(page)) {
 		flush_anon_page(vma, page, addr);
 		flush_dcache_page(page);
 	} else {
+out_putpage:
 		put_page(page);
 out:
 		page = NULL;
@@ -2308,11 +2311,13 @@  static struct rmap_item *scan_get_next_rmap_item(struct page **page)
 			if (ksm_test_exit(mm))
 				break;
 			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
-			if (IS_ERR_OR_NULL(*page) || is_zone_device_page(*page)) {
+			if (IS_ERR_OR_NULL(*page)) {
 				ksm_scan.address += PAGE_SIZE;
 				cond_resched();
 				continue;
 			}
+			if (is_zone_device_page(*page))
+				goto next_page;
 			if (PageAnon(*page)) {
 				flush_anon_page(vma, *page, ksm_scan.address);
 				flush_dcache_page(*page);
@@ -2327,6 +2332,7 @@  static struct rmap_item *scan_get_next_rmap_item(struct page **page)
 				mmap_read_unlock(mm);
 				return rmap_item;
 			}
+next_page:
 			put_page(*page);
 			ksm_scan.address += PAGE_SIZE;
 			cond_resched();
diff --git a/mm/migrate.c b/mm/migrate.c
index 581dfaad9257..fee12cd2f294 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1672,9 +1672,12 @@  static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
 		goto out;
 
 	err = -ENOENT;
-	if (!page || is_zone_device_page(page))
+	if (!page)
 		goto out;
 
+	if (is_zone_device_page(page))
+		goto out_putpage;
+
 	err = 0;
 	if (page_to_nid(page) == node)
 		goto out_putpage;
@@ -1868,8 +1871,9 @@  static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
 		if (IS_ERR(page))
 			goto set_status;
 
-		if (page && !is_zone_device_page(page)) {
-			err = page_to_nid(page);
+		if (page) {
+			err = !is_zone_device_page(page) ? page_to_nid(page)
+							 : -ENOENT;
 			if (foll_flags & FOLL_GET)
 				put_page(page);
 		} else {