diff mbox series

[RFC,2/6] mm/migrate_pages: split unmap_and_move() to _unmap() and _move()

Message ID 20220921060616.73086-3-ying.huang@intel.com (mailing list archive)
State New
Headers show
Series migrate_pages(): batch TLB flushing | expand

Commit Message

Huang, Ying Sept. 21, 2022, 6:06 a.m. UTC
This is a preparation patch to batch the page unmapping and moving
for the normal pages and THP.

In this patch, unmap_and_move() is split to migrate_page_unmap() and
migrate_page_move().  So, we can batch _unmap() and _move() in
different loops later.  To pass some information between unmap and
move, the original unused newpage->mapping and newpage->private are
used.

Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Matthew Wilcox <willy@infradead.org>
---
 mm/migrate.c | 164 ++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 122 insertions(+), 42 deletions(-)

Comments

Zi Yan Sept. 21, 2022, 4:08 p.m. UTC | #1
On 21 Sep 2022, at 2:06, Huang Ying wrote:

> This is a preparation patch to batch the page unmapping and moving
> for the normal pages and THP.
>
> In this patch, unmap_and_move() is split to migrate_page_unmap() and
> migrate_page_move().  So, we can batch _unmap() and _move() in
> different loops later.  To pass some information between unmap and
> move, the original unused newpage->mapping and newpage->private are
> used.
>
> Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
> Cc: Zi Yan <ziy@nvidia.com>
> Cc: Yang Shi <shy828301@gmail.com>
> Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
> Cc: Oscar Salvador <osalvador@suse.de>
> Cc: Matthew Wilcox <willy@infradead.org>
> ---
>  mm/migrate.c | 164 ++++++++++++++++++++++++++++++++++++++-------------
>  1 file changed, 122 insertions(+), 42 deletions(-)
>
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 117134f1c6dc..4a81e0bfdbcd 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -976,13 +976,32 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
>  	return rc;
>  }
>
> -static int __unmap_and_move(struct page *page, struct page *newpage,
> +static void __migrate_page_record(struct page *newpage,
> +				  int page_was_mapped,
> +				  struct anon_vma *anon_vma)
> +{
> +	newpage->mapping = (struct address_space *)anon_vma;
> +	newpage->private = page_was_mapped;
> +}
> +
> +static void __migrate_page_extract(struct page *newpage,
> +				   int *page_was_mappedp,
> +				   struct anon_vma **anon_vmap)
> +{
> +	*anon_vmap = (struct anon_vma *)newpage->mapping;
> +	*page_was_mappedp = newpage->private;
> +	newpage->mapping = NULL;
> +	newpage->private = 0;
> +}
> +
> +#define MIGRATEPAGE_UNMAP		1

It is better to move this to migrate.h with MIGRATEPAGE_SUCCESS and
make them an enum.

> +
> +static int __migrate_page_unmap(struct page *page, struct page *newpage,
>  				int force, enum migrate_mode mode)
>  {
>  	struct folio *folio = page_folio(page);
> -	struct folio *dst = page_folio(newpage);
>  	int rc = -EAGAIN;
> -	bool page_was_mapped = false;
> +	int page_was_mapped = 0;
>  	struct anon_vma *anon_vma = NULL;
>  	bool is_lru = !__PageMovable(page);
>
> @@ -1058,8 +1077,8 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
>  		goto out_unlock;
>
>  	if (unlikely(!is_lru)) {
> -		rc = move_to_new_folio(dst, folio, mode);
> -		goto out_unlock_both;
> +		__migrate_page_record(newpage, page_was_mapped, anon_vma);
> +		return MIGRATEPAGE_UNMAP;
>  	}
>
>  	/*
> @@ -1085,11 +1104,41 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
>  		VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
>  				page);
>  		try_to_migrate(folio, 0);
> -		page_was_mapped = true;
> +		page_was_mapped = 1;
> +	}
> +
> +	if (!page_mapped(page)) {
> +		__migrate_page_record(newpage, page_was_mapped, anon_vma);
> +		return MIGRATEPAGE_UNMAP;
>  	}
>
> -	if (!page_mapped(page))
> -		rc = move_to_new_folio(dst, folio, mode);
> +	if (page_was_mapped)
> +		remove_migration_ptes(folio, folio, false);
> +
> +out_unlock_both:
> +	unlock_page(newpage);
> +out_unlock:
> +	/* Drop an anon_vma reference if we took one */
> +	if (anon_vma)
> +		put_anon_vma(anon_vma);
> +	unlock_page(page);
> +out:
> +
> +	return rc;
> +}
> +
> +static int __migrate_page_move(struct page *page, struct page *newpage,
> +			       enum migrate_mode mode)
> +{
> +	struct folio *folio = page_folio(page);
> +	struct folio *dst = page_folio(newpage);
> +	int rc;
> +	int page_was_mapped = 0;
> +	struct anon_vma *anon_vma = NULL;
> +
> +	__migrate_page_extract(newpage, &page_was_mapped, &anon_vma);
> +
> +	rc = move_to_new_folio(dst, folio, mode);
>
>  	/*
>  	 * When successful, push newpage to LRU immediately: so that if it
> @@ -1110,14 +1159,11 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
>  		remove_migration_ptes(folio,
>  			rc == MIGRATEPAGE_SUCCESS ? dst : folio, false);
>
> -out_unlock_both:
>  	unlock_page(newpage);
> -out_unlock:
>  	/* Drop an anon_vma reference if we took one */
>  	if (anon_vma)
>  		put_anon_vma(anon_vma);
>  	unlock_page(page);
> -out:
>  	/*
>  	 * If migration is successful, decrease refcount of the newpage,
>  	 * which will not free the page because new page owner increased
> @@ -1129,18 +1175,31 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
>  	return rc;
>  }
>
> -/*
> - * Obtain the lock on page, remove all ptes and migrate the page
> - * to the newly allocated page in newpage.
> - */
> -static int unmap_and_move(new_page_t get_new_page,
> -				   free_page_t put_new_page,
> -				   unsigned long private, struct page *page,
> -				   int force, enum migrate_mode mode,
> -				   enum migrate_reason reason,
> -				   struct list_head *ret)
> +static void migrate_page_done(struct page *page,
> +			      enum migrate_reason reason)
> +{
> +	/*
> +	 * Compaction can migrate also non-LRU pages which are
> +	 * not accounted to NR_ISOLATED_*. They can be recognized
> +	 * as __PageMovable
> +	 */
> +	if (likely(!__PageMovable(page)))
> +		mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
> +				    page_is_file_lru(page), -thp_nr_pages(page));
> +
> +	if (reason != MR_MEMORY_FAILURE)
> +		/* We release the page in page_handle_poison. */
> +		put_page(page);
> +}
> +
> +/* Obtain the lock on page, remove all ptes. */
> +static int migrate_page_unmap(new_page_t get_new_page, free_page_t put_new_page,
> +			      unsigned long private, struct page *page,
> +			      struct page **newpagep, int force,
> +			      enum migrate_mode mode, enum migrate_reason reason,
> +			      struct list_head *ret)
>  {
> -	int rc = MIGRATEPAGE_SUCCESS;
> +	int rc = MIGRATEPAGE_UNMAP;
>  	struct page *newpage = NULL;
>
>  	if (!thp_migration_supported() && PageTransHuge(page))
> @@ -1151,19 +1210,48 @@ static int unmap_and_move(new_page_t get_new_page,
>  		ClearPageActive(page);
>  		ClearPageUnevictable(page);
>  		/* free_pages_prepare() will clear PG_isolated. */
> -		goto out;
> +		list_del(&page->lru);
> +		migrate_page_done(page, reason);
> +		return MIGRATEPAGE_SUCCESS;
>  	}
>
>  	newpage = get_new_page(page, private);
>  	if (!newpage)
>  		return -ENOMEM;
> +	*newpagep = newpage;
>
> -	newpage->private = 0;
> -	rc = __unmap_and_move(page, newpage, force, mode);
> +	rc = __migrate_page_unmap(page, newpage, force, mode);
> +	if (rc == MIGRATEPAGE_UNMAP)
> +		return rc;
> +
> +	/*
> +	 * A page that has not been migrated will have kept its
> +	 * references and be restored.
> +	 */
> +	/* restore the page to right list. */
> +	if (rc != -EAGAIN)
> +		list_move_tail(&page->lru, ret);
> +
> +	if (put_new_page)
> +		put_new_page(newpage, private);
> +	else
> +		put_page(newpage);
> +
> +	return rc;
> +}
> +
> +/* Migrate the page to the newly allocated page in newpage. */
> +static int migrate_page_move(free_page_t put_new_page, unsigned long private,
> +			     struct page *page, struct page *newpage,
> +			     enum migrate_mode mode, enum migrate_reason reason,
> +			     struct list_head *ret)
> +{
> +	int rc;
> +
> +	rc = __migrate_page_move(page, newpage, mode);
>  	if (rc == MIGRATEPAGE_SUCCESS)
>  		set_page_owner_migrate_reason(newpage, reason);
>
> -out:
>  	if (rc != -EAGAIN) {
>  		/*
>  		 * A page that has been migrated has all references
> @@ -1179,20 +1267,7 @@ static int unmap_and_move(new_page_t get_new_page,
>  	 * we want to retry.
>  	 */
>  	if (rc == MIGRATEPAGE_SUCCESS) {
> -		/*
> -		 * Compaction can migrate also non-LRU pages which are
> -		 * not accounted to NR_ISOLATED_*. They can be recognized
> -		 * as __PageMovable
> -		 */
> -		if (likely(!__PageMovable(page)))
> -			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
> -					page_is_file_lru(page), -thp_nr_pages(page));
> -
> -		if (reason != MR_MEMORY_FAILURE)
> -			/*
> -			 * We release the page in page_handle_poison.
> -			 */
> -			put_page(page);
> +		migrate_page_done(page, reason);
>  	} else {
>  		if (rc != -EAGAIN)
>  			list_add_tail(&page->lru, ret);
> @@ -1405,6 +1480,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
>  	int pass = 0;
>  	bool is_thp = false;
>  	struct page *page;
> +	struct page *newpage = NULL;
>  	struct page *page2;
>  	int rc, nr_subpages;
>  	LIST_HEAD(ret_pages);
> @@ -1493,9 +1569,13 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
>  			if (PageHuge(page))
>  				continue;
>
> -			rc = unmap_and_move(get_new_page, put_new_page,
> -						private, page, pass > 2, mode,
> +			rc = migrate_page_unmap(get_new_page, put_new_page, private,
> +						page, &newpage, pass > 2, mode,
>  						reason, &ret_pages);
> +			if (rc == MIGRATEPAGE_UNMAP)
> +				rc = migrate_page_move(put_new_page, private,
> +						       page, newpage, mode,
> +						       reason, &ret_pages);
>  			/*
>  			 * The rules are:
>  			 *	Success: page will be freed
> -- 
> 2.35.1


--
Best Regards,
Yan, Zi
Huang, Ying Sept. 22, 2022, 1:15 a.m. UTC | #2
Zi Yan <ziy@nvidia.com> writes:

> On 21 Sep 2022, at 2:06, Huang Ying wrote:
>
>> This is a preparation patch to batch the page unmapping and moving
>> for the normal pages and THP.
>>
>> In this patch, unmap_and_move() is split to migrate_page_unmap() and
>> migrate_page_move().  So, we can batch _unmap() and _move() in
>> different loops later.  To pass some information between unmap and
>> move, the original unused newpage->mapping and newpage->private are
>> used.
>>
>> Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
>> Cc: Zi Yan <ziy@nvidia.com>
>> Cc: Yang Shi <shy828301@gmail.com>
>> Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
>> Cc: Oscar Salvador <osalvador@suse.de>
>> Cc: Matthew Wilcox <willy@infradead.org>
>> ---
>>  mm/migrate.c | 164 ++++++++++++++++++++++++++++++++++++++-------------
>>  1 file changed, 122 insertions(+), 42 deletions(-)
>>
>> diff --git a/mm/migrate.c b/mm/migrate.c
>> index 117134f1c6dc..4a81e0bfdbcd 100644
>> --- a/mm/migrate.c
>> +++ b/mm/migrate.c
>> @@ -976,13 +976,32 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
>>  	return rc;
>>  }
>>
>> -static int __unmap_and_move(struct page *page, struct page *newpage,
>> +static void __migrate_page_record(struct page *newpage,
>> +				  int page_was_mapped,
>> +				  struct anon_vma *anon_vma)
>> +{
>> +	newpage->mapping = (struct address_space *)anon_vma;
>> +	newpage->private = page_was_mapped;
>> +}
>> +
>> +static void __migrate_page_extract(struct page *newpage,
>> +				   int *page_was_mappedp,
>> +				   struct anon_vma **anon_vmap)
>> +{
>> +	*anon_vmap = (struct anon_vma *)newpage->mapping;
>> +	*page_was_mappedp = newpage->private;
>> +	newpage->mapping = NULL;
>> +	newpage->private = 0;
>> +}
>> +
>> +#define MIGRATEPAGE_UNMAP		1
>
> It is better to move this to migrate.h with MIGRATEPAGE_SUCCESS and
> make them an enum.

Make sense!  Will do this in the next version.

Best Regards,
Huang, Ying
Baolin Wang Sept. 22, 2022, 6:36 a.m. UTC | #3
On 9/21/2022 2:06 PM, Huang Ying wrote:
> This is a preparation patch to batch the page unmapping and moving
> for the normal pages and THP.
> 
> In this patch, unmap_and_move() is split to migrate_page_unmap() and
> migrate_page_move().  So, we can batch _unmap() and _move() in
> different loops later.  To pass some information between unmap and
> move, the original unused newpage->mapping and newpage->private are
> used.
> 
> Signed-off-by: "Huang, Ying" <ying.huang@intel.com> > Cc: Zi Yan <ziy@nvidia.com>
> Cc: Yang Shi <shy828301@gmail.com>
> Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
> Cc: Oscar Salvador <osalvador@suse.de>
> Cc: Matthew Wilcox <willy@infradead.org>
> ---

Looks good to me.
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Alistair Popple Sept. 26, 2022, 9:28 a.m. UTC | #4
Huang Ying <ying.huang@intel.com> writes:

> This is a preparation patch to batch the page unmapping and moving
> for the normal pages and THP.
>
> In this patch, unmap_and_move() is split to migrate_page_unmap() and
> migrate_page_move().  So, we can batch _unmap() and _move() in
> different loops later.  To pass some information between unmap and
> move, the original unused newpage->mapping and newpage->private are
> used.

This looks like it could cause a deadlock between two threads migrating
the same pages if force == true && mode != MIGRATE_ASYNC as
migrate_page_unmap() will call lock_page() while holding the lock on
other pages in the list. Therefore the two threads could deadlock if the
pages are in a different order.

> Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
> Cc: Zi Yan <ziy@nvidia.com>
> Cc: Yang Shi <shy828301@gmail.com>
> Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
> Cc: Oscar Salvador <osalvador@suse.de>
> Cc: Matthew Wilcox <willy@infradead.org>
> ---
>  mm/migrate.c | 164 ++++++++++++++++++++++++++++++++++++++-------------
>  1 file changed, 122 insertions(+), 42 deletions(-)
>
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 117134f1c6dc..4a81e0bfdbcd 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -976,13 +976,32 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
>  	return rc;
>  }
>
> -static int __unmap_and_move(struct page *page, struct page *newpage,
> +static void __migrate_page_record(struct page *newpage,
> +				  int page_was_mapped,
> +				  struct anon_vma *anon_vma)
> +{
> +	newpage->mapping = (struct address_space *)anon_vma;
> +	newpage->private = page_was_mapped;
> +}
> +
> +static void __migrate_page_extract(struct page *newpage,
> +				   int *page_was_mappedp,
> +				   struct anon_vma **anon_vmap)
> +{
> +	*anon_vmap = (struct anon_vma *)newpage->mapping;
> +	*page_was_mappedp = newpage->private;
> +	newpage->mapping = NULL;
> +	newpage->private = 0;
> +}
> +
> +#define MIGRATEPAGE_UNMAP		1
> +
> +static int __migrate_page_unmap(struct page *page, struct page *newpage,
>  				int force, enum migrate_mode mode)
>  {
>  	struct folio *folio = page_folio(page);
> -	struct folio *dst = page_folio(newpage);
>  	int rc = -EAGAIN;
> -	bool page_was_mapped = false;
> +	int page_was_mapped = 0;
>  	struct anon_vma *anon_vma = NULL;
>  	bool is_lru = !__PageMovable(page);
>
> @@ -1058,8 +1077,8 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
>  		goto out_unlock;
>
>  	if (unlikely(!is_lru)) {
> -		rc = move_to_new_folio(dst, folio, mode);
> -		goto out_unlock_both;
> +		__migrate_page_record(newpage, page_was_mapped, anon_vma);
> +		return MIGRATEPAGE_UNMAP;
>  	}
>
>  	/*
> @@ -1085,11 +1104,41 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
>  		VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
>  				page);
>  		try_to_migrate(folio, 0);
> -		page_was_mapped = true;
> +		page_was_mapped = 1;
> +	}
> +
> +	if (!page_mapped(page)) {
> +		__migrate_page_record(newpage, page_was_mapped, anon_vma);
> +		return MIGRATEPAGE_UNMAP;
>  	}
>
> -	if (!page_mapped(page))
> -		rc = move_to_new_folio(dst, folio, mode);
> +	if (page_was_mapped)
> +		remove_migration_ptes(folio, folio, false);
> +
> +out_unlock_both:
> +	unlock_page(newpage);
> +out_unlock:
> +	/* Drop an anon_vma reference if we took one */
> +	if (anon_vma)
> +		put_anon_vma(anon_vma);
> +	unlock_page(page);
> +out:
> +
> +	return rc;
> +}
> +
> +static int __migrate_page_move(struct page *page, struct page *newpage,
> +			       enum migrate_mode mode)
> +{
> +	struct folio *folio = page_folio(page);
> +	struct folio *dst = page_folio(newpage);
> +	int rc;
> +	int page_was_mapped = 0;
> +	struct anon_vma *anon_vma = NULL;
> +
> +	__migrate_page_extract(newpage, &page_was_mapped, &anon_vma);
> +
> +	rc = move_to_new_folio(dst, folio, mode);
>
>  	/*
>  	 * When successful, push newpage to LRU immediately: so that if it
> @@ -1110,14 +1159,11 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
>  		remove_migration_ptes(folio,
>  			rc == MIGRATEPAGE_SUCCESS ? dst : folio, false);
>
> -out_unlock_both:
>  	unlock_page(newpage);
> -out_unlock:
>  	/* Drop an anon_vma reference if we took one */
>  	if (anon_vma)
>  		put_anon_vma(anon_vma);
>  	unlock_page(page);
> -out:
>  	/*
>  	 * If migration is successful, decrease refcount of the newpage,
>  	 * which will not free the page because new page owner increased
> @@ -1129,18 +1175,31 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
>  	return rc;
>  }
>
> -/*
> - * Obtain the lock on page, remove all ptes and migrate the page
> - * to the newly allocated page in newpage.
> - */
> -static int unmap_and_move(new_page_t get_new_page,
> -				   free_page_t put_new_page,
> -				   unsigned long private, struct page *page,
> -				   int force, enum migrate_mode mode,
> -				   enum migrate_reason reason,
> -				   struct list_head *ret)
> +static void migrate_page_done(struct page *page,
> +			      enum migrate_reason reason)
> +{
> +	/*
> +	 * Compaction can migrate also non-LRU pages which are
> +	 * not accounted to NR_ISOLATED_*. They can be recognized
> +	 * as __PageMovable
> +	 */
> +	if (likely(!__PageMovable(page)))
> +		mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
> +				    page_is_file_lru(page), -thp_nr_pages(page));
> +
> +	if (reason != MR_MEMORY_FAILURE)
> +		/* We release the page in page_handle_poison. */
> +		put_page(page);
> +}
> +
> +/* Obtain the lock on page, remove all ptes. */
> +static int migrate_page_unmap(new_page_t get_new_page, free_page_t put_new_page,
> +			      unsigned long private, struct page *page,
> +			      struct page **newpagep, int force,
> +			      enum migrate_mode mode, enum migrate_reason reason,
> +			      struct list_head *ret)
>  {
> -	int rc = MIGRATEPAGE_SUCCESS;
> +	int rc = MIGRATEPAGE_UNMAP;
>  	struct page *newpage = NULL;
>
>  	if (!thp_migration_supported() && PageTransHuge(page))
> @@ -1151,19 +1210,48 @@ static int unmap_and_move(new_page_t get_new_page,
>  		ClearPageActive(page);
>  		ClearPageUnevictable(page);
>  		/* free_pages_prepare() will clear PG_isolated. */
> -		goto out;
> +		list_del(&page->lru);
> +		migrate_page_done(page, reason);
> +		return MIGRATEPAGE_SUCCESS;
>  	}
>
>  	newpage = get_new_page(page, private);
>  	if (!newpage)
>  		return -ENOMEM;
> +	*newpagep = newpage;
>
> -	newpage->private = 0;
> -	rc = __unmap_and_move(page, newpage, force, mode);
> +	rc = __migrate_page_unmap(page, newpage, force, mode);
> +	if (rc == MIGRATEPAGE_UNMAP)
> +		return rc;
> +
> +	/*
> +	 * A page that has not been migrated will have kept its
> +	 * references and be restored.
> +	 */
> +	/* restore the page to right list. */
> +	if (rc != -EAGAIN)
> +		list_move_tail(&page->lru, ret);
> +
> +	if (put_new_page)
> +		put_new_page(newpage, private);
> +	else
> +		put_page(newpage);
> +
> +	return rc;
> +}
> +
> +/* Migrate the page to the newly allocated page in newpage. */
> +static int migrate_page_move(free_page_t put_new_page, unsigned long private,
> +			     struct page *page, struct page *newpage,
> +			     enum migrate_mode mode, enum migrate_reason reason,
> +			     struct list_head *ret)
> +{
> +	int rc;
> +
> +	rc = __migrate_page_move(page, newpage, mode);
>  	if (rc == MIGRATEPAGE_SUCCESS)
>  		set_page_owner_migrate_reason(newpage, reason);
>
> -out:
>  	if (rc != -EAGAIN) {
>  		/*
>  		 * A page that has been migrated has all references
> @@ -1179,20 +1267,7 @@ static int unmap_and_move(new_page_t get_new_page,
>  	 * we want to retry.
>  	 */
>  	if (rc == MIGRATEPAGE_SUCCESS) {
> -		/*
> -		 * Compaction can migrate also non-LRU pages which are
> -		 * not accounted to NR_ISOLATED_*. They can be recognized
> -		 * as __PageMovable
> -		 */
> -		if (likely(!__PageMovable(page)))
> -			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
> -					page_is_file_lru(page), -thp_nr_pages(page));
> -
> -		if (reason != MR_MEMORY_FAILURE)
> -			/*
> -			 * We release the page in page_handle_poison.
> -			 */
> -			put_page(page);
> +		migrate_page_done(page, reason);
>  	} else {
>  		if (rc != -EAGAIN)
>  			list_add_tail(&page->lru, ret);
> @@ -1405,6 +1480,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
>  	int pass = 0;
>  	bool is_thp = false;
>  	struct page *page;
> +	struct page *newpage = NULL;
>  	struct page *page2;
>  	int rc, nr_subpages;
>  	LIST_HEAD(ret_pages);
> @@ -1493,9 +1569,13 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
>  			if (PageHuge(page))
>  				continue;
>
> -			rc = unmap_and_move(get_new_page, put_new_page,
> -						private, page, pass > 2, mode,
> +			rc = migrate_page_unmap(get_new_page, put_new_page, private,
> +						page, &newpage, pass > 2, mode,
>  						reason, &ret_pages);
> +			if (rc == MIGRATEPAGE_UNMAP)
> +				rc = migrate_page_move(put_new_page, private,
> +						       page, newpage, mode,
> +						       reason, &ret_pages);
>  			/*
>  			 * The rules are:
>  			 *	Success: page will be freed
Yang Shi Sept. 26, 2022, 6:06 p.m. UTC | #5
On Mon, Sep 26, 2022 at 2:37 AM Alistair Popple <apopple@nvidia.com> wrote:
>
>
> Huang Ying <ying.huang@intel.com> writes:
>
> > This is a preparation patch to batch the page unmapping and moving
> > for the normal pages and THP.
> >
> > In this patch, unmap_and_move() is split to migrate_page_unmap() and
> > migrate_page_move().  So, we can batch _unmap() and _move() in
> > different loops later.  To pass some information between unmap and
> > move, the original unused newpage->mapping and newpage->private are
> > used.
>
> This looks like it could cause a deadlock between two threads migrating
> the same pages if force == true && mode != MIGRATE_ASYNC as
> migrate_page_unmap() will call lock_page() while holding the lock on
> other pages in the list. Therefore the two threads could deadlock if the
> pages are in a different order.

It seems unlikely to me since the page has to be isolated from lru
before migration. The isolating from lru is atomic, so the two threads
unlikely see the same pages on both lists.

But there might be other cases which may incur deadlock, for example,
filesystem writeback IIUC. Some filesystems may lock a bunch of pages
then write them back in a batch. The same pages may be on the
migration list and they are also dirty and seen by writeback. I'm not
sure whether I miss something that could prevent such a deadlock from
happening.

>
> > Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
> > Cc: Zi Yan <ziy@nvidia.com>
> > Cc: Yang Shi <shy828301@gmail.com>
> > Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
> > Cc: Oscar Salvador <osalvador@suse.de>
> > Cc: Matthew Wilcox <willy@infradead.org>
> > ---
> >  mm/migrate.c | 164 ++++++++++++++++++++++++++++++++++++++-------------
> >  1 file changed, 122 insertions(+), 42 deletions(-)
> >
> > diff --git a/mm/migrate.c b/mm/migrate.c
> > index 117134f1c6dc..4a81e0bfdbcd 100644
> > --- a/mm/migrate.c
> > +++ b/mm/migrate.c
> > @@ -976,13 +976,32 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
> >       return rc;
> >  }
> >
> > -static int __unmap_and_move(struct page *page, struct page *newpage,
> > +static void __migrate_page_record(struct page *newpage,
> > +                               int page_was_mapped,
> > +                               struct anon_vma *anon_vma)
> > +{
> > +     newpage->mapping = (struct address_space *)anon_vma;
> > +     newpage->private = page_was_mapped;
> > +}
> > +
> > +static void __migrate_page_extract(struct page *newpage,
> > +                                int *page_was_mappedp,
> > +                                struct anon_vma **anon_vmap)
> > +{
> > +     *anon_vmap = (struct anon_vma *)newpage->mapping;
> > +     *page_was_mappedp = newpage->private;
> > +     newpage->mapping = NULL;
> > +     newpage->private = 0;
> > +}
> > +
> > +#define MIGRATEPAGE_UNMAP            1
> > +
> > +static int __migrate_page_unmap(struct page *page, struct page *newpage,
> >                               int force, enum migrate_mode mode)
> >  {
> >       struct folio *folio = page_folio(page);
> > -     struct folio *dst = page_folio(newpage);
> >       int rc = -EAGAIN;
> > -     bool page_was_mapped = false;
> > +     int page_was_mapped = 0;
> >       struct anon_vma *anon_vma = NULL;
> >       bool is_lru = !__PageMovable(page);
> >
> > @@ -1058,8 +1077,8 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
> >               goto out_unlock;
> >
> >       if (unlikely(!is_lru)) {
> > -             rc = move_to_new_folio(dst, folio, mode);
> > -             goto out_unlock_both;
> > +             __migrate_page_record(newpage, page_was_mapped, anon_vma);
> > +             return MIGRATEPAGE_UNMAP;
> >       }
> >
> >       /*
> > @@ -1085,11 +1104,41 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
> >               VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
> >                               page);
> >               try_to_migrate(folio, 0);
> > -             page_was_mapped = true;
> > +             page_was_mapped = 1;
> > +     }
> > +
> > +     if (!page_mapped(page)) {
> > +             __migrate_page_record(newpage, page_was_mapped, anon_vma);
> > +             return MIGRATEPAGE_UNMAP;
> >       }
> >
> > -     if (!page_mapped(page))
> > -             rc = move_to_new_folio(dst, folio, mode);
> > +     if (page_was_mapped)
> > +             remove_migration_ptes(folio, folio, false);
> > +
> > +out_unlock_both:
> > +     unlock_page(newpage);
> > +out_unlock:
> > +     /* Drop an anon_vma reference if we took one */
> > +     if (anon_vma)
> > +             put_anon_vma(anon_vma);
> > +     unlock_page(page);
> > +out:
> > +
> > +     return rc;
> > +}
> > +
> > +static int __migrate_page_move(struct page *page, struct page *newpage,
> > +                            enum migrate_mode mode)
> > +{
> > +     struct folio *folio = page_folio(page);
> > +     struct folio *dst = page_folio(newpage);
> > +     int rc;
> > +     int page_was_mapped = 0;
> > +     struct anon_vma *anon_vma = NULL;
> > +
> > +     __migrate_page_extract(newpage, &page_was_mapped, &anon_vma);
> > +
> > +     rc = move_to_new_folio(dst, folio, mode);
> >
> >       /*
> >        * When successful, push newpage to LRU immediately: so that if it
> > @@ -1110,14 +1159,11 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
> >               remove_migration_ptes(folio,
> >                       rc == MIGRATEPAGE_SUCCESS ? dst : folio, false);
> >
> > -out_unlock_both:
> >       unlock_page(newpage);
> > -out_unlock:
> >       /* Drop an anon_vma reference if we took one */
> >       if (anon_vma)
> >               put_anon_vma(anon_vma);
> >       unlock_page(page);
> > -out:
> >       /*
> >        * If migration is successful, decrease refcount of the newpage,
> >        * which will not free the page because new page owner increased
> > @@ -1129,18 +1175,31 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
> >       return rc;
> >  }
> >
> > -/*
> > - * Obtain the lock on page, remove all ptes and migrate the page
> > - * to the newly allocated page in newpage.
> > - */
> > -static int unmap_and_move(new_page_t get_new_page,
> > -                                free_page_t put_new_page,
> > -                                unsigned long private, struct page *page,
> > -                                int force, enum migrate_mode mode,
> > -                                enum migrate_reason reason,
> > -                                struct list_head *ret)
> > +static void migrate_page_done(struct page *page,
> > +                           enum migrate_reason reason)
> > +{
> > +     /*
> > +      * Compaction can migrate also non-LRU pages which are
> > +      * not accounted to NR_ISOLATED_*. They can be recognized
> > +      * as __PageMovable
> > +      */
> > +     if (likely(!__PageMovable(page)))
> > +             mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
> > +                                 page_is_file_lru(page), -thp_nr_pages(page));
> > +
> > +     if (reason != MR_MEMORY_FAILURE)
> > +             /* We release the page in page_handle_poison. */
> > +             put_page(page);
> > +}
> > +
> > +/* Obtain the lock on page, remove all ptes. */
> > +static int migrate_page_unmap(new_page_t get_new_page, free_page_t put_new_page,
> > +                           unsigned long private, struct page *page,
> > +                           struct page **newpagep, int force,
> > +                           enum migrate_mode mode, enum migrate_reason reason,
> > +                           struct list_head *ret)
> >  {
> > -     int rc = MIGRATEPAGE_SUCCESS;
> > +     int rc = MIGRATEPAGE_UNMAP;
> >       struct page *newpage = NULL;
> >
> >       if (!thp_migration_supported() && PageTransHuge(page))
> > @@ -1151,19 +1210,48 @@ static int unmap_and_move(new_page_t get_new_page,
> >               ClearPageActive(page);
> >               ClearPageUnevictable(page);
> >               /* free_pages_prepare() will clear PG_isolated. */
> > -             goto out;
> > +             list_del(&page->lru);
> > +             migrate_page_done(page, reason);
> > +             return MIGRATEPAGE_SUCCESS;
> >       }
> >
> >       newpage = get_new_page(page, private);
> >       if (!newpage)
> >               return -ENOMEM;
> > +     *newpagep = newpage;
> >
> > -     newpage->private = 0;
> > -     rc = __unmap_and_move(page, newpage, force, mode);
> > +     rc = __migrate_page_unmap(page, newpage, force, mode);
> > +     if (rc == MIGRATEPAGE_UNMAP)
> > +             return rc;
> > +
> > +     /*
> > +      * A page that has not been migrated will have kept its
> > +      * references and be restored.
> > +      */
> > +     /* restore the page to right list. */
> > +     if (rc != -EAGAIN)
> > +             list_move_tail(&page->lru, ret);
> > +
> > +     if (put_new_page)
> > +             put_new_page(newpage, private);
> > +     else
> > +             put_page(newpage);
> > +
> > +     return rc;
> > +}
> > +
> > +/* Migrate the page to the newly allocated page in newpage. */
> > +static int migrate_page_move(free_page_t put_new_page, unsigned long private,
> > +                          struct page *page, struct page *newpage,
> > +                          enum migrate_mode mode, enum migrate_reason reason,
> > +                          struct list_head *ret)
> > +{
> > +     int rc;
> > +
> > +     rc = __migrate_page_move(page, newpage, mode);
> >       if (rc == MIGRATEPAGE_SUCCESS)
> >               set_page_owner_migrate_reason(newpage, reason);
> >
> > -out:
> >       if (rc != -EAGAIN) {
> >               /*
> >                * A page that has been migrated has all references
> > @@ -1179,20 +1267,7 @@ static int unmap_and_move(new_page_t get_new_page,
> >        * we want to retry.
> >        */
> >       if (rc == MIGRATEPAGE_SUCCESS) {
> > -             /*
> > -              * Compaction can migrate also non-LRU pages which are
> > -              * not accounted to NR_ISOLATED_*. They can be recognized
> > -              * as __PageMovable
> > -              */
> > -             if (likely(!__PageMovable(page)))
> > -                     mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
> > -                                     page_is_file_lru(page), -thp_nr_pages(page));
> > -
> > -             if (reason != MR_MEMORY_FAILURE)
> > -                     /*
> > -                      * We release the page in page_handle_poison.
> > -                      */
> > -                     put_page(page);
> > +             migrate_page_done(page, reason);
> >       } else {
> >               if (rc != -EAGAIN)
> >                       list_add_tail(&page->lru, ret);
> > @@ -1405,6 +1480,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
> >       int pass = 0;
> >       bool is_thp = false;
> >       struct page *page;
> > +     struct page *newpage = NULL;
> >       struct page *page2;
> >       int rc, nr_subpages;
> >       LIST_HEAD(ret_pages);
> > @@ -1493,9 +1569,13 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
> >                       if (PageHuge(page))
> >                               continue;
> >
> > -                     rc = unmap_and_move(get_new_page, put_new_page,
> > -                                             private, page, pass > 2, mode,
> > +                     rc = migrate_page_unmap(get_new_page, put_new_page, private,
> > +                                             page, &newpage, pass > 2, mode,
> >                                               reason, &ret_pages);
> > +                     if (rc == MIGRATEPAGE_UNMAP)
> > +                             rc = migrate_page_move(put_new_page, private,
> > +                                                    page, newpage, mode,
> > +                                                    reason, &ret_pages);
> >                       /*
> >                        * The rules are:
> >                        *      Success: page will be freed
Alistair Popple Sept. 27, 2022, 12:02 a.m. UTC | #6
Yang Shi <shy828301@gmail.com> writes:

> On Mon, Sep 26, 2022 at 2:37 AM Alistair Popple <apopple@nvidia.com> wrote:
>>
>>
>> Huang Ying <ying.huang@intel.com> writes:
>>
>> > This is a preparation patch to batch the page unmapping and moving
>> > for the normal pages and THP.
>> >
>> > In this patch, unmap_and_move() is split to migrate_page_unmap() and
>> > migrate_page_move().  So, we can batch _unmap() and _move() in
>> > different loops later.  To pass some information between unmap and
>> > move, the original unused newpage->mapping and newpage->private are
>> > used.
>>
>> This looks like it could cause a deadlock between two threads migrating
>> the same pages if force == true && mode != MIGRATE_ASYNC as
>> migrate_page_unmap() will call lock_page() while holding the lock on
>> other pages in the list. Therefore the two threads could deadlock if the
>> pages are in a different order.
>
> It seems unlikely to me since the page has to be isolated from lru
> before migration. The isolating from lru is atomic, so the two threads
> unlikely see the same pages on both lists.

Oh thanks! That is a good point and I agree since lru isolation is
atomic the two threads won't see the same pages. migrate_vma_setup()
does LRU isolation after locking the page which is why the potential
exists there. We could potentially switch that around but given
ZONE_DEVICE pages aren't on an lru it wouldn't help much.

> But there might be other cases which may incur deadlock, for example,
> filesystem writeback IIUC. Some filesystems may lock a bunch of pages
> then write them back in a batch. The same pages may be on the
> migration list and they are also dirty and seen by writeback. I'm not
> sure whether I miss something that could prevent such a deadlock from
> happening.

I'm not overly familiar with that area but I would assume any filesystem
code doing this would already have to deal with deadlock potential.

>>
>> > Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
>> > Cc: Zi Yan <ziy@nvidia.com>
>> > Cc: Yang Shi <shy828301@gmail.com>
>> > Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
>> > Cc: Oscar Salvador <osalvador@suse.de>
>> > Cc: Matthew Wilcox <willy@infradead.org>
>> > ---
>> >  mm/migrate.c | 164 ++++++++++++++++++++++++++++++++++++++-------------
>> >  1 file changed, 122 insertions(+), 42 deletions(-)
>> >
>> > diff --git a/mm/migrate.c b/mm/migrate.c
>> > index 117134f1c6dc..4a81e0bfdbcd 100644
>> > --- a/mm/migrate.c
>> > +++ b/mm/migrate.c
>> > @@ -976,13 +976,32 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
>> >       return rc;
>> >  }
>> >
>> > -static int __unmap_and_move(struct page *page, struct page *newpage,
>> > +static void __migrate_page_record(struct page *newpage,
>> > +                               int page_was_mapped,
>> > +                               struct anon_vma *anon_vma)
>> > +{
>> > +     newpage->mapping = (struct address_space *)anon_vma;
>> > +     newpage->private = page_was_mapped;
>> > +}
>> > +
>> > +static void __migrate_page_extract(struct page *newpage,
>> > +                                int *page_was_mappedp,
>> > +                                struct anon_vma **anon_vmap)
>> > +{
>> > +     *anon_vmap = (struct anon_vma *)newpage->mapping;
>> > +     *page_was_mappedp = newpage->private;
>> > +     newpage->mapping = NULL;
>> > +     newpage->private = 0;
>> > +}
>> > +
>> > +#define MIGRATEPAGE_UNMAP            1
>> > +
>> > +static int __migrate_page_unmap(struct page *page, struct page *newpage,
>> >                               int force, enum migrate_mode mode)
>> >  {
>> >       struct folio *folio = page_folio(page);
>> > -     struct folio *dst = page_folio(newpage);
>> >       int rc = -EAGAIN;
>> > -     bool page_was_mapped = false;
>> > +     int page_was_mapped = 0;
>> >       struct anon_vma *anon_vma = NULL;
>> >       bool is_lru = !__PageMovable(page);
>> >
>> > @@ -1058,8 +1077,8 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
>> >               goto out_unlock;
>> >
>> >       if (unlikely(!is_lru)) {
>> > -             rc = move_to_new_folio(dst, folio, mode);
>> > -             goto out_unlock_both;
>> > +             __migrate_page_record(newpage, page_was_mapped, anon_vma);
>> > +             return MIGRATEPAGE_UNMAP;
>> >       }
>> >
>> >       /*
>> > @@ -1085,11 +1104,41 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
>> >               VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
>> >                               page);
>> >               try_to_migrate(folio, 0);
>> > -             page_was_mapped = true;
>> > +             page_was_mapped = 1;
>> > +     }
>> > +
>> > +     if (!page_mapped(page)) {
>> > +             __migrate_page_record(newpage, page_was_mapped, anon_vma);
>> > +             return MIGRATEPAGE_UNMAP;
>> >       }
>> >
>> > -     if (!page_mapped(page))
>> > -             rc = move_to_new_folio(dst, folio, mode);
>> > +     if (page_was_mapped)
>> > +             remove_migration_ptes(folio, folio, false);
>> > +
>> > +out_unlock_both:
>> > +     unlock_page(newpage);
>> > +out_unlock:
>> > +     /* Drop an anon_vma reference if we took one */
>> > +     if (anon_vma)
>> > +             put_anon_vma(anon_vma);
>> > +     unlock_page(page);
>> > +out:
>> > +
>> > +     return rc;
>> > +}
>> > +
>> > +static int __migrate_page_move(struct page *page, struct page *newpage,
>> > +                            enum migrate_mode mode)
>> > +{
>> > +     struct folio *folio = page_folio(page);
>> > +     struct folio *dst = page_folio(newpage);
>> > +     int rc;
>> > +     int page_was_mapped = 0;
>> > +     struct anon_vma *anon_vma = NULL;
>> > +
>> > +     __migrate_page_extract(newpage, &page_was_mapped, &anon_vma);
>> > +
>> > +     rc = move_to_new_folio(dst, folio, mode);
>> >
>> >       /*
>> >        * When successful, push newpage to LRU immediately: so that if it
>> > @@ -1110,14 +1159,11 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
>> >               remove_migration_ptes(folio,
>> >                       rc == MIGRATEPAGE_SUCCESS ? dst : folio, false);
>> >
>> > -out_unlock_both:
>> >       unlock_page(newpage);
>> > -out_unlock:
>> >       /* Drop an anon_vma reference if we took one */
>> >       if (anon_vma)
>> >               put_anon_vma(anon_vma);
>> >       unlock_page(page);
>> > -out:
>> >       /*
>> >        * If migration is successful, decrease refcount of the newpage,
>> >        * which will not free the page because new page owner increased
>> > @@ -1129,18 +1175,31 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
>> >       return rc;
>> >  }
>> >
>> > -/*
>> > - * Obtain the lock on page, remove all ptes and migrate the page
>> > - * to the newly allocated page in newpage.
>> > - */
>> > -static int unmap_and_move(new_page_t get_new_page,
>> > -                                free_page_t put_new_page,
>> > -                                unsigned long private, struct page *page,
>> > -                                int force, enum migrate_mode mode,
>> > -                                enum migrate_reason reason,
>> > -                                struct list_head *ret)
>> > +static void migrate_page_done(struct page *page,
>> > +                           enum migrate_reason reason)
>> > +{
>> > +     /*
>> > +      * Compaction can migrate also non-LRU pages which are
>> > +      * not accounted to NR_ISOLATED_*. They can be recognized
>> > +      * as __PageMovable
>> > +      */
>> > +     if (likely(!__PageMovable(page)))
>> > +             mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
>> > +                                 page_is_file_lru(page), -thp_nr_pages(page));
>> > +
>> > +     if (reason != MR_MEMORY_FAILURE)
>> > +             /* We release the page in page_handle_poison. */
>> > +             put_page(page);
>> > +}
>> > +
>> > +/* Obtain the lock on page, remove all ptes. */
>> > +static int migrate_page_unmap(new_page_t get_new_page, free_page_t put_new_page,
>> > +                           unsigned long private, struct page *page,
>> > +                           struct page **newpagep, int force,
>> > +                           enum migrate_mode mode, enum migrate_reason reason,
>> > +                           struct list_head *ret)
>> >  {
>> > -     int rc = MIGRATEPAGE_SUCCESS;
>> > +     int rc = MIGRATEPAGE_UNMAP;
>> >       struct page *newpage = NULL;
>> >
>> >       if (!thp_migration_supported() && PageTransHuge(page))
>> > @@ -1151,19 +1210,48 @@ static int unmap_and_move(new_page_t get_new_page,
>> >               ClearPageActive(page);
>> >               ClearPageUnevictable(page);
>> >               /* free_pages_prepare() will clear PG_isolated. */
>> > -             goto out;
>> > +             list_del(&page->lru);
>> > +             migrate_page_done(page, reason);
>> > +             return MIGRATEPAGE_SUCCESS;
>> >       }
>> >
>> >       newpage = get_new_page(page, private);
>> >       if (!newpage)
>> >               return -ENOMEM;
>> > +     *newpagep = newpage;
>> >
>> > -     newpage->private = 0;
>> > -     rc = __unmap_and_move(page, newpage, force, mode);
>> > +     rc = __migrate_page_unmap(page, newpage, force, mode);
>> > +     if (rc == MIGRATEPAGE_UNMAP)
>> > +             return rc;
>> > +
>> > +     /*
>> > +      * A page that has not been migrated will have kept its
>> > +      * references and be restored.
>> > +      */
>> > +     /* restore the page to right list. */
>> > +     if (rc != -EAGAIN)
>> > +             list_move_tail(&page->lru, ret);
>> > +
>> > +     if (put_new_page)
>> > +             put_new_page(newpage, private);
>> > +     else
>> > +             put_page(newpage);
>> > +
>> > +     return rc;
>> > +}
>> > +
>> > +/* Migrate the page to the newly allocated page in newpage. */
>> > +static int migrate_page_move(free_page_t put_new_page, unsigned long private,
>> > +                          struct page *page, struct page *newpage,
>> > +                          enum migrate_mode mode, enum migrate_reason reason,
>> > +                          struct list_head *ret)
>> > +{
>> > +     int rc;
>> > +
>> > +     rc = __migrate_page_move(page, newpage, mode);
>> >       if (rc == MIGRATEPAGE_SUCCESS)
>> >               set_page_owner_migrate_reason(newpage, reason);
>> >
>> > -out:
>> >       if (rc != -EAGAIN) {
>> >               /*
>> >                * A page that has been migrated has all references
>> > @@ -1179,20 +1267,7 @@ static int unmap_and_move(new_page_t get_new_page,
>> >        * we want to retry.
>> >        */
>> >       if (rc == MIGRATEPAGE_SUCCESS) {
>> > -             /*
>> > -              * Compaction can migrate also non-LRU pages which are
>> > -              * not accounted to NR_ISOLATED_*. They can be recognized
>> > -              * as __PageMovable
>> > -              */
>> > -             if (likely(!__PageMovable(page)))
>> > -                     mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
>> > -                                     page_is_file_lru(page), -thp_nr_pages(page));
>> > -
>> > -             if (reason != MR_MEMORY_FAILURE)
>> > -                     /*
>> > -                      * We release the page in page_handle_poison.
>> > -                      */
>> > -                     put_page(page);
>> > +             migrate_page_done(page, reason);
>> >       } else {
>> >               if (rc != -EAGAIN)
>> >                       list_add_tail(&page->lru, ret);
>> > @@ -1405,6 +1480,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
>> >       int pass = 0;
>> >       bool is_thp = false;
>> >       struct page *page;
>> > +     struct page *newpage = NULL;
>> >       struct page *page2;
>> >       int rc, nr_subpages;
>> >       LIST_HEAD(ret_pages);
>> > @@ -1493,9 +1569,13 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
>> >                       if (PageHuge(page))
>> >                               continue;
>> >
>> > -                     rc = unmap_and_move(get_new_page, put_new_page,
>> > -                                             private, page, pass > 2, mode,
>> > +                     rc = migrate_page_unmap(get_new_page, put_new_page, private,
>> > +                                             page, &newpage, pass > 2, mode,
>> >                                               reason, &ret_pages);
>> > +                     if (rc == MIGRATEPAGE_UNMAP)
>> > +                             rc = migrate_page_move(put_new_page, private,
>> > +                                                    page, newpage, mode,
>> > +                                                    reason, &ret_pages);
>> >                       /*
>> >                        * The rules are:
>> >                        *      Success: page will be freed
Huang, Ying Sept. 27, 2022, 1:51 a.m. UTC | #7
Alistair Popple <apopple@nvidia.com> writes:

> Yang Shi <shy828301@gmail.com> writes:
>
>> On Mon, Sep 26, 2022 at 2:37 AM Alistair Popple <apopple@nvidia.com> wrote:
>>>
>>>
>>> Huang Ying <ying.huang@intel.com> writes:
>>>
>>> > This is a preparation patch to batch the page unmapping and moving
>>> > for the normal pages and THP.
>>> >
>>> > In this patch, unmap_and_move() is split to migrate_page_unmap() and
>>> > migrate_page_move().  So, we can batch _unmap() and _move() in
>>> > different loops later.  To pass some information between unmap and
>>> > move, the original unused newpage->mapping and newpage->private are
>>> > used.
>>>
>>> This looks like it could cause a deadlock between two threads migrating
>>> the same pages if force == true && mode != MIGRATE_ASYNC as
>>> migrate_page_unmap() will call lock_page() while holding the lock on
>>> other pages in the list. Therefore the two threads could deadlock if the
>>> pages are in a different order.
>>
>> It seems unlikely to me since the page has to be isolated from lru
>> before migration. The isolating from lru is atomic, so the two threads
>> unlikely see the same pages on both lists.
>
> Oh thanks! That is a good point and I agree since lru isolation is
> atomic the two threads won't see the same pages. migrate_vma_setup()
> does LRU isolation after locking the page which is why the potential
> exists there. We could potentially switch that around but given
> ZONE_DEVICE pages aren't on an lru it wouldn't help much.
>
>> But there might be other cases which may incur deadlock, for example,
>> filesystem writeback IIUC. Some filesystems may lock a bunch of pages
>> then write them back in a batch. The same pages may be on the
>> migration list and they are also dirty and seen by writeback. I'm not
>> sure whether I miss something that could prevent such a deadlock from
>> happening.
>
> I'm not overly familiar with that area but I would assume any filesystem
> code doing this would already have to deal with deadlock potential.

Thank you very much for pointing this out.  I think the deadlock is a
real issue.  Anyway, we shouldn't forbid other places in kernel to lock
2 pages at the same time.

The simplest solution is to batch page migration only if mode ==
MIGRATE_ASYNC.  Then we may consider to fall back to non-batch mode if
mode != MIGRATE_ASYNC and trylock page fails.

Best Regards,
Huang, Ying

[snip]
John Hubbard Sept. 27, 2022, 8:34 p.m. UTC | #8
On 9/26/22 18:51, Huang, Ying wrote:
>>> But there might be other cases which may incur deadlock, for example,
>>> filesystem writeback IIUC. Some filesystems may lock a bunch of pages
>>> then write them back in a batch. The same pages may be on the
>>> migration list and they are also dirty and seen by writeback. I'm not
>>> sure whether I miss something that could prevent such a deadlock from
>>> happening.
>>
>> I'm not overly familiar with that area but I would assume any filesystem
>> code doing this would already have to deal with deadlock potential.
> 
> Thank you very much for pointing this out.  I think the deadlock is a
> real issue.  Anyway, we shouldn't forbid other places in kernel to lock
> 2 pages at the same time.
> 

I also agree that we cannot make any rules such as "do not lock > 1 page
at the same time, elsewhere in the kernel", because it is already
happening, for example in page-writeback.c, which locks PAGEVEC_SIZE
(15) pages per batch [1].

The only deadlock prevention convention that I see is the convention of
locking the pages in order of ascending address. That only helps if
everything does it that way, and migrate code definitely does not.
However...I thought that up until now, at least, the migrate code relied
on trylock (which can fail, and so migration can fail, too), to avoid
deadlock. Is that changing somehow, I didn't see it?


[1] https://elixir.bootlin.com/linux/latest/source/mm/page-writeback.c#L2296

thanks,
Yang Shi Sept. 27, 2022, 8:54 p.m. UTC | #9
On Mon, Sep 26, 2022 at 5:14 PM Alistair Popple <apopple@nvidia.com> wrote:
>
>
> Yang Shi <shy828301@gmail.com> writes:
>
> > On Mon, Sep 26, 2022 at 2:37 AM Alistair Popple <apopple@nvidia.com> wrote:
> >>
> >>
> >> Huang Ying <ying.huang@intel.com> writes:
> >>
> >> > This is a preparation patch to batch the page unmapping and moving
> >> > for the normal pages and THP.
> >> >
> >> > In this patch, unmap_and_move() is split to migrate_page_unmap() and
> >> > migrate_page_move().  So, we can batch _unmap() and _move() in
> >> > different loops later.  To pass some information between unmap and
> >> > move, the original unused newpage->mapping and newpage->private are
> >> > used.
> >>
> >> This looks like it could cause a deadlock between two threads migrating
> >> the same pages if force == true && mode != MIGRATE_ASYNC as
> >> migrate_page_unmap() will call lock_page() while holding the lock on
> >> other pages in the list. Therefore the two threads could deadlock if the
> >> pages are in a different order.
> >
> > It seems unlikely to me since the page has to be isolated from lru
> > before migration. The isolating from lru is atomic, so the two threads
> > unlikely see the same pages on both lists.
>
> Oh thanks! That is a good point and I agree since lru isolation is
> atomic the two threads won't see the same pages. migrate_vma_setup()
> does LRU isolation after locking the page which is why the potential
> exists there. We could potentially switch that around but given
> ZONE_DEVICE pages aren't on an lru it wouldn't help much.

Aha, I see. It has a different lock - isolation order from regular pages.

>
> > But there might be other cases which may incur deadlock, for example,
> > filesystem writeback IIUC. Some filesystems may lock a bunch of pages
> > then write them back in a batch. The same pages may be on the
> > migration list and they are also dirty and seen by writeback. I'm not
> > sure whether I miss something that could prevent such a deadlock from
> > happening.
>
> I'm not overly familiar with that area but I would assume any filesystem
> code doing this would already have to deal with deadlock potential.

AFAIK, actually not IIUC. For example, write back just simply look up
page cache and lock them one by one.

>
> >>
> >> > Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
> >> > Cc: Zi Yan <ziy@nvidia.com>
> >> > Cc: Yang Shi <shy828301@gmail.com>
> >> > Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
> >> > Cc: Oscar Salvador <osalvador@suse.de>
> >> > Cc: Matthew Wilcox <willy@infradead.org>
> >> > ---
> >> >  mm/migrate.c | 164 ++++++++++++++++++++++++++++++++++++++-------------
> >> >  1 file changed, 122 insertions(+), 42 deletions(-)
> >> >
> >> > diff --git a/mm/migrate.c b/mm/migrate.c
> >> > index 117134f1c6dc..4a81e0bfdbcd 100644
> >> > --- a/mm/migrate.c
> >> > +++ b/mm/migrate.c
> >> > @@ -976,13 +976,32 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
> >> >       return rc;
> >> >  }
> >> >
> >> > -static int __unmap_and_move(struct page *page, struct page *newpage,
> >> > +static void __migrate_page_record(struct page *newpage,
> >> > +                               int page_was_mapped,
> >> > +                               struct anon_vma *anon_vma)
> >> > +{
> >> > +     newpage->mapping = (struct address_space *)anon_vma;
> >> > +     newpage->private = page_was_mapped;
> >> > +}
> >> > +
> >> > +static void __migrate_page_extract(struct page *newpage,
> >> > +                                int *page_was_mappedp,
> >> > +                                struct anon_vma **anon_vmap)
> >> > +{
> >> > +     *anon_vmap = (struct anon_vma *)newpage->mapping;
> >> > +     *page_was_mappedp = newpage->private;
> >> > +     newpage->mapping = NULL;
> >> > +     newpage->private = 0;
> >> > +}
> >> > +
> >> > +#define MIGRATEPAGE_UNMAP            1
> >> > +
> >> > +static int __migrate_page_unmap(struct page *page, struct page *newpage,
> >> >                               int force, enum migrate_mode mode)
> >> >  {
> >> >       struct folio *folio = page_folio(page);
> >> > -     struct folio *dst = page_folio(newpage);
> >> >       int rc = -EAGAIN;
> >> > -     bool page_was_mapped = false;
> >> > +     int page_was_mapped = 0;
> >> >       struct anon_vma *anon_vma = NULL;
> >> >       bool is_lru = !__PageMovable(page);
> >> >
> >> > @@ -1058,8 +1077,8 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
> >> >               goto out_unlock;
> >> >
> >> >       if (unlikely(!is_lru)) {
> >> > -             rc = move_to_new_folio(dst, folio, mode);
> >> > -             goto out_unlock_both;
> >> > +             __migrate_page_record(newpage, page_was_mapped, anon_vma);
> >> > +             return MIGRATEPAGE_UNMAP;
> >> >       }
> >> >
> >> >       /*
> >> > @@ -1085,11 +1104,41 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
> >> >               VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
> >> >                               page);
> >> >               try_to_migrate(folio, 0);
> >> > -             page_was_mapped = true;
> >> > +             page_was_mapped = 1;
> >> > +     }
> >> > +
> >> > +     if (!page_mapped(page)) {
> >> > +             __migrate_page_record(newpage, page_was_mapped, anon_vma);
> >> > +             return MIGRATEPAGE_UNMAP;
> >> >       }
> >> >
> >> > -     if (!page_mapped(page))
> >> > -             rc = move_to_new_folio(dst, folio, mode);
> >> > +     if (page_was_mapped)
> >> > +             remove_migration_ptes(folio, folio, false);
> >> > +
> >> > +out_unlock_both:
> >> > +     unlock_page(newpage);
> >> > +out_unlock:
> >> > +     /* Drop an anon_vma reference if we took one */
> >> > +     if (anon_vma)
> >> > +             put_anon_vma(anon_vma);
> >> > +     unlock_page(page);
> >> > +out:
> >> > +
> >> > +     return rc;
> >> > +}
> >> > +
> >> > +static int __migrate_page_move(struct page *page, struct page *newpage,
> >> > +                            enum migrate_mode mode)
> >> > +{
> >> > +     struct folio *folio = page_folio(page);
> >> > +     struct folio *dst = page_folio(newpage);
> >> > +     int rc;
> >> > +     int page_was_mapped = 0;
> >> > +     struct anon_vma *anon_vma = NULL;
> >> > +
> >> > +     __migrate_page_extract(newpage, &page_was_mapped, &anon_vma);
> >> > +
> >> > +     rc = move_to_new_folio(dst, folio, mode);
> >> >
> >> >       /*
> >> >        * When successful, push newpage to LRU immediately: so that if it
> >> > @@ -1110,14 +1159,11 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
> >> >               remove_migration_ptes(folio,
> >> >                       rc == MIGRATEPAGE_SUCCESS ? dst : folio, false);
> >> >
> >> > -out_unlock_both:
> >> >       unlock_page(newpage);
> >> > -out_unlock:
> >> >       /* Drop an anon_vma reference if we took one */
> >> >       if (anon_vma)
> >> >               put_anon_vma(anon_vma);
> >> >       unlock_page(page);
> >> > -out:
> >> >       /*
> >> >        * If migration is successful, decrease refcount of the newpage,
> >> >        * which will not free the page because new page owner increased
> >> > @@ -1129,18 +1175,31 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
> >> >       return rc;
> >> >  }
> >> >
> >> > -/*
> >> > - * Obtain the lock on page, remove all ptes and migrate the page
> >> > - * to the newly allocated page in newpage.
> >> > - */
> >> > -static int unmap_and_move(new_page_t get_new_page,
> >> > -                                free_page_t put_new_page,
> >> > -                                unsigned long private, struct page *page,
> >> > -                                int force, enum migrate_mode mode,
> >> > -                                enum migrate_reason reason,
> >> > -                                struct list_head *ret)
> >> > +static void migrate_page_done(struct page *page,
> >> > +                           enum migrate_reason reason)
> >> > +{
> >> > +     /*
> >> > +      * Compaction can migrate also non-LRU pages which are
> >> > +      * not accounted to NR_ISOLATED_*. They can be recognized
> >> > +      * as __PageMovable
> >> > +      */
> >> > +     if (likely(!__PageMovable(page)))
> >> > +             mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
> >> > +                                 page_is_file_lru(page), -thp_nr_pages(page));
> >> > +
> >> > +     if (reason != MR_MEMORY_FAILURE)
> >> > +             /* We release the page in page_handle_poison. */
> >> > +             put_page(page);
> >> > +}
> >> > +
> >> > +/* Obtain the lock on page, remove all ptes. */
> >> > +static int migrate_page_unmap(new_page_t get_new_page, free_page_t put_new_page,
> >> > +                           unsigned long private, struct page *page,
> >> > +                           struct page **newpagep, int force,
> >> > +                           enum migrate_mode mode, enum migrate_reason reason,
> >> > +                           struct list_head *ret)
> >> >  {
> >> > -     int rc = MIGRATEPAGE_SUCCESS;
> >> > +     int rc = MIGRATEPAGE_UNMAP;
> >> >       struct page *newpage = NULL;
> >> >
> >> >       if (!thp_migration_supported() && PageTransHuge(page))
> >> > @@ -1151,19 +1210,48 @@ static int unmap_and_move(new_page_t get_new_page,
> >> >               ClearPageActive(page);
> >> >               ClearPageUnevictable(page);
> >> >               /* free_pages_prepare() will clear PG_isolated. */
> >> > -             goto out;
> >> > +             list_del(&page->lru);
> >> > +             migrate_page_done(page, reason);
> >> > +             return MIGRATEPAGE_SUCCESS;
> >> >       }
> >> >
> >> >       newpage = get_new_page(page, private);
> >> >       if (!newpage)
> >> >               return -ENOMEM;
> >> > +     *newpagep = newpage;
> >> >
> >> > -     newpage->private = 0;
> >> > -     rc = __unmap_and_move(page, newpage, force, mode);
> >> > +     rc = __migrate_page_unmap(page, newpage, force, mode);
> >> > +     if (rc == MIGRATEPAGE_UNMAP)
> >> > +             return rc;
> >> > +
> >> > +     /*
> >> > +      * A page that has not been migrated will have kept its
> >> > +      * references and be restored.
> >> > +      */
> >> > +     /* restore the page to right list. */
> >> > +     if (rc != -EAGAIN)
> >> > +             list_move_tail(&page->lru, ret);
> >> > +
> >> > +     if (put_new_page)
> >> > +             put_new_page(newpage, private);
> >> > +     else
> >> > +             put_page(newpage);
> >> > +
> >> > +     return rc;
> >> > +}
> >> > +
> >> > +/* Migrate the page to the newly allocated page in newpage. */
> >> > +static int migrate_page_move(free_page_t put_new_page, unsigned long private,
> >> > +                          struct page *page, struct page *newpage,
> >> > +                          enum migrate_mode mode, enum migrate_reason reason,
> >> > +                          struct list_head *ret)
> >> > +{
> >> > +     int rc;
> >> > +
> >> > +     rc = __migrate_page_move(page, newpage, mode);
> >> >       if (rc == MIGRATEPAGE_SUCCESS)
> >> >               set_page_owner_migrate_reason(newpage, reason);
> >> >
> >> > -out:
> >> >       if (rc != -EAGAIN) {
> >> >               /*
> >> >                * A page that has been migrated has all references
> >> > @@ -1179,20 +1267,7 @@ static int unmap_and_move(new_page_t get_new_page,
> >> >        * we want to retry.
> >> >        */
> >> >       if (rc == MIGRATEPAGE_SUCCESS) {
> >> > -             /*
> >> > -              * Compaction can migrate also non-LRU pages which are
> >> > -              * not accounted to NR_ISOLATED_*. They can be recognized
> >> > -              * as __PageMovable
> >> > -              */
> >> > -             if (likely(!__PageMovable(page)))
> >> > -                     mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
> >> > -                                     page_is_file_lru(page), -thp_nr_pages(page));
> >> > -
> >> > -             if (reason != MR_MEMORY_FAILURE)
> >> > -                     /*
> >> > -                      * We release the page in page_handle_poison.
> >> > -                      */
> >> > -                     put_page(page);
> >> > +             migrate_page_done(page, reason);
> >> >       } else {
> >> >               if (rc != -EAGAIN)
> >> >                       list_add_tail(&page->lru, ret);
> >> > @@ -1405,6 +1480,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
> >> >       int pass = 0;
> >> >       bool is_thp = false;
> >> >       struct page *page;
> >> > +     struct page *newpage = NULL;
> >> >       struct page *page2;
> >> >       int rc, nr_subpages;
> >> >       LIST_HEAD(ret_pages);
> >> > @@ -1493,9 +1569,13 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
> >> >                       if (PageHuge(page))
> >> >                               continue;
> >> >
> >> > -                     rc = unmap_and_move(get_new_page, put_new_page,
> >> > -                                             private, page, pass > 2, mode,
> >> > +                     rc = migrate_page_unmap(get_new_page, put_new_page, private,
> >> > +                                             page, &newpage, pass > 2, mode,
> >> >                                               reason, &ret_pages);
> >> > +                     if (rc == MIGRATEPAGE_UNMAP)
> >> > +                             rc = migrate_page_move(put_new_page, private,
> >> > +                                                    page, newpage, mode,
> >> > +                                                    reason, &ret_pages);
> >> >                       /*
> >> >                        * The rules are:
> >> >                        *      Success: page will be freed
Yang Shi Sept. 27, 2022, 8:56 p.m. UTC | #10
On Mon, Sep 26, 2022 at 6:52 PM Huang, Ying <ying.huang@intel.com> wrote:
>
> Alistair Popple <apopple@nvidia.com> writes:
>
> > Yang Shi <shy828301@gmail.com> writes:
> >
> >> On Mon, Sep 26, 2022 at 2:37 AM Alistair Popple <apopple@nvidia.com> wrote:
> >>>
> >>>
> >>> Huang Ying <ying.huang@intel.com> writes:
> >>>
> >>> > This is a preparation patch to batch the page unmapping and moving
> >>> > for the normal pages and THP.
> >>> >
> >>> > In this patch, unmap_and_move() is split to migrate_page_unmap() and
> >>> > migrate_page_move().  So, we can batch _unmap() and _move() in
> >>> > different loops later.  To pass some information between unmap and
> >>> > move, the original unused newpage->mapping and newpage->private are
> >>> > used.
> >>>
> >>> This looks like it could cause a deadlock between two threads migrating
> >>> the same pages if force == true && mode != MIGRATE_ASYNC as
> >>> migrate_page_unmap() will call lock_page() while holding the lock on
> >>> other pages in the list. Therefore the two threads could deadlock if the
> >>> pages are in a different order.
> >>
> >> It seems unlikely to me since the page has to be isolated from lru
> >> before migration. The isolating from lru is atomic, so the two threads
> >> unlikely see the same pages on both lists.
> >
> > Oh thanks! That is a good point and I agree since lru isolation is
> > atomic the two threads won't see the same pages. migrate_vma_setup()
> > does LRU isolation after locking the page which is why the potential
> > exists there. We could potentially switch that around but given
> > ZONE_DEVICE pages aren't on an lru it wouldn't help much.
> >
> >> But there might be other cases which may incur deadlock, for example,
> >> filesystem writeback IIUC. Some filesystems may lock a bunch of pages
> >> then write them back in a batch. The same pages may be on the
> >> migration list and they are also dirty and seen by writeback. I'm not
> >> sure whether I miss something that could prevent such a deadlock from
> >> happening.
> >
> > I'm not overly familiar with that area but I would assume any filesystem
> > code doing this would already have to deal with deadlock potential.
>
> Thank you very much for pointing this out.  I think the deadlock is a
> real issue.  Anyway, we shouldn't forbid other places in kernel to lock
> 2 pages at the same time.
>
> The simplest solution is to batch page migration only if mode ==
> MIGRATE_ASYNC.  Then we may consider to fall back to non-batch mode if
> mode != MIGRATE_ASYNC and trylock page fails.

Seems like so...

>
> Best Regards,
> Huang, Ying
>
> [snip]
Yang Shi Sept. 27, 2022, 8:57 p.m. UTC | #11
On Tue, Sep 27, 2022 at 1:35 PM John Hubbard <jhubbard@nvidia.com> wrote:
>
> On 9/26/22 18:51, Huang, Ying wrote:
> >>> But there might be other cases which may incur deadlock, for example,
> >>> filesystem writeback IIUC. Some filesystems may lock a bunch of pages
> >>> then write them back in a batch. The same pages may be on the
> >>> migration list and they are also dirty and seen by writeback. I'm not
> >>> sure whether I miss something that could prevent such a deadlock from
> >>> happening.
> >>
> >> I'm not overly familiar with that area but I would assume any filesystem
> >> code doing this would already have to deal with deadlock potential.
> >
> > Thank you very much for pointing this out.  I think the deadlock is a
> > real issue.  Anyway, we shouldn't forbid other places in kernel to lock
> > 2 pages at the same time.
> >
>
> I also agree that we cannot make any rules such as "do not lock > 1 page
> at the same time, elsewhere in the kernel", because it is already
> happening, for example in page-writeback.c, which locks PAGEVEC_SIZE
> (15) pages per batch [1].
>
> The only deadlock prevention convention that I see is the convention of
> locking the pages in order of ascending address. That only helps if
> everything does it that way, and migrate code definitely does not.
> However...I thought that up until now, at least, the migrate code relied
> on trylock (which can fail, and so migration can fail, too), to avoid
> deadlock. Is that changing somehow, I didn't see it?

The trylock is used by async mode which does try to avoid blocking.
But sync mode does use lock. The current implementation of migration
does migrate one page at a time, so it is not a problem.

>
>
> [1] https://elixir.bootlin.com/linux/latest/source/mm/page-writeback.c#L2296
>
> thanks,
>
> --
> John Hubbard
> NVIDIA
>
> > The simplest solution is to batch page migration only if mode ==
> > MIGRATE_ASYNC.  Then we may consider to fall back to non-batch mode if
> > mode != MIGRATE_ASYNC and trylock page fails.
> >
>
>
Alistair Popple Sept. 28, 2022, 12:59 a.m. UTC | #12
Yang Shi <shy828301@gmail.com> writes:

> On Tue, Sep 27, 2022 at 1:35 PM John Hubbard <jhubbard@nvidia.com> wrote:
>>
>> On 9/26/22 18:51, Huang, Ying wrote:
>> >>> But there might be other cases which may incur deadlock, for example,
>> >>> filesystem writeback IIUC. Some filesystems may lock a bunch of pages
>> >>> then write them back in a batch. The same pages may be on the
>> >>> migration list and they are also dirty and seen by writeback. I'm not
>> >>> sure whether I miss something that could prevent such a deadlock from
>> >>> happening.
>> >>
>> >> I'm not overly familiar with that area but I would assume any filesystem
>> >> code doing this would already have to deal with deadlock potential.
>> >
>> > Thank you very much for pointing this out.  I think the deadlock is a
>> > real issue.  Anyway, we shouldn't forbid other places in kernel to lock
>> > 2 pages at the same time.
>> >
>>
>> I also agree that we cannot make any rules such as "do not lock > 1 page
>> at the same time, elsewhere in the kernel", because it is already
>> happening, for example in page-writeback.c, which locks PAGEVEC_SIZE
>> (15) pages per batch [1].

That's not really the case though. The inner loop of write_cache_page()
only ever locks one page at a time, either directly via the
unlock_page() on L2338 (those goto's are amazing) or indirectly via
(*writepage)() on L2359.

So there's no deadlock potential there because unlocking any previously
locked page(s) doesn't depend on obtaining the lock for another page.
Unless I've missed something?

>> The only deadlock prevention convention that I see is the convention of
>> locking the pages in order of ascending address. That only helps if
>> everything does it that way, and migrate code definitely does not.
>> However...I thought that up until now, at least, the migrate code relied
>> on trylock (which can fail, and so migration can fail, too), to avoid
>> deadlock. Is that changing somehow, I didn't see it?
>
> The trylock is used by async mode which does try to avoid blocking.
> But sync mode does use lock. The current implementation of migration
> does migrate one page at a time, so it is not a problem.
>
>>
>>
>> [1] https://elixir.bootlin.com/linux/latest/source/mm/page-writeback.c#L2296
>>
>> thanks,
>>
>> --
>> John Hubbard
>> NVIDIA
>>
>> > The simplest solution is to batch page migration only if mode ==
>> > MIGRATE_ASYNC.  Then we may consider to fall back to non-batch mode if
>> > mode != MIGRATE_ASYNC and trylock page fails.
>> >
>>
>>
Huang, Ying Sept. 28, 2022, 1:41 a.m. UTC | #13
Alistair Popple <apopple@nvidia.com> writes:

> Yang Shi <shy828301@gmail.com> writes:
>
>> On Tue, Sep 27, 2022 at 1:35 PM John Hubbard <jhubbard@nvidia.com> wrote:
>>>
>>> On 9/26/22 18:51, Huang, Ying wrote:
>>> >>> But there might be other cases which may incur deadlock, for example,
>>> >>> filesystem writeback IIUC. Some filesystems may lock a bunch of pages
>>> >>> then write them back in a batch. The same pages may be on the
>>> >>> migration list and they are also dirty and seen by writeback. I'm not
>>> >>> sure whether I miss something that could prevent such a deadlock from
>>> >>> happening.
>>> >>
>>> >> I'm not overly familiar with that area but I would assume any filesystem
>>> >> code doing this would already have to deal with deadlock potential.
>>> >
>>> > Thank you very much for pointing this out.  I think the deadlock is a
>>> > real issue.  Anyway, we shouldn't forbid other places in kernel to lock
>>> > 2 pages at the same time.
>>> >
>>>
>>> I also agree that we cannot make any rules such as "do not lock > 1 page
>>> at the same time, elsewhere in the kernel", because it is already
>>> happening, for example in page-writeback.c, which locks PAGEVEC_SIZE
>>> (15) pages per batch [1].
>
> That's not really the case though. The inner loop of write_cache_page()
> only ever locks one page at a time, either directly via the
> unlock_page() on L2338 (those goto's are amazing) or indirectly via
> (*writepage)() on L2359.
>
> So there's no deadlock potential there because unlocking any previously
> locked page(s) doesn't depend on obtaining the lock for another page.
> Unless I've missed something?

Yes.  This is my understanding too after checking ext4_writepage().

Best Regards,
Huang, Ying

>>> The only deadlock prevention convention that I see is the convention of
>>> locking the pages in order of ascending address. That only helps if
>>> everything does it that way, and migrate code definitely does not.
>>> However...I thought that up until now, at least, the migrate code relied
>>> on trylock (which can fail, and so migration can fail, too), to avoid
>>> deadlock. Is that changing somehow, I didn't see it?
>>
>> The trylock is used by async mode which does try to avoid blocking.
>> But sync mode does use lock. The current implementation of migration
>> does migrate one page at a time, so it is not a problem.
>>
>>>
>>>
>>> [1] https://elixir.bootlin.com/linux/latest/source/mm/page-writeback.c#L2296
>>>
>>> thanks,
>>>
>>> --
>>> John Hubbard
>>> NVIDIA
>>>
>>> > The simplest solution is to batch page migration only if mode ==
>>> > MIGRATE_ASYNC.  Then we may consider to fall back to non-batch mode if
>>> > mode != MIGRATE_ASYNC and trylock page fails.
>>> >
>>>
>>>
John Hubbard Sept. 28, 2022, 1:44 a.m. UTC | #14
On 9/27/22 18:41, Huang, Ying wrote:
>>>> I also agree that we cannot make any rules such as "do not lock > 1 page
>>>> at the same time, elsewhere in the kernel", because it is already
>>>> happening, for example in page-writeback.c, which locks PAGEVEC_SIZE
>>>> (15) pages per batch [1].
>>
>> That's not really the case though. The inner loop of write_cache_page()
>> only ever locks one page at a time, either directly via the
>> unlock_page() on L2338 (those goto's are amazing) or indirectly via
>> (*writepage)() on L2359.
>>
>> So there's no deadlock potential there because unlocking any previously
>> locked page(s) doesn't depend on obtaining the lock for another page.
>> Unless I've missed something?
> 
> Yes.  This is my understanding too after checking ext4_writepage().
> 

Yes, I missed the ".writepage() shall unlock the page" design point. Now
it seems much more reasonable and safer. :)

thanks,
Yang Shi Sept. 28, 2022, 1:49 a.m. UTC | #15
On Tue, Sep 27, 2022 at 6:45 PM John Hubbard <jhubbard@nvidia.com> wrote:
>
> On 9/27/22 18:41, Huang, Ying wrote:
> >>>> I also agree that we cannot make any rules such as "do not lock > 1 page
> >>>> at the same time, elsewhere in the kernel", because it is already
> >>>> happening, for example in page-writeback.c, which locks PAGEVEC_SIZE
> >>>> (15) pages per batch [1].
> >>
> >> That's not really the case though. The inner loop of write_cache_page()
> >> only ever locks one page at a time, either directly via the
> >> unlock_page() on L2338 (those goto's are amazing) or indirectly via
> >> (*writepage)() on L2359.
> >>
> >> So there's no deadlock potential there because unlocking any previously
> >> locked page(s) doesn't depend on obtaining the lock for another page.
> >> Unless I've missed something?
> >
> > Yes.  This is my understanding too after checking ext4_writepage().
> >
>
> Yes, I missed the ".writepage() shall unlock the page" design point. Now
> it seems much more reasonable and safer. :)

.writepage is deprecated (see
https://lore.kernel.org/linux-fsdevel/20220719041311.709250-1-hch@lst.de/),
write back actually uses .writepages.

>
> thanks,
>
> --
> John Hubbard
> NVIDIA
>
John Hubbard Sept. 28, 2022, 1:56 a.m. UTC | #16
On 9/27/22 18:49, Yang Shi wrote:
> On Tue, Sep 27, 2022 at 6:45 PM John Hubbard <jhubbard@nvidia.com> wrote:
>>
>> On 9/27/22 18:41, Huang, Ying wrote:
>>>>>> I also agree that we cannot make any rules such as "do not lock > 1 page
>>>>>> at the same time, elsewhere in the kernel", because it is already
>>>>>> happening, for example in page-writeback.c, which locks PAGEVEC_SIZE
>>>>>> (15) pages per batch [1].
>>>>
>>>> That's not really the case though. The inner loop of write_cache_page()
>>>> only ever locks one page at a time, either directly via the
>>>> unlock_page() on L2338 (those goto's are amazing) or indirectly via
>>>> (*writepage)() on L2359.
>>>>
>>>> So there's no deadlock potential there because unlocking any previously
>>>> locked page(s) doesn't depend on obtaining the lock for another page.
>>>> Unless I've missed something?
>>>
>>> Yes.  This is my understanding too after checking ext4_writepage().
>>>
>>
>> Yes, I missed the ".writepage() shall unlock the page" design point. Now
>> it seems much more reasonable and safer. :)
> 
> .writepage is deprecated (see
> https://lore.kernel.org/linux-fsdevel/20220719041311.709250-1-hch@lst.de/),
> write back actually uses .writepages.

write_cache_pages() seems to directly call it, though:

generic_writepages()
  write_cache_pages(__writepage)
    __writepage()
      mapping->a_ops->writepage(page, wbc)

So it seems like it's still alive and well. And in any case, it is definitely
passing one page at a time from write_cache_pages(), right?


 thanks,
Yang Shi Sept. 28, 2022, 2:14 a.m. UTC | #17
On Tue, Sep 27, 2022 at 6:56 PM John Hubbard <jhubbard@nvidia.com> wrote:
>
> On 9/27/22 18:49, Yang Shi wrote:
> > On Tue, Sep 27, 2022 at 6:45 PM John Hubbard <jhubbard@nvidia.com> wrote:
> >>
> >> On 9/27/22 18:41, Huang, Ying wrote:
> >>>>>> I also agree that we cannot make any rules such as "do not lock > 1 page
> >>>>>> at the same time, elsewhere in the kernel", because it is already
> >>>>>> happening, for example in page-writeback.c, which locks PAGEVEC_SIZE
> >>>>>> (15) pages per batch [1].
> >>>>
> >>>> That's not really the case though. The inner loop of write_cache_page()
> >>>> only ever locks one page at a time, either directly via the
> >>>> unlock_page() on L2338 (those goto's are amazing) or indirectly via
> >>>> (*writepage)() on L2359.
> >>>>
> >>>> So there's no deadlock potential there because unlocking any previously
> >>>> locked page(s) doesn't depend on obtaining the lock for another page.
> >>>> Unless I've missed something?
> >>>
> >>> Yes.  This is my understanding too after checking ext4_writepage().
> >>>
> >>
> >> Yes, I missed the ".writepage() shall unlock the page" design point. Now
> >> it seems much more reasonable and safer. :)
> >
> > .writepage is deprecated (see
> > https://lore.kernel.org/linux-fsdevel/20220719041311.709250-1-hch@lst.de/),
> > write back actually uses .writepages.
>
> write_cache_pages() seems to directly call it, though:
>
> generic_writepages()
>   write_cache_pages(__writepage)
>     __writepage()
>       mapping->a_ops->writepage(page, wbc)
>
> So it seems like it's still alive and well. And in any case, it is definitely
> passing one page at a time from write_cache_pages(), right?

IIRC, the writeback may not call generic_writepages. On my ext4
filesystem, the writeback call stack looks like:

@[
    ext4_writepages+1
    do_writepages+191
    __writeback_single_inode+65
    writeback_sb_inodes+477
    __writeback_inodes_wb+76
    wb_writeback+457
    wb_workfn+680
    process_one_work+485
    worker_thread+80
    kthread+231
    ret_from_fork+34
]: 2


>
>
>  thanks,
>
> --
> John Hubbard
> NVIDIA
>
John Hubbard Sept. 28, 2022, 2:57 a.m. UTC | #18
On 9/27/22 19:14, Yang Shi wrote:
> IIRC, the writeback may not call generic_writepages. On my ext4
> filesystem, the writeback call stack looks like:
> 
> @[
>     ext4_writepages+1
>     do_writepages+191
>     __writeback_single_inode+65
>     writeback_sb_inodes+477
>     __writeback_inodes_wb+76
>     wb_writeback+457
>     wb_workfn+680
>     process_one_work+485
>     worker_thread+80
>     kthread+231
>     ret_from_fork+34
> ]: 2
> 

Sure, that's fine for ext4, in that particular case, but

a) not all filesystems have .writepages hooked up. That's why
do_writepages() checks for .writepages(), and falls back to
.writepage():

	if (mapping->a_ops->writepages)
		ret = mapping->a_ops->writepages(mapping, wbc);
	else
		ret = generic_writepages(mapping, wbc);

, and

b) there are also a lot of places and ways to invoke writebacks. There
are periodic writebacks, and there are data integrity (WB_SYNC_ALL)
writebacks, and other places where a page needs to be cleaned--so, a lot
of call sites. Some of which will land on a .writepage() sometimes, even
now.

For just one example, I see migrate.c's writeout() function directly
calling writepage():

	rc = mapping->a_ops->writepage(&folio->page, &wbc);


thanks,
Yang Shi Sept. 28, 2022, 3:25 a.m. UTC | #19
On Tue, Sep 27, 2022 at 7:57 PM John Hubbard <jhubbard@nvidia.com> wrote:
>
> On 9/27/22 19:14, Yang Shi wrote:
> > IIRC, the writeback may not call generic_writepages. On my ext4
> > filesystem, the writeback call stack looks like:
> >
> > @[
> >     ext4_writepages+1
> >     do_writepages+191
> >     __writeback_single_inode+65
> >     writeback_sb_inodes+477
> >     __writeback_inodes_wb+76
> >     wb_writeback+457
> >     wb_workfn+680
> >     process_one_work+485
> >     worker_thread+80
> >     kthread+231
> >     ret_from_fork+34
> > ]: 2
> >
>
> Sure, that's fine for ext4, in that particular case, but
>
> a) not all filesystems have .writepages hooked up. That's why
> do_writepages() checks for .writepages(), and falls back to
> .writepage():
>
>         if (mapping->a_ops->writepages)
>                 ret = mapping->a_ops->writepages(mapping, wbc);
>         else
>                 ret = generic_writepages(mapping, wbc);
>
> , and
>
> b) there are also a lot of places and ways to invoke writebacks. There
> are periodic writebacks, and there are data integrity (WB_SYNC_ALL)
> writebacks, and other places where a page needs to be cleaned--so, a lot
> of call sites. Some of which will land on a .writepage() sometimes, even
> now.
>
> For just one example, I see migrate.c's writeout() function directly
> calling writepage():
>
>         rc = mapping->a_ops->writepage(&folio->page, &wbc);

Thanks, John. You are right. I think "deprecated" is inaccurate,
.writepage is actually no longer used in memory reclaim context, but
it is still used for other contexts.

Anyway I think what we tried to figure out in the first place is
whether it is possible that filesystem writeback have dead lock with
the new batch migration or not. I think the conclusion didn't change.

>
>
> thanks,
>
> --
> John Hubbard
> NVIDIA
>
Yang Shi Sept. 28, 2022, 3:39 a.m. UTC | #20
On Tue, Sep 27, 2022 at 8:25 PM Yang Shi <shy828301@gmail.com> wrote:
>
> On Tue, Sep 27, 2022 at 7:57 PM John Hubbard <jhubbard@nvidia.com> wrote:
> >
> > On 9/27/22 19:14, Yang Shi wrote:
> > > IIRC, the writeback may not call generic_writepages. On my ext4
> > > filesystem, the writeback call stack looks like:
> > >
> > > @[
> > >     ext4_writepages+1
> > >     do_writepages+191
> > >     __writeback_single_inode+65
> > >     writeback_sb_inodes+477
> > >     __writeback_inodes_wb+76
> > >     wb_writeback+457
> > >     wb_workfn+680
> > >     process_one_work+485
> > >     worker_thread+80
> > >     kthread+231
> > >     ret_from_fork+34
> > > ]: 2
> > >
> >
> > Sure, that's fine for ext4, in that particular case, but
> >
> > a) not all filesystems have .writepages hooked up. That's why
> > do_writepages() checks for .writepages(), and falls back to
> > .writepage():
> >
> >         if (mapping->a_ops->writepages)
> >                 ret = mapping->a_ops->writepages(mapping, wbc);
> >         else
> >                 ret = generic_writepages(mapping, wbc);
> >
> > , and
> >
> > b) there are also a lot of places and ways to invoke writebacks. There
> > are periodic writebacks, and there are data integrity (WB_SYNC_ALL)
> > writebacks, and other places where a page needs to be cleaned--so, a lot
> > of call sites. Some of which will land on a .writepage() sometimes, even
> > now.
> >
> > For just one example, I see migrate.c's writeout() function directly
> > calling writepage():
> >
> >         rc = mapping->a_ops->writepage(&folio->page, &wbc);
>
> Thanks, John. You are right. I think "deprecated" is inaccurate,
> .writepage is actually no longer used in memory reclaim context, but
> it is still used for other contexts.

Hmm.. it is definitely used currently, but it seems like the plan is
to deprecate ->writepage finally IIUC. See
https://lore.kernel.org/linux-mm/YvQYjpDHH5KckCrw@casper.infradead.org/

>
> Anyway I think what we tried to figure out in the first place is
> whether it is possible that filesystem writeback have dead lock with
> the new batch migration or not. I think the conclusion didn't change.
>
> >
> >
> > thanks,
> >
> > --
> > John Hubbard
> > NVIDIA
> >
diff mbox series

Patch

diff --git a/mm/migrate.c b/mm/migrate.c
index 117134f1c6dc..4a81e0bfdbcd 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -976,13 +976,32 @@  static int move_to_new_folio(struct folio *dst, struct folio *src,
 	return rc;
 }
 
-static int __unmap_and_move(struct page *page, struct page *newpage,
+static void __migrate_page_record(struct page *newpage,
+				  int page_was_mapped,
+				  struct anon_vma *anon_vma)
+{
+	newpage->mapping = (struct address_space *)anon_vma;
+	newpage->private = page_was_mapped;
+}
+
+static void __migrate_page_extract(struct page *newpage,
+				   int *page_was_mappedp,
+				   struct anon_vma **anon_vmap)
+{
+	*anon_vmap = (struct anon_vma *)newpage->mapping;
+	*page_was_mappedp = newpage->private;
+	newpage->mapping = NULL;
+	newpage->private = 0;
+}
+
+#define MIGRATEPAGE_UNMAP		1
+
+static int __migrate_page_unmap(struct page *page, struct page *newpage,
 				int force, enum migrate_mode mode)
 {
 	struct folio *folio = page_folio(page);
-	struct folio *dst = page_folio(newpage);
 	int rc = -EAGAIN;
-	bool page_was_mapped = false;
+	int page_was_mapped = 0;
 	struct anon_vma *anon_vma = NULL;
 	bool is_lru = !__PageMovable(page);
 
@@ -1058,8 +1077,8 @@  static int __unmap_and_move(struct page *page, struct page *newpage,
 		goto out_unlock;
 
 	if (unlikely(!is_lru)) {
-		rc = move_to_new_folio(dst, folio, mode);
-		goto out_unlock_both;
+		__migrate_page_record(newpage, page_was_mapped, anon_vma);
+		return MIGRATEPAGE_UNMAP;
 	}
 
 	/*
@@ -1085,11 +1104,41 @@  static int __unmap_and_move(struct page *page, struct page *newpage,
 		VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
 				page);
 		try_to_migrate(folio, 0);
-		page_was_mapped = true;
+		page_was_mapped = 1;
+	}
+
+	if (!page_mapped(page)) {
+		__migrate_page_record(newpage, page_was_mapped, anon_vma);
+		return MIGRATEPAGE_UNMAP;
 	}
 
-	if (!page_mapped(page))
-		rc = move_to_new_folio(dst, folio, mode);
+	if (page_was_mapped)
+		remove_migration_ptes(folio, folio, false);
+
+out_unlock_both:
+	unlock_page(newpage);
+out_unlock:
+	/* Drop an anon_vma reference if we took one */
+	if (anon_vma)
+		put_anon_vma(anon_vma);
+	unlock_page(page);
+out:
+
+	return rc;
+}
+
+static int __migrate_page_move(struct page *page, struct page *newpage,
+			       enum migrate_mode mode)
+{
+	struct folio *folio = page_folio(page);
+	struct folio *dst = page_folio(newpage);
+	int rc;
+	int page_was_mapped = 0;
+	struct anon_vma *anon_vma = NULL;
+
+	__migrate_page_extract(newpage, &page_was_mapped, &anon_vma);
+
+	rc = move_to_new_folio(dst, folio, mode);
 
 	/*
 	 * When successful, push newpage to LRU immediately: so that if it
@@ -1110,14 +1159,11 @@  static int __unmap_and_move(struct page *page, struct page *newpage,
 		remove_migration_ptes(folio,
 			rc == MIGRATEPAGE_SUCCESS ? dst : folio, false);
 
-out_unlock_both:
 	unlock_page(newpage);
-out_unlock:
 	/* Drop an anon_vma reference if we took one */
 	if (anon_vma)
 		put_anon_vma(anon_vma);
 	unlock_page(page);
-out:
 	/*
 	 * If migration is successful, decrease refcount of the newpage,
 	 * which will not free the page because new page owner increased
@@ -1129,18 +1175,31 @@  static int __unmap_and_move(struct page *page, struct page *newpage,
 	return rc;
 }
 
-/*
- * Obtain the lock on page, remove all ptes and migrate the page
- * to the newly allocated page in newpage.
- */
-static int unmap_and_move(new_page_t get_new_page,
-				   free_page_t put_new_page,
-				   unsigned long private, struct page *page,
-				   int force, enum migrate_mode mode,
-				   enum migrate_reason reason,
-				   struct list_head *ret)
+static void migrate_page_done(struct page *page,
+			      enum migrate_reason reason)
+{
+	/*
+	 * Compaction can migrate also non-LRU pages which are
+	 * not accounted to NR_ISOLATED_*. They can be recognized
+	 * as __PageMovable
+	 */
+	if (likely(!__PageMovable(page)))
+		mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
+				    page_is_file_lru(page), -thp_nr_pages(page));
+
+	if (reason != MR_MEMORY_FAILURE)
+		/* We release the page in page_handle_poison. */
+		put_page(page);
+}
+
+/* Obtain the lock on page, remove all ptes. */
+static int migrate_page_unmap(new_page_t get_new_page, free_page_t put_new_page,
+			      unsigned long private, struct page *page,
+			      struct page **newpagep, int force,
+			      enum migrate_mode mode, enum migrate_reason reason,
+			      struct list_head *ret)
 {
-	int rc = MIGRATEPAGE_SUCCESS;
+	int rc = MIGRATEPAGE_UNMAP;
 	struct page *newpage = NULL;
 
 	if (!thp_migration_supported() && PageTransHuge(page))
@@ -1151,19 +1210,48 @@  static int unmap_and_move(new_page_t get_new_page,
 		ClearPageActive(page);
 		ClearPageUnevictable(page);
 		/* free_pages_prepare() will clear PG_isolated. */
-		goto out;
+		list_del(&page->lru);
+		migrate_page_done(page, reason);
+		return MIGRATEPAGE_SUCCESS;
 	}
 
 	newpage = get_new_page(page, private);
 	if (!newpage)
 		return -ENOMEM;
+	*newpagep = newpage;
 
-	newpage->private = 0;
-	rc = __unmap_and_move(page, newpage, force, mode);
+	rc = __migrate_page_unmap(page, newpage, force, mode);
+	if (rc == MIGRATEPAGE_UNMAP)
+		return rc;
+
+	/*
+	 * A page that has not been migrated will have kept its
+	 * references and be restored.
+	 */
+	/* restore the page to right list. */
+	if (rc != -EAGAIN)
+		list_move_tail(&page->lru, ret);
+
+	if (put_new_page)
+		put_new_page(newpage, private);
+	else
+		put_page(newpage);
+
+	return rc;
+}
+
+/* Migrate the page to the newly allocated page in newpage. */
+static int migrate_page_move(free_page_t put_new_page, unsigned long private,
+			     struct page *page, struct page *newpage,
+			     enum migrate_mode mode, enum migrate_reason reason,
+			     struct list_head *ret)
+{
+	int rc;
+
+	rc = __migrate_page_move(page, newpage, mode);
 	if (rc == MIGRATEPAGE_SUCCESS)
 		set_page_owner_migrate_reason(newpage, reason);
 
-out:
 	if (rc != -EAGAIN) {
 		/*
 		 * A page that has been migrated has all references
@@ -1179,20 +1267,7 @@  static int unmap_and_move(new_page_t get_new_page,
 	 * we want to retry.
 	 */
 	if (rc == MIGRATEPAGE_SUCCESS) {
-		/*
-		 * Compaction can migrate also non-LRU pages which are
-		 * not accounted to NR_ISOLATED_*. They can be recognized
-		 * as __PageMovable
-		 */
-		if (likely(!__PageMovable(page)))
-			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
-					page_is_file_lru(page), -thp_nr_pages(page));
-
-		if (reason != MR_MEMORY_FAILURE)
-			/*
-			 * We release the page in page_handle_poison.
-			 */
-			put_page(page);
+		migrate_page_done(page, reason);
 	} else {
 		if (rc != -EAGAIN)
 			list_add_tail(&page->lru, ret);
@@ -1405,6 +1480,7 @@  int migrate_pages(struct list_head *from, new_page_t get_new_page,
 	int pass = 0;
 	bool is_thp = false;
 	struct page *page;
+	struct page *newpage = NULL;
 	struct page *page2;
 	int rc, nr_subpages;
 	LIST_HEAD(ret_pages);
@@ -1493,9 +1569,13 @@  int migrate_pages(struct list_head *from, new_page_t get_new_page,
 			if (PageHuge(page))
 				continue;
 
-			rc = unmap_and_move(get_new_page, put_new_page,
-						private, page, pass > 2, mode,
+			rc = migrate_page_unmap(get_new_page, put_new_page, private,
+						page, &newpage, pass > 2, mode,
 						reason, &ret_pages);
+			if (rc == MIGRATEPAGE_UNMAP)
+				rc = migrate_page_move(put_new_page, private,
+						       page, newpage, mode,
+						       reason, &ret_pages);
 			/*
 			 * The rules are:
 			 *	Success: page will be freed