
[RFC,3/3] mm/migrate: move common code to numa_migrate_check (was numa_migrate_prep)

Message ID 20240712024455.163543-4-zi.yan@sent.com (mailing list archive)
State New
Series Fix and refactor do_{huge_pmd_}numa_page()

Commit Message

Zi Yan July 12, 2024, 2:44 a.m. UTC
From: Zi Yan <ziy@nvidia.com>

do_numa_page() and do_huge_pmd_numa_page() share a lot of common code. To
reduce redundancy, move common code to numa_migrate_prep() and rename
the function to numa_migrate_check() to reflect its functionality.

There is some code difference between do_numa_page() and
do_huge_pmd_numa_page() before the code move:

1. do_huge_pmd_numa_page() did not check shared folios to set TNF_SHARED.
2. do_huge_pmd_numa_page() did not check and skip zone device folios.

Signed-off-by: Zi Yan <ziy@nvidia.com>
---
 mm/huge_memory.c | 28 ++++++-----------
 mm/internal.h    |  5 +--
 mm/memory.c      | 81 +++++++++++++++++++++++-------------------------
 3 files changed, 52 insertions(+), 62 deletions(-)

Comments

Huang, Ying July 18, 2024, 8:36 a.m. UTC | #1
Zi Yan <zi.yan@sent.com> writes:

> From: Zi Yan <ziy@nvidia.com>
>
> do_numa_page() and do_huge_pmd_numa_page() share a lot of common code. To
> reduce redundancy, move common code to numa_migrate_prep() and rename
> the function to numa_migrate_check() to reflect its functionality.
>
> There is some code difference between do_numa_page() and
> do_huge_pmd_numa_page() before the code move:
>
> 1. do_huge_pmd_numa_page() did not check shared folios to set TNF_SHARED.
> 2. do_huge_pmd_numa_page() did not check and skip zone device folios.
>
> Signed-off-by: Zi Yan <ziy@nvidia.com>
> ---
>  mm/huge_memory.c | 28 ++++++-----------
>  mm/internal.h    |  5 +--
>  mm/memory.c      | 81 +++++++++++++++++++++++-------------------------
>  3 files changed, 52 insertions(+), 62 deletions(-)
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 8c11d6da4b36..66d67d13e0dc 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1670,10 +1670,10 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>  	pmd_t pmd;
>  	struct folio *folio;
>  	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> -	int nid = NUMA_NO_NODE;
> -	int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
> +	int target_nid = NUMA_NO_NODE;
> +	int last_cpupid = (-1 & LAST_CPUPID_MASK);
>  	bool writable = false;
> -	int flags = 0;
> +	int flags = 0, nr_pages;
>  
>  	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>  	if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
> @@ -1693,21 +1693,13 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>  		writable = true;
>  
>  	folio = vm_normal_folio_pmd(vma, haddr, pmd);
> -	if (!folio)
> +	if (!folio || folio_is_zone_device(folio))

This change appears unrelated.  Can we put it in a separate patch?

IIUC, this isn't necessary even in do_numa_page()?  Because in
change_pte_range(), folio_is_zone_device() has been checked already.
But it doesn't hurt either.

>  		goto out_map;
>  
> -	/* See similar comment in do_numa_page for explanation */
> -	if (!writable)
> -		flags |= TNF_NO_GROUP;
> +	nr_pages = folio_nr_pages(folio);
>  
> -	nid = folio_nid(folio);
> -	/*
> -	 * For memory tiering mode, cpupid of slow memory page is used
> -	 * to record page access time.  So use default value.
> -	 */
> -	if (folio_has_cpupid(folio))
> -		last_cpupid = folio_last_cpupid(folio);
> -	target_nid = numa_migrate_prep(folio, vmf, haddr, nid, &flags);
> +	target_nid = numa_migrate_check(folio, vmf, haddr, writable,
> +			&flags, &last_cpupid);
>  	if (target_nid == NUMA_NO_NODE)
>  		goto out_map;
>  	if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
> @@ -1720,8 +1712,8 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>  
>  	if (!migrate_misplaced_folio(folio, vma, target_nid)) {
>  		flags |= TNF_MIGRATED;
> -		nid = target_nid;
>  	} else {
> +		target_nid = NUMA_NO_NODE;
>  		flags |= TNF_MIGRATE_FAIL;
>  		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>  		if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
> @@ -1732,8 +1724,8 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>  	}
>  
>  out:
> -	if (nid != NUMA_NO_NODE)
> -		task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
> +	if (target_nid != NUMA_NO_NODE)
> +		task_numa_fault(last_cpupid, target_nid, nr_pages, flags);

This appears to be a behavior change.  IIUC, there are 2 possible issues.

1) if migrate_misplaced_folio() fails, folio_nid() should be used as
nid.  "target_nid" as variable name here is confusing, because
folio_nid() is needed in fact.

2) if !pmd_same(), task_numa_fault() should be skipped.  The original
code is buggy.

Similar issues for do_numa_page().

If my understanding is correct, we should implement a separate patch
to fix 2) above.  And that may need to be backported.
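
If I read the two points above correctly, the fixed tail of
do_huge_pmd_numa_page() could look roughly like the sketch below. This
is only an illustration of the idea, not a tested patch; "nid" is
brought back to carry folio_nid() for accounting:

	nid = folio_nid(folio);
	if (!migrate_misplaced_folio(folio, vma, target_nid)) {
		nid = target_nid;	/* folio now sits on the target node */
		flags |= TNF_MIGRATED;
	} else {
		flags |= TNF_MIGRATE_FAIL;
		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
		if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
			spin_unlock(vmf->ptl);
			/* fault handled by someone else: skip accounting */
			return 0;
		}
		goto out_map;
	}

out:
	if (nid != NUMA_NO_NODE)
		task_numa_fault(last_cpupid, nid, nr_pages, flags);
	return 0;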

>  
>  	return 0;
>  
> diff --git a/mm/internal.h b/mm/internal.h
> index b4d86436565b..7782b7bb3383 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -1217,8 +1217,9 @@ void vunmap_range_noflush(unsigned long start, unsigned long end);
>  
>  void __vunmap_range_noflush(unsigned long start, unsigned long end);
>  
> -int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf,
> -		      unsigned long addr, int page_nid, int *flags);
> +int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
> +		      unsigned long addr, bool writable,
> +		      int *flags, int *last_cpupid);
>  
>  void free_zone_device_folio(struct folio *folio);
>  int migrate_device_coherent_page(struct page *page);
> diff --git a/mm/memory.c b/mm/memory.c
> index 96c2f5b3d796..a252c0f13755 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -5209,16 +5209,42 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
>  	return ret;
>  }
>  
> -int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf,
> -		      unsigned long addr, int page_nid, int *flags)
> +int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
> +		      unsigned long addr, bool writable,
> +		      int *flags, int *last_cpupid)
>  {
>  	struct vm_area_struct *vma = vmf->vma;
>  
> +	/*
> +	 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
> +	 * much anyway since they can be in shared cache state. This misses
> +	 * the case where a mapping is writable but the process never writes
> +	 * to it but pte_write gets cleared during protection updates and
> +	 * pte_dirty has unpredictable behaviour between PTE scan updates,
> +	 * background writeback, dirty balancing and application behaviour.
> +	 */
> +	if (!writable)
> +		*flags |= TNF_NO_GROUP;
> +
> +	/*
> +	 * Flag if the folio is shared between multiple address spaces. This
> +	 * is later used when determining whether to group tasks together
> +	 */
> +	if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
> +		*flags |= TNF_SHARED;
> +
> +	/*
> +	 * For memory tiering mode, cpupid of slow memory page is used
> +	 * to record page access time.
> +	 */
> +	if (folio_has_cpupid(folio))
> +		*last_cpupid = folio_last_cpupid(folio);
> +
>  	/* Record the current PID acceesing VMA */
>  	vma_set_access_pid_bit(vma);
>  
>  	count_vm_numa_event(NUMA_HINT_FAULTS);
> -	if (page_nid == numa_node_id()) {
> +	if (folio_nid(folio) == numa_node_id()) {
>  		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
>  		*flags |= TNF_FAULT_LOCAL;
>  	}
> @@ -5284,12 +5310,11 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>  {
>  	struct vm_area_struct *vma = vmf->vma;
>  	struct folio *folio = NULL;
> -	int nid = NUMA_NO_NODE;
> +	int target_nid = NUMA_NO_NODE;
>  	bool writable = false, ignore_writable = false;
>  	bool pte_write_upgrade = vma_wants_manual_pte_write_upgrade(vma);
> -	int last_cpupid;
> -	int target_nid;
> -	pte_t pte, old_pte;
> +	int last_cpupid = (-1 & LAST_CPUPID_MASK);
> +	pte_t pte, old_pte = vmf->orig_pte;
>  	int flags = 0, nr_pages;
>  
>  	/*
> @@ -5297,10 +5322,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>  	 * table lock, that its contents have not changed during fault handling.
>  	 */
>  	spin_lock(vmf->ptl);
> -	/* Read the live PTE from the page tables: */
> -	old_pte = ptep_get(vmf->pte);
> -
> -	if (unlikely(!pte_same(old_pte, vmf->orig_pte))) {
> +	if (unlikely(!pte_same(old_pte, *vmf->pte))) {
>  		pte_unmap_unlock(vmf->pte, vmf->ptl);
>  		goto out;
>  	}
> @@ -5320,35 +5342,10 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>  	if (!folio || folio_is_zone_device(folio))
>  		goto out_map;
>  
> -	/*
> -	 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
> -	 * much anyway since they can be in shared cache state. This misses
> -	 * the case where a mapping is writable but the process never writes
> -	 * to it but pte_write gets cleared during protection updates and
> -	 * pte_dirty has unpredictable behaviour between PTE scan updates,
> -	 * background writeback, dirty balancing and application behaviour.
> -	 */
> -	if (!writable)
> -		flags |= TNF_NO_GROUP;
> -
> -	/*
> -	 * Flag if the folio is shared between multiple address spaces. This
> -	 * is later used when determining whether to group tasks together
> -	 */
> -	if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
> -		flags |= TNF_SHARED;
> -
> -	nid = folio_nid(folio);
>  	nr_pages = folio_nr_pages(folio);
> -	/*
> -	 * For memory tiering mode, cpupid of slow memory page is used
> -	 * to record page access time.  So use default value.
> -	 */
> -	if (!folio_has_cpupid(folio))
> -		last_cpupid = (-1 & LAST_CPUPID_MASK);
> -	else
> -		last_cpupid = folio_last_cpupid(folio);
> -	target_nid = numa_migrate_prep(folio, vmf, vmf->address, nid, &flags);
> +
> +	target_nid = numa_migrate_check(folio, vmf, vmf->address, writable,
> +			&flags, &last_cpupid);
>  	if (target_nid == NUMA_NO_NODE)
>  		goto out_map;
>  	if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
> @@ -5362,9 +5359,9 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>  
>  	/* Migrate to the requested node */
>  	if (!migrate_misplaced_folio(folio, vma, target_nid)) {
> -		nid = target_nid;
>  		flags |= TNF_MIGRATED;
>  	} else {
> +		target_nid = NUMA_NO_NODE;
>  		flags |= TNF_MIGRATE_FAIL;
>  		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
>  					       vmf->address, &vmf->ptl);
> @@ -5378,8 +5375,8 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>  	}
>  
>  out:
> -	if (nid != NUMA_NO_NODE)
> -		task_numa_fault(last_cpupid, nid, nr_pages, flags);
> +	if (target_nid != NUMA_NO_NODE)
> +		task_numa_fault(last_cpupid, target_nid, nr_pages, flags);
>  	return 0;
>  out_map:
>  	/*

--
Best Regards,
Huang, Ying
Zi Yan July 18, 2024, 2:40 p.m. UTC | #2
On 18 Jul 2024, at 4:36, Huang, Ying wrote:

> Zi Yan <zi.yan@sent.com> writes:
>
>> From: Zi Yan <ziy@nvidia.com>
>>
>> do_numa_page() and do_huge_pmd_numa_page() share a lot of common code. To
>> reduce redundancy, move common code to numa_migrate_prep() and rename
>> the function to numa_migrate_check() to reflect its functionality.
>>
>> There is some code difference between do_numa_page() and
>> do_huge_pmd_numa_page() before the code move:
>>
>> 1. do_huge_pmd_numa_page() did not check shared folios to set TNF_SHARED.
>> 2. do_huge_pmd_numa_page() did not check and skip zone device folios.
>>
>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>> ---
>>  mm/huge_memory.c | 28 ++++++-----------
>>  mm/internal.h    |  5 +--
>>  mm/memory.c      | 81 +++++++++++++++++++++++-------------------------
>>  3 files changed, 52 insertions(+), 62 deletions(-)
>>
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 8c11d6da4b36..66d67d13e0dc 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -1670,10 +1670,10 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>>  	pmd_t pmd;
>>  	struct folio *folio;
>>  	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>> -	int nid = NUMA_NO_NODE;
>> -	int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
>> +	int target_nid = NUMA_NO_NODE;
>> +	int last_cpupid = (-1 & LAST_CPUPID_MASK);
>>  	bool writable = false;
>> -	int flags = 0;
>> +	int flags = 0, nr_pages;
>>
>>  	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>>  	if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
>> @@ -1693,21 +1693,13 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>>  		writable = true;
>>
>>  	folio = vm_normal_folio_pmd(vma, haddr, pmd);
>> -	if (!folio)
>> +	if (!folio || folio_is_zone_device(folio))
>
> This change appears unrelated.  Can we put it in a separate patch?
>
> IIUC, this isn't necessary even in do_numa_page()?  Because in
> change_pte_range(), folio_is_zone_device() has been checked already.
> But it doesn't hurt either.

OK, will make this change in a separate patch.

>
>>  		goto out_map;
>>
>> -	/* See similar comment in do_numa_page for explanation */
>> -	if (!writable)
>> -		flags |= TNF_NO_GROUP;
>> +	nr_pages = folio_nr_pages(folio);
>>
>> -	nid = folio_nid(folio);
>> -	/*
>> -	 * For memory tiering mode, cpupid of slow memory page is used
>> -	 * to record page access time.  So use default value.
>> -	 */
>> -	if (folio_has_cpupid(folio))
>> -		last_cpupid = folio_last_cpupid(folio);
>> -	target_nid = numa_migrate_prep(folio, vmf, haddr, nid, &flags);
>> +	target_nid = numa_migrate_check(folio, vmf, haddr, writable,
>> +			&flags, &last_cpupid);
>>  	if (target_nid == NUMA_NO_NODE)
>>  		goto out_map;
>>  	if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
>> @@ -1720,8 +1712,8 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>>
>>  	if (!migrate_misplaced_folio(folio, vma, target_nid)) {
>>  		flags |= TNF_MIGRATED;
>> -		nid = target_nid;
>>  	} else {
>> +		target_nid = NUMA_NO_NODE;
>>  		flags |= TNF_MIGRATE_FAIL;
>>  		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>>  		if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
>> @@ -1732,8 +1724,8 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>>  	}
>>
>>  out:
>> -	if (nid != NUMA_NO_NODE)
>> -		task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
>> +	if (target_nid != NUMA_NO_NODE)
>> +		task_numa_fault(last_cpupid, target_nid, nr_pages, flags);
>
> This appears to be a behavior change.  IIUC, there are 2 possible issues.
>
> 1) if migrate_misplaced_folio() fails, folio_nid() should be used as
> nid.  "target_nid" as variable name here is confusing, because
> folio_nid() is needed in fact.

Right. Will fix it in my code.

>
> 2) if !pmd_same(), task_numa_fault() should be skipped.  The original
> code is buggy.
>
> Similar issues for do_numa_page().
>
> If my understanding is correct, we should implement a separate patch
> to fix 2) above.  And that may need to be backported.

Got it. Here is my plan:

1. Send the first two patches in this series separately, since they
are separate fixes. They can be picked up right now.
2. Send a separate patch to fix 2) above with cc: stable.
3. Clean up this patch and send it separately.

Thank you for the review. :)

>
>>
>>  	return 0;
>>
>> diff --git a/mm/internal.h b/mm/internal.h
>> index b4d86436565b..7782b7bb3383 100644
>> --- a/mm/internal.h
>> +++ b/mm/internal.h
>> @@ -1217,8 +1217,9 @@ void vunmap_range_noflush(unsigned long start, unsigned long end);
>>
>>  void __vunmap_range_noflush(unsigned long start, unsigned long end);
>>
>> -int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf,
>> -		      unsigned long addr, int page_nid, int *flags);
>> +int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
>> +		      unsigned long addr, bool writable,
>> +		      int *flags, int *last_cpupid);
>>
>>  void free_zone_device_folio(struct folio *folio);
>>  int migrate_device_coherent_page(struct page *page);
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 96c2f5b3d796..a252c0f13755 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -5209,16 +5209,42 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
>>  	return ret;
>>  }
>>
>> -int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf,
>> -		      unsigned long addr, int page_nid, int *flags)
>> +int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
>> +		      unsigned long addr, bool writable,
>> +		      int *flags, int *last_cpupid)
>>  {
>>  	struct vm_area_struct *vma = vmf->vma;
>>
>> +	/*
>> +	 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
>> +	 * much anyway since they can be in shared cache state. This misses
>> +	 * the case where a mapping is writable but the process never writes
>> +	 * to it but pte_write gets cleared during protection updates and
>> +	 * pte_dirty has unpredictable behaviour between PTE scan updates,
>> +	 * background writeback, dirty balancing and application behaviour.
>> +	 */
>> +	if (!writable)
>> +		*flags |= TNF_NO_GROUP;
>> +
>> +	/*
>> +	 * Flag if the folio is shared between multiple address spaces. This
>> +	 * is later used when determining whether to group tasks together
>> +	 */
>> +	if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
>> +		*flags |= TNF_SHARED;
>> +
>> +	/*
>> +	 * For memory tiering mode, cpupid of slow memory page is used
>> +	 * to record page access time.
>> +	 */
>> +	if (folio_has_cpupid(folio))
>> +		*last_cpupid = folio_last_cpupid(folio);
>> +
>>  	/* Record the current PID acceesing VMA */
>>  	vma_set_access_pid_bit(vma);
>>
>>  	count_vm_numa_event(NUMA_HINT_FAULTS);
>> -	if (page_nid == numa_node_id()) {
>> +	if (folio_nid(folio) == numa_node_id()) {
>>  		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
>>  		*flags |= TNF_FAULT_LOCAL;
>>  	}
>> @@ -5284,12 +5310,11 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>>  {
>>  	struct vm_area_struct *vma = vmf->vma;
>>  	struct folio *folio = NULL;
>> -	int nid = NUMA_NO_NODE;
>> +	int target_nid = NUMA_NO_NODE;
>>  	bool writable = false, ignore_writable = false;
>>  	bool pte_write_upgrade = vma_wants_manual_pte_write_upgrade(vma);
>> -	int last_cpupid;
>> -	int target_nid;
>> -	pte_t pte, old_pte;
>> +	int last_cpupid = (-1 & LAST_CPUPID_MASK);
>> +	pte_t pte, old_pte = vmf->orig_pte;
>>  	int flags = 0, nr_pages;
>>
>>  	/*
>> @@ -5297,10 +5322,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>>  	 * table lock, that its contents have not changed during fault handling.
>>  	 */
>>  	spin_lock(vmf->ptl);
>> -	/* Read the live PTE from the page tables: */
>> -	old_pte = ptep_get(vmf->pte);
>> -
>> -	if (unlikely(!pte_same(old_pte, vmf->orig_pte))) {
>> +	if (unlikely(!pte_same(old_pte, *vmf->pte))) {
>>  		pte_unmap_unlock(vmf->pte, vmf->ptl);
>>  		goto out;
>>  	}
>> @@ -5320,35 +5342,10 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>>  	if (!folio || folio_is_zone_device(folio))
>>  		goto out_map;
>>
>> -	/*
>> -	 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
>> -	 * much anyway since they can be in shared cache state. This misses
>> -	 * the case where a mapping is writable but the process never writes
>> -	 * to it but pte_write gets cleared during protection updates and
>> -	 * pte_dirty has unpredictable behaviour between PTE scan updates,
>> -	 * background writeback, dirty balancing and application behaviour.
>> -	 */
>> -	if (!writable)
>> -		flags |= TNF_NO_GROUP;
>> -
>> -	/*
>> -	 * Flag if the folio is shared between multiple address spaces. This
>> -	 * is later used when determining whether to group tasks together
>> -	 */
>> -	if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
>> -		flags |= TNF_SHARED;
>> -
>> -	nid = folio_nid(folio);
>>  	nr_pages = folio_nr_pages(folio);
>> -	/*
>> -	 * For memory tiering mode, cpupid of slow memory page is used
>> -	 * to record page access time.  So use default value.
>> -	 */
>> -	if (!folio_has_cpupid(folio))
>> -		last_cpupid = (-1 & LAST_CPUPID_MASK);
>> -	else
>> -		last_cpupid = folio_last_cpupid(folio);
>> -	target_nid = numa_migrate_prep(folio, vmf, vmf->address, nid, &flags);
>> +
>> +	target_nid = numa_migrate_check(folio, vmf, vmf->address, writable,
>> +			&flags, &last_cpupid);
>>  	if (target_nid == NUMA_NO_NODE)
>>  		goto out_map;
>>  	if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
>> @@ -5362,9 +5359,9 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>>
>>  	/* Migrate to the requested node */
>>  	if (!migrate_misplaced_folio(folio, vma, target_nid)) {
>> -		nid = target_nid;
>>  		flags |= TNF_MIGRATED;
>>  	} else {
>> +		target_nid = NUMA_NO_NODE;
>>  		flags |= TNF_MIGRATE_FAIL;
>>  		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
>>  					       vmf->address, &vmf->ptl);
>> @@ -5378,8 +5375,8 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>>  	}
>>
>>  out:
>> -	if (nid != NUMA_NO_NODE)
>> -		task_numa_fault(last_cpupid, nid, nr_pages, flags);
>> +	if (target_nid != NUMA_NO_NODE)
>> +		task_numa_fault(last_cpupid, target_nid, nr_pages, flags);
>>  	return 0;
>>  out_map:
>>  	/*
>
> --
> Best Regards,
> Huang, Ying


Best Regards,
Yan, Zi
Zi Yan July 19, 2024, 8:19 p.m. UTC | #3
On 18 Jul 2024, at 4:36, Huang, Ying wrote:

> Zi Yan <zi.yan@sent.com> writes:
>
>> From: Zi Yan <ziy@nvidia.com>
>>
>> do_numa_page() and do_huge_pmd_numa_page() share a lot of common code. To
>> reduce redundancy, move common code to numa_migrate_prep() and rename
>> the function to numa_migrate_check() to reflect its functionality.
>>
>> There is some code difference between do_numa_page() and
>> do_huge_pmd_numa_page() before the code move:
>>
>> 1. do_huge_pmd_numa_page() did not check shared folios to set TNF_SHARED.
>> 2. do_huge_pmd_numa_page() did not check and skip zone device folios.
>>
>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>> ---
>>  mm/huge_memory.c | 28 ++++++-----------
>>  mm/internal.h    |  5 +--
>>  mm/memory.c      | 81 +++++++++++++++++++++++-------------------------
>>  3 files changed, 52 insertions(+), 62 deletions(-)
>>
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 8c11d6da4b36..66d67d13e0dc 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -1670,10 +1670,10 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>>  	pmd_t pmd;
>>  	struct folio *folio;
>>  	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>> -	int nid = NUMA_NO_NODE;
>> -	int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
>> +	int target_nid = NUMA_NO_NODE;
>> +	int last_cpupid = (-1 & LAST_CPUPID_MASK);
>>  	bool writable = false;
>> -	int flags = 0;
>> +	int flags = 0, nr_pages;
>>
>>  	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>>  	if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
>> @@ -1693,21 +1693,13 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>>  		writable = true;
>>
>>  	folio = vm_normal_folio_pmd(vma, haddr, pmd);
>> -	if (!folio)
>> +	if (!folio || folio_is_zone_device(folio))
>
> This change appears unrelated.  Can we put it in a separate patch?
>
> IIUC, this isn't necessary even in do_numa_page()?  Because in
> change_pte_range(), folio_is_zone_device() has been checked already.
> But it doesn't hurt either.
>
>>  		goto out_map;
>>
>> -	/* See similar comment in do_numa_page for explanation */
>> -	if (!writable)
>> -		flags |= TNF_NO_GROUP;
>> +	nr_pages = folio_nr_pages(folio);
>>
>> -	nid = folio_nid(folio);
>> -	/*
>> -	 * For memory tiering mode, cpupid of slow memory page is used
>> -	 * to record page access time.  So use default value.
>> -	 */
>> -	if (folio_has_cpupid(folio))
>> -		last_cpupid = folio_last_cpupid(folio);
>> -	target_nid = numa_migrate_prep(folio, vmf, haddr, nid, &flags);
>> +	target_nid = numa_migrate_check(folio, vmf, haddr, writable,
>> +			&flags, &last_cpupid);
>>  	if (target_nid == NUMA_NO_NODE)
>>  		goto out_map;
>>  	if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
>> @@ -1720,8 +1712,8 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>>
>>  	if (!migrate_misplaced_folio(folio, vma, target_nid)) {
>>  		flags |= TNF_MIGRATED;
>> -		nid = target_nid;
>>  	} else {
>> +		target_nid = NUMA_NO_NODE;
>>  		flags |= TNF_MIGRATE_FAIL;
>>  		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>>  		if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
>> @@ -1732,8 +1724,8 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>>  	}
>>
>>  out:
>> -	if (nid != NUMA_NO_NODE)
>> -		task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
>> +	if (target_nid != NUMA_NO_NODE)
>> +		task_numa_fault(last_cpupid, target_nid, nr_pages, flags);
>
> This appears to be a behavior change.  IIUC, there are 2 possible issues.
>
> 1) if migrate_misplaced_folio() fails, folio_nid() should be used as
> nid.  "target_nid" as variable name here is confusing, because
> folio_nid() is needed in fact.
>
> 2) if !pmd_same(), task_numa_fault() should be skipped.  The original
> code is buggy.
>
> Similar issues for do_numa_page().
>
> If my understanding is correct, we should implement a separate patch
> to fix 2) above.  And that may need to be backported.

Hmm, the original code seems OK after I checked the implementation.
There are two possible !pte_same()/!pmd_same() locations:
1) at the beginning of do_numa_page() and do_huge_pmd_numa_page(), where the faulted
PTE/PMD changed before the folio could be checked: task_numa_fault() should not be
called.

2) when migrate_misplaced_folio() failed and the PTE/PMD changed, but the folio
has been determined and checked. task_numa_fault() should be called even if
!pte_same()/!pmd_same().

Let me know if I get this wrong. Thanks.


Best Regards,
Yan, Zi
Huang, Ying July 22, 2024, 1:47 a.m. UTC | #4
Zi Yan <ziy@nvidia.com> writes:

> On 18 Jul 2024, at 4:36, Huang, Ying wrote:
>
>> Zi Yan <zi.yan@sent.com> writes:
>>
>>> From: Zi Yan <ziy@nvidia.com>
>>>
>>> do_numa_page() and do_huge_pmd_numa_page() share a lot of common code. To
>>> reduce redundancy, move common code to numa_migrate_prep() and rename
>>> the function to numa_migrate_check() to reflect its functionality.
>>>
>>> There is some code difference between do_numa_page() and
>>> do_huge_pmd_numa_page() before the code move:
>>>
>>> 1. do_huge_pmd_numa_page() did not check shared folios to set TNF_SHARED.
>>> 2. do_huge_pmd_numa_page() did not check and skip zone device folios.
>>>
>>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>>> ---
>>>  mm/huge_memory.c | 28 ++++++-----------
>>>  mm/internal.h    |  5 +--
>>>  mm/memory.c      | 81 +++++++++++++++++++++++-------------------------
>>>  3 files changed, 52 insertions(+), 62 deletions(-)
>>>
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index 8c11d6da4b36..66d67d13e0dc 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -1670,10 +1670,10 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>>>  	pmd_t pmd;
>>>  	struct folio *folio;
>>>  	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>>> -	int nid = NUMA_NO_NODE;
>>> -	int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
>>> +	int target_nid = NUMA_NO_NODE;
>>> +	int last_cpupid = (-1 & LAST_CPUPID_MASK);
>>>  	bool writable = false;
>>> -	int flags = 0;
>>> +	int flags = 0, nr_pages;
>>>
>>>  	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>>>  	if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
>>> @@ -1693,21 +1693,13 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>>>  		writable = true;
>>>
>>>  	folio = vm_normal_folio_pmd(vma, haddr, pmd);
>>> -	if (!folio)
>>> +	if (!folio || folio_is_zone_device(folio))
>>
>> This change appears unrelated.  Can we put it in a separate patch?
>>
>> IIUC, this isn't necessary even in do_numa_page()?  Because in
>> change_pte_range(), folio_is_zone_device() has been checked already.
>> But it doesn't hurt either.
>>
>>>  		goto out_map;
>>>
>>> -	/* See similar comment in do_numa_page for explanation */
>>> -	if (!writable)
>>> -		flags |= TNF_NO_GROUP;
>>> +	nr_pages = folio_nr_pages(folio);
>>>
>>> -	nid = folio_nid(folio);
>>> -	/*
>>> -	 * For memory tiering mode, cpupid of slow memory page is used
>>> -	 * to record page access time.  So use default value.
>>> -	 */
>>> -	if (folio_has_cpupid(folio))
>>> -		last_cpupid = folio_last_cpupid(folio);
>>> -	target_nid = numa_migrate_prep(folio, vmf, haddr, nid, &flags);
>>> +	target_nid = numa_migrate_check(folio, vmf, haddr, writable,
>>> +			&flags, &last_cpupid);
>>>  	if (target_nid == NUMA_NO_NODE)
>>>  		goto out_map;
>>>  	if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
>>> @@ -1720,8 +1712,8 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>>>
>>>  	if (!migrate_misplaced_folio(folio, vma, target_nid)) {
>>>  		flags |= TNF_MIGRATED;
>>> -		nid = target_nid;
>>>  	} else {
>>> +		target_nid = NUMA_NO_NODE;
>>>  		flags |= TNF_MIGRATE_FAIL;
>>>  		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>>>  		if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
>>> @@ -1732,8 +1724,8 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>>>  	}
>>>
>>>  out:
>>> -	if (nid != NUMA_NO_NODE)
>>> -		task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
>>> +	if (target_nid != NUMA_NO_NODE)
>>> +		task_numa_fault(last_cpupid, target_nid, nr_pages, flags);
>>
>> This appears to be a behavior change.  IIUC, there are 2 possible issues.
>>
>> 1) if migrate_misplaced_folio() fails, folio_nid() should be used as
>> nid.  "target_nid" as variable name here is confusing, because
>> folio_nid() is needed in fact.
>>
>> 2) if !pmd_same(), task_numa_fault() should be skipped.  The original
>> code is buggy.
>>
>> Similar issues for do_numa_page().
>>
>> If my understanding is correct, we should implement a separate patch
>> to fix 2) above.  And that may need to be backported.
>
> Hmm, the original code seems OK after I checked the implementation.
> There are two possible !pte_same()/!pmd_same() locations:
> 1) at the beginning of do_numa_page() and do_huge_pmd_numa_page(), where the faulted
> PTE/PMD changed before the folio could be checked: task_numa_fault() should not be
> called.

Yes.

> 2) when migrate_misplaced_folio() failed and the PTE/PMD changed, but the folio
> has been determined and checked. task_numa_fault() should be called even if
> !pte_same()/!pmd_same().

IIUC, if !pte_same()/!pmd_same(), the fault has been processed on
another CPU.  For example, do_numa_page()/do_huge_pmd_numa_page() has
been called on another CPU and task_numa_fault() has been called for the
PTE/PMD already.
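
For example, one hypothetical interleaving (illustration only, the exact
race may differ):

	CPU 0                                  CPU 1
	-----                                  -----
	do_numa_page()
	  pte_same() check passes; PTL dropped
	  migrate_misplaced_folio() fails      do_numa_page() on the same PTE:
	                                         restores the PTE and calls
	                                         task_numa_fault()
	  retakes PTL, sees !pte_same();
	  calling task_numa_fault() here would
	  account the same fault twice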

> Let me know if I get this wrong. Thanks.
>

--
Best Regards,
Huang, Ying
Zi Yan July 22, 2024, 2:01 p.m. UTC | #5
On 21 Jul 2024, at 21:47, Huang, Ying wrote:

> Zi Yan <ziy@nvidia.com> writes:
>
>> On 18 Jul 2024, at 4:36, Huang, Ying wrote:
>>
>>> Zi Yan <zi.yan@sent.com> writes:
>>>
>>>> From: Zi Yan <ziy@nvidia.com>
>>>>
>>>> do_numa_page() and do_huge_pmd_numa_page() share a lot of common code. To
>>>> reduce redundancy, move common code to numa_migrate_prep() and rename
>>>> the function to numa_migrate_check() to reflect its functionality.
>>>>
>>>> There is some code difference between do_numa_page() and
>>>> do_huge_pmd_numa_page() before the code move:
>>>>
>>>> 1. do_huge_pmd_numa_page() did not check shared folios to set TNF_SHARED.
>>>> 2. do_huge_pmd_numa_page() did not check and skip zone device folios.
>>>>
>>>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>>>> ---
>>>>  mm/huge_memory.c | 28 ++++++-----------
>>>>  mm/internal.h    |  5 +--
>>>>  mm/memory.c      | 81 +++++++++++++++++++++++-------------------------
>>>>  3 files changed, 52 insertions(+), 62 deletions(-)
>>>>
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index 8c11d6da4b36..66d67d13e0dc 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -1670,10 +1670,10 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>>>>  	pmd_t pmd;
>>>>  	struct folio *folio;
>>>>  	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>>>> -	int nid = NUMA_NO_NODE;
>>>> -	int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
>>>> +	int target_nid = NUMA_NO_NODE;
>>>> +	int last_cpupid = (-1 & LAST_CPUPID_MASK);
>>>>  	bool writable = false;
>>>> -	int flags = 0;
>>>> +	int flags = 0, nr_pages;
>>>>
>>>>  	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>>>>  	if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
>>>> @@ -1693,21 +1693,13 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>>>>  		writable = true;
>>>>
>>>>  	folio = vm_normal_folio_pmd(vma, haddr, pmd);
>>>> -	if (!folio)
>>>> +	if (!folio || folio_is_zone_device(folio))
>>>
>>> This change appears unrelated.  Can we put it in a separate patch?
>>>
>>> IIUC, this isn't necessary even in do_numa_page()?  Because in
>>> change_pte_range(), folio_is_zone_device() has been checked already.
>>> But it doesn't hurt either.
>>>
>>>>  		goto out_map;
>>>>
>>>> -	/* See similar comment in do_numa_page for explanation */
>>>> -	if (!writable)
>>>> -		flags |= TNF_NO_GROUP;
>>>> +	nr_pages = folio_nr_pages(folio);
>>>>
>>>> -	nid = folio_nid(folio);
>>>> -	/*
>>>> -	 * For memory tiering mode, cpupid of slow memory page is used
>>>> -	 * to record page access time.  So use default value.
>>>> -	 */
>>>> -	if (folio_has_cpupid(folio))
>>>> -		last_cpupid = folio_last_cpupid(folio);
>>>> -	target_nid = numa_migrate_prep(folio, vmf, haddr, nid, &flags);
>>>> +	target_nid = numa_migrate_check(folio, vmf, haddr, writable,
>>>> +			&flags, &last_cpupid);
>>>>  	if (target_nid == NUMA_NO_NODE)
>>>>  		goto out_map;
>>>>  	if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
>>>> @@ -1720,8 +1712,8 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>>>>
>>>>  	if (!migrate_misplaced_folio(folio, vma, target_nid)) {
>>>>  		flags |= TNF_MIGRATED;
>>>> -		nid = target_nid;
>>>>  	} else {
>>>> +		target_nid = NUMA_NO_NODE;
>>>>  		flags |= TNF_MIGRATE_FAIL;
>>>>  		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>>>>  		if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
>>>> @@ -1732,8 +1724,8 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>>>>  	}
>>>>
>>>>  out:
>>>> -	if (nid != NUMA_NO_NODE)
>>>> -		task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
>>>> +	if (target_nid != NUMA_NO_NODE)
>>>> +		task_numa_fault(last_cpupid, target_nid, nr_pages, flags);
>>>
>>> This appears to be a behavior change.  IIUC, there are 2 possible issues.
>>>
>>> 1) if migrate_misplaced_folio() fails, folio_nid() should be used as
>>> nid.  "target_nid" as variable name here is confusing, because
>>> folio_nid() is needed in fact.
>>>
>>> 2) if !pmd_same(), task_numa_fault() should be skipped.  The original
>>> code is buggy.
>>>
>>> Similar issues for do_numa_page().
>>>
>>> If my understanding is correct, we should implement a separate patch
>>> to fix 2) above.  And that may need to be backported.
>>
>> Hmm, the original code seems OK after I checked the implementation.
>> There are two possible !pte_same()/!pmd_same() locations:
>> 1) at the beginning of do_numa_page() and do_huge_pmd_numa_page(), where the faulted
>> PTE/PMD changed before the folio could be checked: task_numa_fault() should not be
>> called.
>
> Yes.
>
>> 2) when migrate_misplaced_folio() failed and the PTE/PMD changed, but the folio
>> has been determined and checked. task_numa_fault() should be called even if
>> !pte_same()/!pmd_same().
>
> IIUC, if !pte_same()/!pmd_same(), the fault has been processed on
> another CPU.  For example, do_numa_page()/do_huge_pmd_numa_page() has
> been called on another CPU and task_numa_fault() has been called for the
> PTE/PMD already.

Hmm, this behavior at least dates back to 2015 at
commit 074c238177a7 ("mm: numa: slow PTE scan rate if migration failures occur").
So cc Mel.

The code is https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/memory.c?id=074c238177a75f5e79af3b2cb6a84e54823ef950#n3102. I have not checked older
commits.

I wonder how far we should trace back.


Best Regards,
Yan, Zi
Zi Yan July 22, 2024, 3:21 p.m. UTC | #6
On 22 Jul 2024, at 10:01, Zi Yan wrote:

> On 21 Jul 2024, at 21:47, Huang, Ying wrote:
>
>> Zi Yan <ziy@nvidia.com> writes:
>>
>>> On 18 Jul 2024, at 4:36, Huang, Ying wrote:
>>>
>>>> Zi Yan <zi.yan@sent.com> writes:
>>>>
>>>>> From: Zi Yan <ziy@nvidia.com>
>>>>>
>>>>> do_numa_page() and do_huge_pmd_numa_page() share a lot of common code. To
>>>>> reduce redundancy, move common code to numa_migrate_prep() and rename
>>>>> the function to numa_migrate_check() to reflect its functionality.
>>>>>
>>>>> There is some code difference between do_numa_page() and
>>>>> do_huge_pmd_numa_page() before the code move:
>>>>>
>>>>> 1. do_huge_pmd_numa_page() did not check shared folios to set TNF_SHARED.
>>>>> 2. do_huge_pmd_numa_page() did not check and skip zone device folios.
>>>>>
>>>>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>>>>> ---
>>>>>  mm/huge_memory.c | 28 ++++++-----------
>>>>>  mm/internal.h    |  5 +--
>>>>>  mm/memory.c      | 81 +++++++++++++++++++++++-------------------------
>>>>>  3 files changed, 52 insertions(+), 62 deletions(-)
>>>>>
>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>>> index 8c11d6da4b36..66d67d13e0dc 100644
>>>>> --- a/mm/huge_memory.c
>>>>> +++ b/mm/huge_memory.c
>>>>> @@ -1670,10 +1670,10 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>>>>>  	pmd_t pmd;
>>>>>  	struct folio *folio;
>>>>>  	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>>>>> -	int nid = NUMA_NO_NODE;
>>>>> -	int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
>>>>> +	int target_nid = NUMA_NO_NODE;
>>>>> +	int last_cpupid = (-1 & LAST_CPUPID_MASK);
>>>>>  	bool writable = false;
>>>>> -	int flags = 0;
>>>>> +	int flags = 0, nr_pages;
>>>>>
>>>>>  	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>>>>>  	if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
>>>>> @@ -1693,21 +1693,13 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>>>>>  		writable = true;
>>>>>
>>>>>  	folio = vm_normal_folio_pmd(vma, haddr, pmd);
>>>>> -	if (!folio)
>>>>> +	if (!folio || folio_is_zone_device(folio))
>>>>
>>>> This change appears unrelated.  Can we put it in a separate patch?
>>>>
>>>> IIUC, this isn't necessary even in do_numa_page()?  Because in
>>>> change_pte_range(), folio_is_zone_device() has been checked already.
>>>> But it doesn't hurt either.
>>>>
>>>>>  		goto out_map;
>>>>>
>>>>> -	/* See similar comment in do_numa_page for explanation */
>>>>> -	if (!writable)
>>>>> -		flags |= TNF_NO_GROUP;
>>>>> +	nr_pages = folio_nr_pages(folio);
>>>>>
>>>>> -	nid = folio_nid(folio);
>>>>> -	/*
>>>>> -	 * For memory tiering mode, cpupid of slow memory page is used
>>>>> -	 * to record page access time.  So use default value.
>>>>> -	 */
>>>>> -	if (folio_has_cpupid(folio))
>>>>> -		last_cpupid = folio_last_cpupid(folio);
>>>>> -	target_nid = numa_migrate_prep(folio, vmf, haddr, nid, &flags);
>>>>> +	target_nid = numa_migrate_check(folio, vmf, haddr, writable,
>>>>> +			&flags, &last_cpupid);
>>>>>  	if (target_nid == NUMA_NO_NODE)
>>>>>  		goto out_map;
>>>>>  	if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
>>>>> @@ -1720,8 +1712,8 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>>>>>
>>>>>  	if (!migrate_misplaced_folio(folio, vma, target_nid)) {
>>>>>  		flags |= TNF_MIGRATED;
>>>>> -		nid = target_nid;
>>>>>  	} else {
>>>>> +		target_nid = NUMA_NO_NODE;
>>>>>  		flags |= TNF_MIGRATE_FAIL;
>>>>>  		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>>>>>  		if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
>>>>> @@ -1732,8 +1724,8 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>>>>>  	}
>>>>>
>>>>>  out:
>>>>> -	if (nid != NUMA_NO_NODE)
>>>>> -		task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
>>>>> +	if (target_nid != NUMA_NO_NODE)
>>>>> +		task_numa_fault(last_cpupid, target_nid, nr_pages, flags);
>>>>
>>>> This appears to be a behavior change.  IIUC, there are 2 possible issues.
>>>>
>>>> 1) if migrate_misplaced_folio() fails, folio_nid() should be used as
>>>> nid.  "target_nid" as variable name here is confusing, because
>>>> folio_nid() is needed in fact.
>>>>
>>>> 2) if !pmd_same(), task_numa_fault() should be skipped.  The original
>>>> code is buggy.
>>>>
>>>> Similar issues for do_numa_page().
>>>>
>>>> If my understanding is correct, we should implement a separate patch
>>>> to fix 2) above.  And that may need to be backported.
>>>
>>> Hmm, the original code seems OK after I checked the implementation.
>>> There are two possible !pte_same()/!pmd_same() locations:
>>> 1) at the beginning of do_numa_page() and do_huge_pmd_numa_page(), where the faulted
>>> PTE/PMD changed before the folio could be checked: task_numa_fault() should not be
>>> called.
>>
>> Yes.
>>
>>> 2) when migrate_misplaced_folio() failed and the PTE/PMD changed, but the folio
>>> has been determined and checked. task_numa_fault() should be called even if
>>> !pte_same()/!pmd_same().
>>
>> IIUC, if !pte_same()/!pmd_same(), the fault has been processed on
>> another CPU.  For example, do_numa_page()/do_huge_pmd_numa_page() has
>> been called on another CPU and task_numa_fault() has been called for the
>> PTE/PMD already.
>
> Hmm, this behavior at least dates back to 2015 at
> commit 074c238177a7 ("mm: numa: slow PTE scan rate if migration failures occur").
> So cc Mel.
>
> The code is https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/memory.c?id=074c238177a75f5e79af3b2cb6a84e54823ef950#n3102. I have not checked older
> commits.
>
> I wonder how far we should trace back.

OK, I found the commit where the task_numa_fault() policy settled:
8191acbd30c7 ("mm: numa: Sanitize task_numa_fault() callsites").

It says:
"So modify all three sites to always account; we did after all receive
the fault; and always account to where the page is after migration,
regardless of success.", where the three call sites were:
do_huge_pmd_numa_page(), do_numa_page(), and do_pmd_numa_page().

The current code still follows what the commit log describes.


Best Regards,
Yan, Zi
Huang, Ying July 23, 2024, 1:16 a.m. UTC | #7
Zi Yan <ziy@nvidia.com> writes:

> On 22 Jul 2024, at 10:01, Zi Yan wrote:
>
>> On 21 Jul 2024, at 21:47, Huang, Ying wrote:
>>
>>> Zi Yan <ziy@nvidia.com> writes:
>>>
>>>> On 18 Jul 2024, at 4:36, Huang, Ying wrote:
>>>>
>>>>> Zi Yan <zi.yan@sent.com> writes:
>>>>>
>>>>>> From: Zi Yan <ziy@nvidia.com>
>>>>>>
>>>>>> do_numa_page() and do_huge_pmd_numa_page() share a lot of common code. To
>>>>>> reduce redundancy, move common code to numa_migrate_prep() and rename
>>>>>> the function to numa_migrate_check() to reflect its functionality.
>>>>>>
>>>>>> There is some code difference between do_numa_page() and
>>>>>> do_huge_pmd_numa_page() before the code move:
>>>>>>
>>>>>> 1. do_huge_pmd_numa_page() did not check shared folios to set TNF_SHARED.
>>>>>> 2. do_huge_pmd_numa_page() did not check and skip zone device folios.
>>>>>>
>>>>>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>>>>>> ---
>>>>>>  mm/huge_memory.c | 28 ++++++-----------
>>>>>>  mm/internal.h    |  5 +--
>>>>>>  mm/memory.c      | 81 +++++++++++++++++++++++-------------------------
>>>>>>  3 files changed, 52 insertions(+), 62 deletions(-)
>>>>>>
>>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>>>> index 8c11d6da4b36..66d67d13e0dc 100644
>>>>>> --- a/mm/huge_memory.c
>>>>>> +++ b/mm/huge_memory.c
>>>>>> @@ -1670,10 +1670,10 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>>>>>>  	pmd_t pmd;
>>>>>>  	struct folio *folio;
>>>>>>  	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>>>>>> -	int nid = NUMA_NO_NODE;
>>>>>> -	int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
>>>>>> +	int target_nid = NUMA_NO_NODE;
>>>>>> +	int last_cpupid = (-1 & LAST_CPUPID_MASK);
>>>>>>  	bool writable = false;
>>>>>> -	int flags = 0;
>>>>>> +	int flags = 0, nr_pages;
>>>>>>
>>>>>>  	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>>>>>>  	if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
>>>>>> @@ -1693,21 +1693,13 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>>>>>>  		writable = true;
>>>>>>
>>>>>>  	folio = vm_normal_folio_pmd(vma, haddr, pmd);
>>>>>> -	if (!folio)
>>>>>> +	if (!folio || folio_is_zone_device(folio))
>>>>>
>>>>> This change appears unrelated.  Can we put it in a separate patch?
>>>>>
>>>>> IIUC, this isn't necessary even in do_numa_page()?  Because in
>>>>> change_pte_range(), folio_is_zone_device() has been checked already.
>>>>> But it doesn't hurt either.
>>>>>
>>>>>>  		goto out_map;
>>>>>>
>>>>>> -	/* See similar comment in do_numa_page for explanation */
>>>>>> -	if (!writable)
>>>>>> -		flags |= TNF_NO_GROUP;
>>>>>> +	nr_pages = folio_nr_pages(folio);
>>>>>>
>>>>>> -	nid = folio_nid(folio);
>>>>>> -	/*
>>>>>> -	 * For memory tiering mode, cpupid of slow memory page is used
>>>>>> -	 * to record page access time.  So use default value.
>>>>>> -	 */
>>>>>> -	if (folio_has_cpupid(folio))
>>>>>> -		last_cpupid = folio_last_cpupid(folio);
>>>>>> -	target_nid = numa_migrate_prep(folio, vmf, haddr, nid, &flags);
>>>>>> +	target_nid = numa_migrate_check(folio, vmf, haddr, writable,
>>>>>> +			&flags, &last_cpupid);
>>>>>>  	if (target_nid == NUMA_NO_NODE)
>>>>>>  		goto out_map;
>>>>>>  	if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
>>>>>> @@ -1720,8 +1712,8 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>>>>>>
>>>>>>  	if (!migrate_misplaced_folio(folio, vma, target_nid)) {
>>>>>>  		flags |= TNF_MIGRATED;
>>>>>> -		nid = target_nid;
>>>>>>  	} else {
>>>>>> +		target_nid = NUMA_NO_NODE;
>>>>>>  		flags |= TNF_MIGRATE_FAIL;
>>>>>>  		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>>>>>>  		if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
>>>>>> @@ -1732,8 +1724,8 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>>>>>>  	}
>>>>>>
>>>>>>  out:
>>>>>> -	if (nid != NUMA_NO_NODE)
>>>>>> -		task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
>>>>>> +	if (target_nid != NUMA_NO_NODE)
>>>>>> +		task_numa_fault(last_cpupid, target_nid, nr_pages, flags);
>>>>>
>>>>> This appears to be a behavior change.  IIUC, there are 2 possible issues.
>>>>>
>>>>> 1) if migrate_misplaced_folio() fails, folio_nid() should be used as
>>>>> nid.  "target_nid" as variable name here is confusing, because
>>>>> folio_nid() is needed in fact.
>>>>>
>>>>> 2) if !pmd_same(), task_numa_fault() should be skipped.  The original
>>>>> code is buggy.
>>>>>
>>>>> Similar issues for do_numa_page().
>>>>>
>>>>> If my understanding is correct, we should implement a separate patch
>>>>> to fix 2) above.  And that may need to be backported.
>>>>
>>>> Hmm, the original code seems OK after I checked the implementation.
>>>> There are two possible !pte_same()/!pmd_same() locations:
>>>> 1) at the beginning of do_numa_page() and do_huge_pmd_numa_page(), where the faulted
>>>> PTE/PMD changed before the folio could be checked: task_numa_fault() should not be
>>>> called.
>>>
>>> Yes.
>>>
>>>> 2) when migrate_misplaced_folio() failed and the PTE/PMD changed, but the folio
>>>> has been determined and checked. task_numa_fault() should be called even if
>>>> !pte_same()/!pmd_same().
>>>
>>> IIUC, if !pte_same()/!pmd_same(), the fault has been processed on
>>> another CPU.  For example, do_numa_page()/do_huge_pmd_numa_page() has
>>> been called on another CPU and task_numa_fault() has been called for the
>>> PTE/PMD already.
>>
>> Hmm, this behavior at least dates back to 2015 at
>> commit 074c238177a7 ("mm: numa: slow PTE scan rate if migration failures occur").
>> So cc Mel.
>>
>> The code is https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/memory.c?id=074c238177a75f5e79af3b2cb6a84e54823ef950#n3102. I have not checked older
>> commits.
>>
>> I wonder how far we should trace back.
>
> OK, I found the commit where the task_numa_fault() policy settled:
> 8191acbd30c7 ("mm: numa: Sanitize task_numa_fault() callsites").
>
> It says:
> "So modify all three sites to always account; we did after all receive
> the fault; and always account to where the page is after migration,
> regardless of success.", where the three call sites were:
> do_huge_pmd_numa_page(), do_numa_page(), and do_pmd_numa_page().
>
> The current code still follows what the commit log describes.

Per my understanding, the issue was introduced in commit b99a342d4f11
("NUMA balancing: reduce TLB flush via delaying mapping on hint page
fault").  Before that, the PTE was restored before migration, so
task_numa_fault() should be called on migration failure too.  After
that, the PTE is restored only after migration failure; if the PTE has
been changed by someone else in the meantime, they should have called
task_numa_fault() if necessary, and we shouldn't call it again.
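
Schematically (pseudocode only; helper names below are made up for
illustration, not the actual functions in either version):

	/* before b99a342d4f11 */
	restore_pte_and_unlock();	/* hint PTE mapped back first */
	migrate_misplaced_page(...);	/* failure cannot race with the PTE */
	task_numa_fault(...);		/* accounting is always valid */

	/* after b99a342d4f11 */
	if (migrate_misplaced_folio(...)) {	/* migration failed */
		relock_ptl();
		if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte))
			return 0;	/* someone else completed the fault
					 * and already accounted it */
		restore_pte_and_unlock();
	}
	task_numa_fault(...);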

--
Best Regards,
Huang, Ying
Zi Yan July 23, 2024, 1:43 a.m. UTC | #8
On Mon Jul 22, 2024 at 9:16 PM EDT, Huang, Ying wrote:
> Zi Yan <ziy@nvidia.com> writes:
>
> > On 22 Jul 2024, at 10:01, Zi Yan wrote:
> >
> >> On 21 Jul 2024, at 21:47, Huang, Ying wrote:
> >>
> >>> Zi Yan <ziy@nvidia.com> writes:
> >>>
> >>>> On 18 Jul 2024, at 4:36, Huang, Ying wrote:
> >>>>
> >>>>> Zi Yan <zi.yan@sent.com> writes:
> >>>>>
> >>>>>> From: Zi Yan <ziy@nvidia.com>
> >>>>>>
> >>>>>> do_numa_page() and do_huge_pmd_numa_page() share a lot of common code. To
> >>>>>> reduce redundancy, move common code to numa_migrate_prep() and rename
> >>>>>> the function to numa_migrate_check() to reflect its functionality.
> >>>>>>
> >>>>>> There is some code difference between do_numa_page() and
> >>>>>> do_huge_pmd_numa_page() before the code move:
> >>>>>>
> >>>>>> 1. do_huge_pmd_numa_page() did not check shared folios to set TNF_SHARED.
> >>>>>> 2. do_huge_pmd_numa_page() did not check and skip zone device folios.
> >>>>>>
> >>>>>> Signed-off-by: Zi Yan <ziy@nvidia.com>
> >>>>>> ---
> >>>>>>  mm/huge_memory.c | 28 ++++++-----------
> >>>>>>  mm/internal.h    |  5 +--
> >>>>>>  mm/memory.c      | 81 +++++++++++++++++++++++-------------------------
> >>>>>>  3 files changed, 52 insertions(+), 62 deletions(-)
> >>>>>>
> >>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >>>>>> index 8c11d6da4b36..66d67d13e0dc 100644
> >>>>>> --- a/mm/huge_memory.c
> >>>>>> +++ b/mm/huge_memory.c
> >>>>>> @@ -1670,10 +1670,10 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
> >>>>>>  	pmd_t pmd;
> >>>>>>  	struct folio *folio;
> >>>>>>  	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> >>>>>> -	int nid = NUMA_NO_NODE;
> >>>>>> -	int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
> >>>>>> +	int target_nid = NUMA_NO_NODE;
> >>>>>> +	int last_cpupid = (-1 & LAST_CPUPID_MASK);
> >>>>>>  	bool writable = false;
> >>>>>> -	int flags = 0;
> >>>>>> +	int flags = 0, nr_pages;
> >>>>>>
> >>>>>>  	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
> >>>>>>  	if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
> >>>>>> @@ -1693,21 +1693,13 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
> >>>>>>  		writable = true;
> >>>>>>
> >>>>>>  	folio = vm_normal_folio_pmd(vma, haddr, pmd);
> >>>>>> -	if (!folio)
> >>>>>> +	if (!folio || folio_is_zone_device(folio))
> >>>>>
> >>>>> This change appears unrelated.  Can we put it in a separate patch?
> >>>>>
> >>>>> IIUC, this isn't necessary even in do_numa_page()?  Because in
> >>>>> change_pte_range(), folio_is_zone_device() has been checked already.
> >>>>> But it doesn't hurt either.
> >>>>>
> >>>>>>  		goto out_map;
> >>>>>>
> >>>>>> -	/* See similar comment in do_numa_page for explanation */
> >>>>>> -	if (!writable)
> >>>>>> -		flags |= TNF_NO_GROUP;
> >>>>>> +	nr_pages = folio_nr_pages(folio);
> >>>>>>
> >>>>>> -	nid = folio_nid(folio);
> >>>>>> -	/*
> >>>>>> -	 * For memory tiering mode, cpupid of slow memory page is used
> >>>>>> -	 * to record page access time.  So use default value.
> >>>>>> -	 */
> >>>>>> -	if (folio_has_cpupid(folio))
> >>>>>> -		last_cpupid = folio_last_cpupid(folio);
> >>>>>> -	target_nid = numa_migrate_prep(folio, vmf, haddr, nid, &flags);
> >>>>>> +	target_nid = numa_migrate_check(folio, vmf, haddr, writable,
> >>>>>> +			&flags, &last_cpupid);
> >>>>>>  	if (target_nid == NUMA_NO_NODE)
> >>>>>>  		goto out_map;
> >>>>>>  	if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
> >>>>>> @@ -1720,8 +1712,8 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
> >>>>>>
> >>>>>>  	if (!migrate_misplaced_folio(folio, vma, target_nid)) {
> >>>>>>  		flags |= TNF_MIGRATED;
> >>>>>> -		nid = target_nid;
> >>>>>>  	} else {
> >>>>>> +		target_nid = NUMA_NO_NODE;
> >>>>>>  		flags |= TNF_MIGRATE_FAIL;
> >>>>>>  		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
> >>>>>>  		if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
> >>>>>> @@ -1732,8 +1724,8 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
> >>>>>>  	}
> >>>>>>
> >>>>>>  out:
> >>>>>> -	if (nid != NUMA_NO_NODE)
> >>>>>> -		task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
> >>>>>> +	if (target_nid != NUMA_NO_NODE)
> >>>>>> +		task_numa_fault(last_cpupid, target_nid, nr_pages, flags);
> >>>>>
> >>>>> This appears to be a behavior change.  IIUC, there are 2 possible issues.
> >>>>>
> >>>>> 1) if migrate_misplaced_folio() fails, folio_nid() should be used as
> >>>>> nid.  "target_nid" as variable name here is confusing, because
> >>>>> folio_nid() is needed in fact.
> >>>>>
> >>>>> 2) if !pmd_same(), task_numa_fault() should be skipped.  The original
> >>>>> code is buggy.
> >>>>>
> >>>>> Similar issues for do_numa_page().
> >>>>>
> >>>>> If my understanding is correct, we should implement a separate patch
> >>>>> to fix 2) above.  And that may need to be backported.
> >>>>
> >>>> Hmm, the original code seems OK after I checked the implementation.
> >>>> There are two possible !pte_same()/!pmd_same() locations:
> >>>> 1) at the beginning of do_numa_page() and do_huge_pmd_numa_page(), where the faulted
> >>>> PTE/PMD changed before the folio could be checked: task_numa_fault() should not be
> >>>> called.
> >>>
> >>> Yes.
> >>>
> >>>> 2) when migrate_misplaced_folio() failed and the PTE/PMD changed, but the folio
> >>>> has been determined and checked. task_numa_fault() should be called even if
> >>>> !pte_same()/!pmd_same().
> >>>
> >>> IIUC, if !pte_same()/!pmd_same(), the fault has been processed on
> >>> another CPU.  For example, do_numa_page()/do_huge_pmd_numa_page() has
> >>> been called on another CPU and task_numa_fault() has been called for the
> >>> PTE/PMD already.
> >>
> >> Hmm, this behavior at least dates back to 2015 at
> >> commit 074c238177a7 ("mm: numa: slow PTE scan rate if migration failures occur").
> >> So cc Mel.
> >>
> >> The code is https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/memory.c?id=074c238177a75f5e79af3b2cb6a84e54823ef950#n3102. I have not checked older
> >> commits.
> >>
> >> I wonder how far we should trace back.
> >
> > OK, I found the commit where the task_numa_fault() policy settled:
> > 8191acbd30c7 ("mm: numa: Sanitize task_numa_fault() callsites").
> >
> > It says:
> > "So modify all three sites to always account; we did after all receive
> > the fault; and always account to where the page is after migration,
> > regardless of success.", where the three call sites were:
> > do_huge_pmd_numa_page(), do_numa_page(), and do_pmd_numa_page().
> >
> > The current code still follows what the commit log describes.
>
> Per my understanding, the issue was introduced in commit b99a342d4f11
> ("NUMA balancing: reduce TLB flush via delaying mapping on hint page
> fault").  Before that, the PTE was restored before migration, so
> task_numa_fault() should be called on migration failure too.  After
> that, the PTE is restored only after migration failure; if the PTE has
> been changed by someone else in the meantime, they should have called
> task_numa_fault() if necessary, and we shouldn't call it again.

You are right. Will fix the issue. Thank you for the explanation.

Patch

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8c11d6da4b36..66d67d13e0dc 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1670,10 +1670,10 @@  vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
 	pmd_t pmd;
 	struct folio *folio;
 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
-	int nid = NUMA_NO_NODE;
-	int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
+	int target_nid = NUMA_NO_NODE;
+	int last_cpupid = (-1 & LAST_CPUPID_MASK);
 	bool writable = false;
-	int flags = 0;
+	int flags = 0, nr_pages;
 
 	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
 	if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
@@ -1693,21 +1693,13 @@  vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
 		writable = true;
 
 	folio = vm_normal_folio_pmd(vma, haddr, pmd);
-	if (!folio)
+	if (!folio || folio_is_zone_device(folio))
 		goto out_map;
 
-	/* See similar comment in do_numa_page for explanation */
-	if (!writable)
-		flags |= TNF_NO_GROUP;
+	nr_pages = folio_nr_pages(folio);
 
-	nid = folio_nid(folio);
-	/*
-	 * For memory tiering mode, cpupid of slow memory page is used
-	 * to record page access time.  So use default value.
-	 */
-	if (folio_has_cpupid(folio))
-		last_cpupid = folio_last_cpupid(folio);
-	target_nid = numa_migrate_prep(folio, vmf, haddr, nid, &flags);
+	target_nid = numa_migrate_check(folio, vmf, haddr, writable,
+			&flags, &last_cpupid);
 	if (target_nid == NUMA_NO_NODE)
 		goto out_map;
 	if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
@@ -1720,8 +1712,8 @@  vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
 
 	if (!migrate_misplaced_folio(folio, vma, target_nid)) {
 		flags |= TNF_MIGRATED;
-		nid = target_nid;
 	} else {
+		target_nid = NUMA_NO_NODE;
 		flags |= TNF_MIGRATE_FAIL;
 		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
 		if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
@@ -1732,8 +1724,8 @@  vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
 	}
 
 out:
-	if (nid != NUMA_NO_NODE)
-		task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
+	if (target_nid != NUMA_NO_NODE)
+		task_numa_fault(last_cpupid, target_nid, nr_pages, flags);
 
 	return 0;
 
diff --git a/mm/internal.h b/mm/internal.h
index b4d86436565b..7782b7bb3383 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1217,8 +1217,9 @@  void vunmap_range_noflush(unsigned long start, unsigned long end);
 
 void __vunmap_range_noflush(unsigned long start, unsigned long end);
 
-int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf,
-		      unsigned long addr, int page_nid, int *flags);
+int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
+		      unsigned long addr, bool writable,
+		      int *flags, int *last_cpupid);
 
 void free_zone_device_folio(struct folio *folio);
 int migrate_device_coherent_page(struct page *page);
diff --git a/mm/memory.c b/mm/memory.c
index 96c2f5b3d796..a252c0f13755 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5209,16 +5209,42 @@  static vm_fault_t do_fault(struct vm_fault *vmf)
 	return ret;
 }
 
-int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf,
-		      unsigned long addr, int page_nid, int *flags)
+int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
+		      unsigned long addr, bool writable,
+		      int *flags, int *last_cpupid)
 {
 	struct vm_area_struct *vma = vmf->vma;
 
+	/*
+	 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
+	 * much anyway since they can be in shared cache state. This misses
+	 * the case where a mapping is writable but the process never writes
+	 * to it but pte_write gets cleared during protection updates and
+	 * pte_dirty has unpredictable behaviour between PTE scan updates,
+	 * background writeback, dirty balancing and application behaviour.
+	 */
+	if (!writable)
+		*flags |= TNF_NO_GROUP;
+
+	/*
+	 * Flag if the folio is shared between multiple address spaces. This
+	 * is later used when determining whether to group tasks together
+	 */
+	if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
+		*flags |= TNF_SHARED;
+
+	/*
+	 * For memory tiering mode, cpupid of slow memory page is used
+	 * to record page access time.
+	 */
+	if (folio_has_cpupid(folio))
+		*last_cpupid = folio_last_cpupid(folio);
+
 	/* Record the current PID acceesing VMA */
 	vma_set_access_pid_bit(vma);
 
 	count_vm_numa_event(NUMA_HINT_FAULTS);
-	if (page_nid == numa_node_id()) {
+	if (folio_nid(folio) == numa_node_id()) {
 		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
 		*flags |= TNF_FAULT_LOCAL;
 	}
@@ -5284,12 +5310,11 @@  static vm_fault_t do_numa_page(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct folio *folio = NULL;
-	int nid = NUMA_NO_NODE;
+	int target_nid = NUMA_NO_NODE;
 	bool writable = false, ignore_writable = false;
 	bool pte_write_upgrade = vma_wants_manual_pte_write_upgrade(vma);
-	int last_cpupid;
-	int target_nid;
-	pte_t pte, old_pte;
+	int last_cpupid = (-1 & LAST_CPUPID_MASK);
+	pte_t pte, old_pte = vmf->orig_pte;
 	int flags = 0, nr_pages;
 
 	/*
@@ -5297,10 +5322,7 @@  static vm_fault_t do_numa_page(struct vm_fault *vmf)
 	 * table lock, that its contents have not changed during fault handling.
 	 */
 	spin_lock(vmf->ptl);
-	/* Read the live PTE from the page tables: */
-	old_pte = ptep_get(vmf->pte);
-
-	if (unlikely(!pte_same(old_pte, vmf->orig_pte))) {
+	if (unlikely(!pte_same(old_pte, *vmf->pte))) {
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
 		goto out;
 	}
@@ -5320,35 +5342,10 @@  static vm_fault_t do_numa_page(struct vm_fault *vmf)
 	if (!folio || folio_is_zone_device(folio))
 		goto out_map;
 
-	/*
-	 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
-	 * much anyway since they can be in shared cache state. This misses
-	 * the case where a mapping is writable but the process never writes
-	 * to it but pte_write gets cleared during protection updates and
-	 * pte_dirty has unpredictable behaviour between PTE scan updates,
-	 * background writeback, dirty balancing and application behaviour.
-	 */
-	if (!writable)
-		flags |= TNF_NO_GROUP;
-
-	/*
-	 * Flag if the folio is shared between multiple address spaces. This
-	 * is later used when determining whether to group tasks together
-	 */
-	if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
-		flags |= TNF_SHARED;
-
-	nid = folio_nid(folio);
 	nr_pages = folio_nr_pages(folio);
-	/*
-	 * For memory tiering mode, cpupid of slow memory page is used
-	 * to record page access time.  So use default value.
-	 */
-	if (!folio_has_cpupid(folio))
-		last_cpupid = (-1 & LAST_CPUPID_MASK);
-	else
-		last_cpupid = folio_last_cpupid(folio);
-	target_nid = numa_migrate_prep(folio, vmf, vmf->address, nid, &flags);
+
+	target_nid = numa_migrate_check(folio, vmf, vmf->address, writable,
+			&flags, &last_cpupid);
 	if (target_nid == NUMA_NO_NODE)
 		goto out_map;
 	if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
@@ -5362,9 +5359,9 @@  static vm_fault_t do_numa_page(struct vm_fault *vmf)
 
 	/* Migrate to the requested node */
 	if (!migrate_misplaced_folio(folio, vma, target_nid)) {
-		nid = target_nid;
 		flags |= TNF_MIGRATED;
 	} else {
+		target_nid = NUMA_NO_NODE;
 		flags |= TNF_MIGRATE_FAIL;
 		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
 					       vmf->address, &vmf->ptl);
@@ -5378,8 +5375,8 @@  static vm_fault_t do_numa_page(struct vm_fault *vmf)
 	}
 
 out:
-	if (nid != NUMA_NO_NODE)
-		task_numa_fault(last_cpupid, nid, nr_pages, flags);
+	if (target_nid != NUMA_NO_NODE)
+		task_numa_fault(last_cpupid, target_nid, nr_pages, flags);
 	return 0;
 out_map:
 	/*