
[1/5] rmap: move hugetlb try_to_unmap to dedicated function

Message ID 20230223083200.3149015-2-fengwei.yin@intel.com (mailing list archive)
State New
Series batched remove rmap in try_to_unmap_one()

Commit Message

Yin, Fengwei Feb. 23, 2023, 8:31 a.m. UTC
This is to prepare for the batched rmap update for large folios.
There is no need to handle hugetlb in the loop; just handle hugetlb
and bail out early.

Almost no functional change. Just one change to mm counter
update.

Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
---
 mm/rmap.c | 205 +++++++++++++++++++++++++++++++++---------------------
 1 file changed, 126 insertions(+), 79 deletions(-)
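
The key structural change in try_to_unmap_one() is the early hugetlb
bail-out, excerpted here from the diff below:

		address = pvmw.address;
		if (folio_test_hugetlb(folio)) {
			ret = try_to_unmap_one_hugetlb(folio, vma, range,
							pvmw, address, flags);

			/* no need to loop for hugetlb */
			page_vma_mapped_walk_done(&pvmw);
			break;
		}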

Comments

Matthew Wilcox Feb. 23, 2023, 5:28 p.m. UTC | #1
On Thu, Feb 23, 2023 at 04:31:56PM +0800, Yin Fengwei wrote:
> It's to prepare the batched rmap update for large folio.
> No need to looped handle hugetlb. Just handle hugetlb and
> bail out early.
> 
> Almost no functional change. Just one change to mm counter
> update.

This looks like a very worthwhile cleanup in its own right.  Adding
various hugetlb & memory poisoning experts for commentary on the mm
counter change (search for three asterisks)

> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
> ---
>  mm/rmap.c | 205 +++++++++++++++++++++++++++++++++---------------------
>  1 file changed, 126 insertions(+), 79 deletions(-)
> 
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 15ae24585fc4..e7aa63b800f7 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1443,6 +1443,108 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma,
>  	munlock_vma_folio(folio, vma, compound);
>  }
>  
> +static bool try_to_unmap_one_hugetlb(struct folio *folio,
> +		struct vm_area_struct *vma, struct mmu_notifier_range range,
> +		struct page_vma_mapped_walk pvmw, unsigned long address,
> +		enum ttu_flags flags)
> +{
> +	struct mm_struct *mm = vma->vm_mm;
> +	pte_t pteval;
> +	bool ret = true, anon = folio_test_anon(folio);
> +
> +	/*
> +	 * The try_to_unmap() is only passed a hugetlb page
> +	 * in the case where the hugetlb page is poisoned.
> +	 */
> +	VM_BUG_ON_FOLIO(!folio_test_hwpoison(folio), folio);
> +	/*
> +	 * huge_pmd_unshare may unmap an entire PMD page.
> +	 * There is no way of knowing exactly which PMDs may
> +	 * be cached for this mm, so we must flush them all.
> +	 * start/end were already adjusted above to cover this
> +	 * range.
> +	 */
> +	flush_cache_range(vma, range.start, range.end);
> +
> +	/*
> +	 * To call huge_pmd_unshare, i_mmap_rwsem must be
> +	 * held in write mode.  Caller needs to explicitly
> +	 * do this outside rmap routines.
> +	 *
> +	 * We also must hold hugetlb vma_lock in write mode.
> +	 * Lock order dictates acquiring vma_lock BEFORE
> +	 * i_mmap_rwsem.  We can only try lock here and fail
> +	 * if unsuccessful.
> +	 */
> +	if (!anon) {
> +		VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
> +		if (!hugetlb_vma_trylock_write(vma)) {
> +			ret = false;
> +			goto out;
> +		}
> +		if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
> +			hugetlb_vma_unlock_write(vma);
> +			flush_tlb_range(vma,
> +					range.start, range.end);
> +			mmu_notifier_invalidate_range(mm,
> +					range.start, range.end);
> +			/*
> +			 * The ref count of the PMD page was
> +			 * dropped which is part of the way map
> +			 * counting is done for shared PMDs.
> +			 * Return 'true' here.  When there is
> +			 * no other sharing, huge_pmd_unshare
> +			 * returns false and we will unmap the
> +			 * actual page and drop map count
> +			 * to zero.
> +			 */
> +			goto out;
> +		}
> +		hugetlb_vma_unlock_write(vma);
> +	}
> +	pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
> +
> +	/*
> +	 * Now the pte is cleared. If this pte was uffd-wp armed,
> +	 * we may want to replace a none pte with a marker pte if
> +	 * it's file-backed, so we don't lose the tracking info.
> +	 */
> +	pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);
> +
> +	/* Set the dirty flag on the folio now the pte is gone. */
> +	if (pte_dirty(pteval))
> +		folio_mark_dirty(folio);
> +
> +	/* Update high watermark before we lower rss */
> +	update_hiwater_rss(mm);
> +
> +	if (folio_test_hwpoison(folio) && !(flags & TTU_IGNORE_HWPOISON)) {
> +		pteval = swp_entry_to_pte(make_hwpoison_entry(&folio->page));
> +		set_huge_pte_at(mm, address, pvmw.pte, pteval);
> +	}
> +
> +	/*** try_to_unmap_one() called dec_mm_counter for
> +	 * (folio_test_hwpoison(folio) && !(flags & TTU_IGNORE_HWPOISON)) not
> +	 * true case, looks incorrect. Change it to hugetlb_count_sub() here.
> +	 */
> +	hugetlb_count_sub(folio_nr_pages(folio), mm);
> +
> +	/*
> +	 * No need to call mmu_notifier_invalidate_range() it has be
> +	 * done above for all cases requiring it to happen under page
> +	 * table lock before mmu_notifier_invalidate_range_end()
> +	 *
> +	 * See Documentation/mm/mmu_notifier.rst
> +	 */
> +	page_remove_rmap(&folio->page, vma, folio_test_hugetlb(folio));
> +	if (vma->vm_flags & VM_LOCKED)
> +		mlock_drain_local();
> +	folio_put(folio);
> +
> +out:
> +	return ret;
> +}
> +
>  /*
>   * @arg: enum ttu_flags will be passed to this argument
>   */
> @@ -1506,86 +1608,37 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
>  			break;
>  		}
>  
> +		address = pvmw.address;
> +		if (folio_test_hugetlb(folio)) {
> +			ret = try_to_unmap_one_hugetlb(folio, vma, range,
> +							pvmw, address, flags);
> +
> +			/* no need to loop for hugetlb */
> +			page_vma_mapped_walk_done(&pvmw);
> +			break;
> +		}
> +
>  		subpage = folio_page(folio,
>  					pte_pfn(*pvmw.pte) - folio_pfn(folio));
> -		address = pvmw.address;
>  		anon_exclusive = folio_test_anon(folio) &&
>  				 PageAnonExclusive(subpage);
>  
> -		if (folio_test_hugetlb(folio)) {
> -			bool anon = folio_test_anon(folio);
> -
> +		flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
> +		/* Nuke the page table entry. */
> +		if (should_defer_flush(mm, flags)) {
>  			/*
> -			 * The try_to_unmap() is only passed a hugetlb page
> -			 * in the case where the hugetlb page is poisoned.
> +			 * We clear the PTE but do not flush so potentially
> +			 * a remote CPU could still be writing to the folio.
> +			 * If the entry was previously clean then the
> +			 * architecture must guarantee that a clear->dirty
> +			 * transition on a cached TLB entry is written through
> +			 * and traps if the PTE is unmapped.
>  			 */
> -			VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage);
> -			/*
> -			 * huge_pmd_unshare may unmap an entire PMD page.
> -			 * There is no way of knowing exactly which PMDs may
> -			 * be cached for this mm, so we must flush them all.
> -			 * start/end were already adjusted above to cover this
> -			 * range.
> -			 */
> -			flush_cache_range(vma, range.start, range.end);
> +			pteval = ptep_get_and_clear(mm, address, pvmw.pte);
>  
> -			/*
> -			 * To call huge_pmd_unshare, i_mmap_rwsem must be
> -			 * held in write mode.  Caller needs to explicitly
> -			 * do this outside rmap routines.
> -			 *
> -			 * We also must hold hugetlb vma_lock in write mode.
> -			 * Lock order dictates acquiring vma_lock BEFORE
> -			 * i_mmap_rwsem.  We can only try lock here and fail
> -			 * if unsuccessful.
> -			 */
> -			if (!anon) {
> -				VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
> -				if (!hugetlb_vma_trylock_write(vma)) {
> -					page_vma_mapped_walk_done(&pvmw);
> -					ret = false;
> -					break;
> -				}
> -				if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
> -					hugetlb_vma_unlock_write(vma);
> -					flush_tlb_range(vma,
> -						range.start, range.end);
> -					mmu_notifier_invalidate_range(mm,
> -						range.start, range.end);
> -					/*
> -					 * The ref count of the PMD page was
> -					 * dropped which is part of the way map
> -					 * counting is done for shared PMDs.
> -					 * Return 'true' here.  When there is
> -					 * no other sharing, huge_pmd_unshare
> -					 * returns false and we will unmap the
> -					 * actual page and drop map count
> -					 * to zero.
> -					 */
> -					page_vma_mapped_walk_done(&pvmw);
> -					break;
> -				}
> -				hugetlb_vma_unlock_write(vma);
> -			}
> -			pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
> +			set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
>  		} else {
> -			flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
> -			/* Nuke the page table entry. */
> -			if (should_defer_flush(mm, flags)) {
> -				/*
> -				 * We clear the PTE but do not flush so potentially
> -				 * a remote CPU could still be writing to the folio.
> -				 * If the entry was previously clean then the
> -				 * architecture must guarantee that a clear->dirty
> -				 * transition on a cached TLB entry is written through
> -				 * and traps if the PTE is unmapped.
> -				 */
> -				pteval = ptep_get_and_clear(mm, address, pvmw.pte);
> -
> -				set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
> -			} else {
> -				pteval = ptep_clear_flush(vma, address, pvmw.pte);
> -			}
> +			pteval = ptep_clear_flush(vma, address, pvmw.pte);
>  		}
>  
>  		/*
> @@ -1604,14 +1657,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
>  
>  		if (PageHWPoison(subpage) && !(flags & TTU_IGNORE_HWPOISON)) {
>  			pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
> -			if (folio_test_hugetlb(folio)) {
> -				hugetlb_count_sub(folio_nr_pages(folio), mm);
> -				set_huge_pte_at(mm, address, pvmw.pte, pteval);
> -			} else {
> -				dec_mm_counter(mm, mm_counter(&folio->page));
> -				set_pte_at(mm, address, pvmw.pte, pteval);
> -			}
> -
> +			dec_mm_counter(mm, mm_counter(&folio->page));
> +			set_pte_at(mm, address, pvmw.pte, pteval);
>  		} else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
>  			/*
>  			 * The guest indicated that the page content is of no
> -- 
> 2.30.2
>
Mike Kravetz Feb. 24, 2023, 12:20 a.m. UTC | #2
On 02/23/23 17:28, Matthew Wilcox wrote:
> On Thu, Feb 23, 2023 at 04:31:56PM +0800, Yin Fengwei wrote:
> > It's to prepare the batched rmap update for large folio.
> > No need to looped handle hugetlb. Just handle hugetlb and
> > bail out early.
> > 
> > Almost no functional change. Just one change to mm counter
> > update.
> 
> This looks like a very worthwhile cleanup in its own right.  Adding
> various hugetlb & memory poisoning experts for commentary on the mm
> counter change (search for three asterisks)

The counter change looks correct to me.
However, there is a bunch of code in the else cases for

	if (folio_test_hwpoison(folio) && !(flags & TTU_IGNORE_HWPOISON))

in try_to_unmap_one.  Are you sure none of it applies to hugetlb pages?
It seems like some of the code in the

	} else if (folio_test_anon(folio)) {

could apply.  pte_uffd_wp perhaps?

Since you are cleaning up, I think the following may apply ...

> > +static bool try_to_unmap_one_hugetlb(struct folio *folio,
> > +		struct vm_area_struct *vma, struct mmu_notifier_range range,
> > +		struct page_vma_mapped_walk pvmw, unsigned long address,
> > +		enum ttu_flags flags)
> > +{
> > +	struct mm_struct *mm = vma->vm_mm;
> > +	pte_t pteval;
> > +	bool ret = true, anon = folio_test_anon(folio);
> > +
> > +	/*
> > +	 * The try_to_unmap() is only passed a hugetlb page
> > +	 * in the case where the hugetlb page is poisoned.
> > +	 */
> > +	VM_BUG_ON_FOLIO(!folio_test_hwpoison(folio), folio);
> > +	/*
> > +	 * huge_pmd_unshare may unmap an entire PMD page.
> > +	 * There is no way of knowing exactly which PMDs may
> > +	 * be cached for this mm, so we must flush them all.
> > +	 * start/end were already adjusted above to cover this
> > +	 * range.
> > +	 */
> > +	flush_cache_range(vma, range.start, range.end);
> > +
> > +	/*
> > +	 * To call huge_pmd_unshare, i_mmap_rwsem must be
> > +	 * held in write mode.  Caller needs to explicitly
> > +	 * do this outside rmap routines.
> > +	 *
> > +	 * We also must hold hugetlb vma_lock in write mode.
> > +	 * Lock order dictates acquiring vma_lock BEFORE
> > +	 * i_mmap_rwsem.  We can only try lock here and fail
> > +	 * if unsuccessful.
> > +	 */
> > +	if (!anon) {
> > +		VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
> > +		if (!hugetlb_vma_trylock_write(vma)) {
> > +			ret = false;
> > +			goto out;
> > +		}
> > +		if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
> > +			hugetlb_vma_unlock_write(vma);
> > +			flush_tlb_range(vma,
> > +					range.start, range.end);
> > +			mmu_notifier_invalidate_range(mm,
> > +					range.start, range.end);
> > +			/*
> > +			 * The ref count of the PMD page was
> > +			 * dropped which is part of the way map
> > +			 * counting is done for shared PMDs.
> > +			 * Return 'true' here.  When there is
> > +			 * no other sharing, huge_pmd_unshare
> > +			 * returns false and we will unmap the
> > +			 * actual page and drop map count
> > +			 * to zero.
> > +			 */
> > +			goto out;
> > +		}
> > +		hugetlb_vma_unlock_write(vma);
> > +	}
> > +	pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
> > +
> > +	/*
> > +	 * Now the pte is cleared. If this pte was uffd-wp armed,
> > +	 * we may want to replace a none pte with a marker pte if
> > +	 * it's file-backed, so we don't lose the tracking info.
> > +	 */
> > +	pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);
> > +
> > +	/* Set the dirty flag on the folio now the pte is gone. */
> > +	if (pte_dirty(pteval))
> > +		folio_mark_dirty(folio);
> > +
> > +	/* Update high watermark before we lower rss */
> > +	update_hiwater_rss(mm);
> > +
> > +	if (folio_test_hwpoison(folio) && !(flags & TTU_IGNORE_HWPOISON)) {

I'm guessing this can just be,
	if (!(flags & TTU_IGNORE_HWPOISON))

as we only get here in the poison case as indicated by,
	VM_BUG_ON_FOLIO(!folio_test_hwpoison(folio), folio); 
above.
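
An untested sketch of that hunk with the redundant folio_test_hwpoison()
check dropped:

	if (!(flags & TTU_IGNORE_HWPOISON)) {
		pteval = swp_entry_to_pte(make_hwpoison_entry(&folio->page));
		set_huge_pte_at(mm, address, pvmw.pte, pteval);
	}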

> > +		pteval = swp_entry_to_pte(make_hwpoison_entry(&folio->page));
> > +		set_huge_pte_at(mm, address, pvmw.pte, pteval);
> > +	}
> > +
> > +	/*** try_to_unmap_one() called dec_mm_counter for
> > +	 * (folio_test_hwpoison(folio) && !(flags & TTU_IGNORE_HWPOISON)) not
> > +	 * true case, looks incorrect. Change it to hugetlb_count_sub() here.
> > +	 */
> > +	hugetlb_count_sub(folio_nr_pages(folio), mm);
> > +
> > +	/*
> > +	 * No need to call mmu_notifier_invalidate_range() it has be
> > +	 * done above for all cases requiring it to happen under page
> > +	 * table lock before mmu_notifier_invalidate_range_end()
> > +	 *
> > +	 * See Documentation/mm/mmu_notifier.rst
> > +	 */
> > +	page_remove_rmap(&folio->page, vma, folio_test_hugetlb(folio));

Always compound/hugetlb, so could be:

	page_remove_rmap(&folio->page, vma, true);

> > +	if (vma->vm_flags & VM_LOCKED)
> > +		mlock_drain_local();

Since hugetlb pages are always memory resident, I do not think the above
is necessary.

> > +	folio_put(folio);
> > +
> > +out:
> > +	return ret;
> > +}
Yin, Fengwei Feb. 24, 2023, 12:52 a.m. UTC | #3
On 2/24/2023 8:20 AM, Mike Kravetz wrote:
> On 02/23/23 17:28, Matthew Wilcox wrote:
>> On Thu, Feb 23, 2023 at 04:31:56PM +0800, Yin Fengwei wrote:
>>> It's to prepare the batched rmap update for large folio.
>>> No need to looped handle hugetlb. Just handle hugetlb and
>>> bail out early.
>>>
>>> Almost no functional change. Just one change to mm counter
>>> update.
>>
>> This looks like a very worthwhile cleanup in its own right.  Adding
>> various hugetlb & memory poisoning experts for commentary on the mm
>> counter change (search for three asterisks)
> 
> The counter change looks correct to me.
> However, there is a bunch of code in the else cases for
> 
> 	if (folio_test_hwpoison(folio) && !(flags & TTU_IGNORE_HWPOISON))
> 
> in try_to_unmap_one.  Are you sure none of it applies to hugetlb pages?
> It seems like some of the code in the
> 
> 	} else if (folio_test_anon(folio)) {
> 
> could apply.  pte_uffd_wp perhaps?
In the "else if (pte_unused).." and  "else if (folio_test_anon(folio)"
path, it uses set_pte_at or PAGE_SIZE for mmu_notifier_invalidate_range().
So I suppose hugetlb will not hit these two branches.

> 
> Since you are cleaning up, I think the following make apply ...
> 
>>> +static bool try_to_unmap_one_hugetlb(struct folio *folio,
>>> +		struct vm_area_struct *vma, struct mmu_notifier_range range,
>>> +		struct page_vma_mapped_walk pvmw, unsigned long address,
>>> +		enum ttu_flags flags)
>>> +{
>>> +	struct mm_struct *mm = vma->vm_mm;
>>> +	pte_t pteval;
>>> +	bool ret = true, anon = folio_test_anon(folio);
>>> +
>>> +	/*
>>> +	 * The try_to_unmap() is only passed a hugetlb page
>>> +	 * in the case where the hugetlb page is poisoned.
>>> +	 */
>>> +	VM_BUG_ON_FOLIO(!folio_test_hwpoison(folio), folio);
>>> +	/*
>>> +	 * huge_pmd_unshare may unmap an entire PMD page.
>>> +	 * There is no way of knowing exactly which PMDs may
>>> +	 * be cached for this mm, so we must flush them all.
>>> +	 * start/end were already adjusted above to cover this
>>> +	 * range.
>>> +	 */
>>> +	flush_cache_range(vma, range.start, range.end);
>>> +
>>> +	/*
>>> +	 * To call huge_pmd_unshare, i_mmap_rwsem must be
>>> +	 * held in write mode.  Caller needs to explicitly
>>> +	 * do this outside rmap routines.
>>> +	 *
>>> +	 * We also must hold hugetlb vma_lock in write mode.
>>> +	 * Lock order dictates acquiring vma_lock BEFORE
>>> +	 * i_mmap_rwsem.  We can only try lock here and fail
>>> +	 * if unsuccessful.
>>> +	 */
>>> +	if (!anon) {
>>> +		VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
>>> +		if (!hugetlb_vma_trylock_write(vma)) {
>>> +			ret = false;
>>> +			goto out;
>>> +		}
>>> +		if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
>>> +			hugetlb_vma_unlock_write(vma);
>>> +			flush_tlb_range(vma,
>>> +					range.start, range.end);
>>> +			mmu_notifier_invalidate_range(mm,
>>> +					range.start, range.end);
>>> +			/*
>>> +			 * The ref count of the PMD page was
>>> +			 * dropped which is part of the way map
>>> +			 * counting is done for shared PMDs.
>>> +			 * Return 'true' here.  When there is
>>> +			 * no other sharing, huge_pmd_unshare
>>> +			 * returns false and we will unmap the
>>> +			 * actual page and drop map count
>>> +			 * to zero.
>>> +			 */
>>> +			goto out;
>>> +		}
>>> +		hugetlb_vma_unlock_write(vma);
>>> +	}
>>> +	pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
>>> +
>>> +	/*
>>> +	 * Now the pte is cleared. If this pte was uffd-wp armed,
>>> +	 * we may want to replace a none pte with a marker pte if
>>> +	 * it's file-backed, so we don't lose the tracking info.
>>> +	 */
>>> +	pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);
>>> +
>>> +	/* Set the dirty flag on the folio now the pte is gone. */
>>> +	if (pte_dirty(pteval))
>>> +		folio_mark_dirty(folio);
>>> +
>>> +	/* Update high watermark before we lower rss */
>>> +	update_hiwater_rss(mm);
>>> +
>>> +	if (folio_test_hwpoison(folio) && !(flags & TTU_IGNORE_HWPOISON)) {
> 
> I'm guessing this can just be,
> 	if (!(flags & TTU_IGNORE_HWPOISON))
> 
> as we only get here in the poison case as indicated by,
> 	VM_BUG_ON_FOLIO(!folio_test_hwpoison(folio), folio); 
> above.
You are correct. Will remove the folio_test_hwpoison().

> 
>>> +		pteval = swp_entry_to_pte(make_hwpoison_entry(&folio->page));
>>> +		set_huge_pte_at(mm, address, pvmw.pte, pteval);
>>> +	}
>>> +
>>> +	/*** try_to_unmap_one() called dec_mm_counter for
>>> +	 * (folio_test_hwpoison(folio) && !(flags & TTU_IGNORE_HWPOISON)) not
>>> +	 * true case, looks incorrect. Change it to hugetlb_count_sub() here.
>>> +	 */
>>> +	hugetlb_count_sub(folio_nr_pages(folio), mm);
>>> +
>>> +	/*
>>> +	 * No need to call mmu_notifier_invalidate_range() it has be
>>> +	 * done above for all cases requiring it to happen under page
>>> +	 * table lock before mmu_notifier_invalidate_range_end()
>>> +	 *
>>> +	 * See Documentation/mm/mmu_notifier.rst
>>> +	 */
>>> +	page_remove_rmap(&folio->page, vma, folio_test_hugetlb(folio));
> 
> Always compound/hugetlb, so could be:
> 
> 	page_remove_rmap(&folio->page, vma, true);
This is done in patch 3 of this series.

> 
>>> +	if (vma->vm_flags & VM_LOCKED)
>>> +		mlock_drain_local();
> 
> Since hugetlb pages are always memory resident, I do not think the above
> is necessary.
OK. Will remove this piece of code.

Thanks a lot for the comments.

Regards
Yin, Fengwei

> 
>>> +	folio_put(folio);
>>> +
>>> +out:
>>> +	return ret;
>>> +}
>
HORIGUCHI NAOYA(堀口 直也) Feb. 24, 2023, 2:51 a.m. UTC | #4
On Thu, Feb 23, 2023 at 05:28:10PM +0000, Matthew Wilcox wrote:
> On Thu, Feb 23, 2023 at 04:31:56PM +0800, Yin Fengwei wrote:
> > It's to prepare the batched rmap update for large folio.
> > No need to looped handle hugetlb. Just handle hugetlb and
> > bail out early.
> > 
> > Almost no functional change. Just one change to mm counter
> > update.
> 
> This looks like a very worthwhile cleanup in its own right.  Adding
> various hugetlb & memory poisoning experts for commentary on the mm
> counter change (search for three asterisks)
> 
> > Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
> > ---
> >  mm/rmap.c | 205 +++++++++++++++++++++++++++++++++---------------------
> >  1 file changed, 126 insertions(+), 79 deletions(-)
> > 
> > diff --git a/mm/rmap.c b/mm/rmap.c
> > index 15ae24585fc4..e7aa63b800f7 100644
> > --- a/mm/rmap.c
> > +++ b/mm/rmap.c
> > @@ -1443,6 +1443,108 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma,
> >  	munlock_vma_folio(folio, vma, compound);
> >  }
> >  
> > +static bool try_to_unmap_one_hugetlb(struct folio *folio,
> > +		struct vm_area_struct *vma, struct mmu_notifier_range range,
> > +		struct page_vma_mapped_walk pvmw, unsigned long address,
> > +		enum ttu_flags flags)
> > +{
> > +	struct mm_struct *mm = vma->vm_mm;
> > +	pte_t pteval;
> > +	bool ret = true, anon = folio_test_anon(folio);
> > +
> > +	/*
> > +	 * The try_to_unmap() is only passed a hugetlb page
> > +	 * in the case where the hugetlb page is poisoned.
> > +	 */
> > +	VM_BUG_ON_FOLIO(!folio_test_hwpoison(folio), folio);
> > +	/*
> > +	 * huge_pmd_unshare may unmap an entire PMD page.
> > +	 * There is no way of knowing exactly which PMDs may
> > +	 * be cached for this mm, so we must flush them all.
> > +	 * start/end were already adjusted above to cover this
> > +	 * range.
> > +	 */
> > +	flush_cache_range(vma, range.start, range.end);
> > +
> > +	/*
> > +	 * To call huge_pmd_unshare, i_mmap_rwsem must be
> > +	 * held in write mode.  Caller needs to explicitly
> > +	 * do this outside rmap routines.
> > +	 *
> > +	 * We also must hold hugetlb vma_lock in write mode.
> > +	 * Lock order dictates acquiring vma_lock BEFORE
> > +	 * i_mmap_rwsem.  We can only try lock here and fail
> > +	 * if unsuccessful.
> > +	 */
> > +	if (!anon) {
> > +		VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
> > +		if (!hugetlb_vma_trylock_write(vma)) {
> > +			ret = false;
> > +			goto out;
> > +		}
> > +		if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
> > +			hugetlb_vma_unlock_write(vma);
> > +			flush_tlb_range(vma,
> > +					range.start, range.end);
> > +			mmu_notifier_invalidate_range(mm,
> > +					range.start, range.end);
> > +			/*
> > +			 * The ref count of the PMD page was
> > +			 * dropped which is part of the way map
> > +			 * counting is done for shared PMDs.
> > +			 * Return 'true' here.  When there is
> > +			 * no other sharing, huge_pmd_unshare
> > +			 * returns false and we will unmap the
> > +			 * actual page and drop map count
> > +			 * to zero.
> > +			 */
> > +			goto out;
> > +		}
> > +		hugetlb_vma_unlock_write(vma);
> > +	}
> > +	pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
> > +
> > +	/*
> > +	 * Now the pte is cleared. If this pte was uffd-wp armed,
> > +	 * we may want to replace a none pte with a marker pte if
> > +	 * it's file-backed, so we don't lose the tracking info.
> > +	 */
> > +	pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);
> > +
> > +	/* Set the dirty flag on the folio now the pte is gone. */
> > +	if (pte_dirty(pteval))
> > +		folio_mark_dirty(folio);
> > +
> > +	/* Update high watermark before we lower rss */
> > +	update_hiwater_rss(mm);
> > +
> > +	if (folio_test_hwpoison(folio) && !(flags & TTU_IGNORE_HWPOISON)) {
> > +		pteval = swp_entry_to_pte(make_hwpoison_entry(&folio->page));
> > +		set_huge_pte_at(mm, address, pvmw.pte, pteval);
> > +	}
> > +
> > +	/*** try_to_unmap_one() called dec_mm_counter for
> > +	 * (folio_test_hwpoison(folio) && !(flags & TTU_IGNORE_HWPOISON)) not
> > +	 * true case, looks incorrect. Change it to hugetlb_count_sub() here.
> > +	 */
> > +	hugetlb_count_sub(folio_nr_pages(folio), mm);

I have no objection to this change (moving hugetlb_count_sub() outside the
if), but I have a question related to this.

Generally TTU_IGNORE_HWPOISON is used to control the pte-conversion based
on page dirtiness.  But actually what it depends on is whether data loss
happens when the page is forcibly dropped.  And for hugetlb pages, that's
true regardless of the PageDirty flag on it.
So I think we can assume that try_to_unmap_one_hugetlb() is called with
TTU_IGNORE_HWPOISON clear.  So maybe we don't need the if-check?
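
If so, the pte conversion could be done unconditionally, something like
this untested sketch:

	pteval = swp_entry_to_pte(make_hwpoison_entry(&folio->page));
	set_huge_pte_at(mm, address, pvmw.pte, pteval);
	hugetlb_count_sub(folio_nr_pages(folio), mm);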

Otherwise, the cleanup looks good to me.

Thanks,
Naoya Horiguchi

> > +
> > +	/*
> > +	 * No need to call mmu_notifier_invalidate_range() it has be
> > +	 * done above for all cases requiring it to happen under page
> > +	 * table lock before mmu_notifier_invalidate_range_end()
> > +	 *
> > +	 * See Documentation/mm/mmu_notifier.rst
> > +	 */
> > +	page_remove_rmap(&folio->page, vma, folio_test_hugetlb(folio));
> > +	if (vma->vm_flags & VM_LOCKED)
> > +		mlock_drain_local();
> > +	folio_put(folio);
> > +
> > +out:
> > +	return ret;
> > +}
> > +
> >  /*
> >   * @arg: enum ttu_flags will be passed to this argument
> >   */
> > @@ -1506,86 +1608,37 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> >  			break;
> >  		}
> >  
> > +		address = pvmw.address;
> > +		if (folio_test_hugetlb(folio)) {
> > +			ret = try_to_unmap_one_hugetlb(folio, vma, range,
> > +							pvmw, address, flags);
> > +
> > +			/* no need to loop for hugetlb */
> > +			page_vma_mapped_walk_done(&pvmw);
> > +			break;
> > +		}
> > +
> >  		subpage = folio_page(folio,
> >  					pte_pfn(*pvmw.pte) - folio_pfn(folio));
> > -		address = pvmw.address;
> >  		anon_exclusive = folio_test_anon(folio) &&
> >  				 PageAnonExclusive(subpage);
> >  
> > -		if (folio_test_hugetlb(folio)) {
> > -			bool anon = folio_test_anon(folio);
> > -
> > +		flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
> > +		/* Nuke the page table entry. */
> > +		if (should_defer_flush(mm, flags)) {
> >  			/*
> > -			 * The try_to_unmap() is only passed a hugetlb page
> > -			 * in the case where the hugetlb page is poisoned.
> > +			 * We clear the PTE but do not flush so potentially
> > +			 * a remote CPU could still be writing to the folio.
> > +			 * If the entry was previously clean then the
> > +			 * architecture must guarantee that a clear->dirty
> > +			 * transition on a cached TLB entry is written through
> > +			 * and traps if the PTE is unmapped.
> >  			 */
> > -			VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage);
> > -			/*
> > -			 * huge_pmd_unshare may unmap an entire PMD page.
> > -			 * There is no way of knowing exactly which PMDs may
> > -			 * be cached for this mm, so we must flush them all.
> > -			 * start/end were already adjusted above to cover this
> > -			 * range.
> > -			 */
> > -			flush_cache_range(vma, range.start, range.end);
> > +			pteval = ptep_get_and_clear(mm, address, pvmw.pte);
> >  
> > -			/*
> > -			 * To call huge_pmd_unshare, i_mmap_rwsem must be
> > -			 * held in write mode.  Caller needs to explicitly
> > -			 * do this outside rmap routines.
> > -			 *
> > -			 * We also must hold hugetlb vma_lock in write mode.
> > -			 * Lock order dictates acquiring vma_lock BEFORE
> > -			 * i_mmap_rwsem.  We can only try lock here and fail
> > -			 * if unsuccessful.
> > -			 */
> > -			if (!anon) {
> > -				VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
> > -				if (!hugetlb_vma_trylock_write(vma)) {
> > -					page_vma_mapped_walk_done(&pvmw);
> > -					ret = false;
> > -					break;
> > -				}
> > -				if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
> > -					hugetlb_vma_unlock_write(vma);
> > -					flush_tlb_range(vma,
> > -						range.start, range.end);
> > -					mmu_notifier_invalidate_range(mm,
> > -						range.start, range.end);
> > -					/*
> > -					 * The ref count of the PMD page was
> > -					 * dropped which is part of the way map
> > -					 * counting is done for shared PMDs.
> > -					 * Return 'true' here.  When there is
> > -					 * no other sharing, huge_pmd_unshare
> > -					 * returns false and we will unmap the
> > -					 * actual page and drop map count
> > -					 * to zero.
> > -					 */
> > -					page_vma_mapped_walk_done(&pvmw);
> > -					break;
> > -				}
> > -				hugetlb_vma_unlock_write(vma);
> > -			}
> > -			pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
> > +			set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
> >  		} else {
> > -			flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
> > -			/* Nuke the page table entry. */
> > -			if (should_defer_flush(mm, flags)) {
> > -				/*
> > -				 * We clear the PTE but do not flush so potentially
> > -				 * a remote CPU could still be writing to the folio.
> > -				 * If the entry was previously clean then the
> > -				 * architecture must guarantee that a clear->dirty
> > -				 * transition on a cached TLB entry is written through
> > -				 * and traps if the PTE is unmapped.
> > -				 */
> > -				pteval = ptep_get_and_clear(mm, address, pvmw.pte);
> > -
> > -				set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
> > -			} else {
> > -				pteval = ptep_clear_flush(vma, address, pvmw.pte);
> > -			}
> > +			pteval = ptep_clear_flush(vma, address, pvmw.pte);
> >  		}
> >  
> >  		/*
> > @@ -1604,14 +1657,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> >  
> >  		if (PageHWPoison(subpage) && !(flags & TTU_IGNORE_HWPOISON)) {
> >  			pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
> > -			if (folio_test_hugetlb(folio)) {
> > -				hugetlb_count_sub(folio_nr_pages(folio), mm);
> > -				set_huge_pte_at(mm, address, pvmw.pte, pteval);
> > -			} else {
> > -				dec_mm_counter(mm, mm_counter(&folio->page));
> > -				set_pte_at(mm, address, pvmw.pte, pteval);
> > -			}
> > -
> > +			dec_mm_counter(mm, mm_counter(&folio->page));
> > +			set_pte_at(mm, address, pvmw.pte, pteval);
> >  		} else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
> >  			/*
> >  			 * The guest indicated that the page content is of no
> > -- 
> > 2.30.2
> >
Yin, Fengwei Feb. 24, 2023, 4:41 a.m. UTC | #5
On 2/24/2023 10:51 AM, HORIGUCHI NAOYA(堀口 直也) wrote:
> On Thu, Feb 23, 2023 at 05:28:10PM +0000, Matthew Wilcox wrote:
>> On Thu, Feb 23, 2023 at 04:31:56PM +0800, Yin Fengwei wrote:
>>> It's to prepare the batched rmap update for large folio.
>>> No need to looped handle hugetlb. Just handle hugetlb and
>>> bail out early.
>>>
>>> Almost no functional change. Just one change to mm counter
>>> update.
>>
>> This looks like a very worthwhile cleanup in its own right.  Adding
>> various hugetlb & memory poisoning experts for commentary on the mm
>> counter change (search for three asterisks)
>>
>>> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
>>> ---
>>>  mm/rmap.c | 205 +++++++++++++++++++++++++++++++++---------------------
>>>  1 file changed, 126 insertions(+), 79 deletions(-)
>>>
>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>> index 15ae24585fc4..e7aa63b800f7 100644
>>> --- a/mm/rmap.c
>>> +++ b/mm/rmap.c
>>> @@ -1443,6 +1443,108 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma,
>>>  	munlock_vma_folio(folio, vma, compound);
>>>  }
>>>  
>>> +static bool try_to_unmap_one_hugetlb(struct folio *folio,
>>> +		struct vm_area_struct *vma, struct mmu_notifier_range range,
>>> +		struct page_vma_mapped_walk pvmw, unsigned long address,
>>> +		enum ttu_flags flags)
>>> +{
>>> +	struct mm_struct *mm = vma->vm_mm;
>>> +	pte_t pteval;
>>> +	bool ret = true, anon = folio_test_anon(folio);
>>> +
>>> +	/*
>>> +	 * The try_to_unmap() is only passed a hugetlb page
>>> +	 * in the case where the hugetlb page is poisoned.
>>> +	 */
>>> +	VM_BUG_ON_FOLIO(!folio_test_hwpoison(folio), folio);
>>> +	/*
>>> +	 * huge_pmd_unshare may unmap an entire PMD page.
>>> +	 * There is no way of knowing exactly which PMDs may
>>> +	 * be cached for this mm, so we must flush them all.
>>> +	 * start/end were already adjusted above to cover this
>>> +	 * range.
>>> +	 */
>>> +	flush_cache_range(vma, range.start, range.end);
>>> +
>>> +	/*
>>> +	 * To call huge_pmd_unshare, i_mmap_rwsem must be
>>> +	 * held in write mode.  Caller needs to explicitly
>>> +	 * do this outside rmap routines.
>>> +	 *
>>> +	 * We also must hold hugetlb vma_lock in write mode.
>>> +	 * Lock order dictates acquiring vma_lock BEFORE
>>> +	 * i_mmap_rwsem.  We can only try lock here and fail
>>> +	 * if unsuccessful.
>>> +	 */
>>> +	if (!anon) {
>>> +		VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
>>> +		if (!hugetlb_vma_trylock_write(vma)) {
>>> +			ret = false;
>>> +			goto out;
>>> +		}
>>> +		if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
>>> +			hugetlb_vma_unlock_write(vma);
>>> +			flush_tlb_range(vma,
>>> +					range.start, range.end);
>>> +			mmu_notifier_invalidate_range(mm,
>>> +					range.start, range.end);
>>> +			/*
>>> +			 * The ref count of the PMD page was
>>> +			 * dropped which is part of the way map
>>> +			 * counting is done for shared PMDs.
>>> +			 * Return 'true' here.  When there is
>>> +			 * no other sharing, huge_pmd_unshare
>>> +			 * returns false and we will unmap the
>>> +			 * actual page and drop map count
>>> +			 * to zero.
>>> +			 */
>>> +			goto out;
>>> +		}
>>> +		hugetlb_vma_unlock_write(vma);
>>> +	}
>>> +	pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
>>> +
>>> +	/*
>>> +	 * Now the pte is cleared. If this pte was uffd-wp armed,
>>> +	 * we may want to replace a none pte with a marker pte if
>>> +	 * it's file-backed, so we don't lose the tracking info.
>>> +	 */
>>> +	pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);
>>> +
>>> +	/* Set the dirty flag on the folio now the pte is gone. */
>>> +	if (pte_dirty(pteval))
>>> +		folio_mark_dirty(folio);
>>> +
>>> +	/* Update high watermark before we lower rss */
>>> +	update_hiwater_rss(mm);
>>> +
>>> +	if (folio_test_hwpoison(folio) && !(flags & TTU_IGNORE_HWPOISON)) {
>>> +		pteval = swp_entry_to_pte(make_hwpoison_entry(&folio->page));
>>> +		set_huge_pte_at(mm, address, pvmw.pte, pteval);
>>> +	}
>>> +
>>> +	/*** try_to_unmap_one() called dec_mm_counter for
>>> +	 * (folio_test_hwpoison(folio) && !(flags & TTU_IGNORE_HWPOISON)) not
>>> +	 * true case, looks incorrect. Change it to hugetlb_count_sub() here.
>>> +	 */
>>> +	hugetlb_count_sub(folio_nr_pages(folio), mm);
> 
> I have no objection to this change (moving hugetlb_count_sub() outside the
> if), but I have a question related to this.
> 
> Generally TTU_IGNORE_HWPOISON is used to control the pte-conversion based
> on page dirtiness.  But actually what it depends on is whether data lost
> happens when the page is forcibly dropped.  And for hugetlb pages, that's
> true regardless of PageDirty flag on it.
> So I think we can assume that try_to_unmap_one_hugetlb() is called with
> TTU_IGNORE_HWPOISON clear.  So maybe we don't need the if-check?
Thanks a lot for the detailed explanation. I will remove the if check if
there is no objection from other reviewers.


Regards
Yin, Fengwei

> 
> Otherwise, the cleanup looks good to me.
> 
> Thanks,
> Naoya Horiguchi
> 
>>> +
>>> +	/*
>>> +	 * No need to call mmu_notifier_invalidate_range() it has be
>>> +	 * done above for all cases requiring it to happen under page
>>> +	 * table lock before mmu_notifier_invalidate_range_end()
>>> +	 *
>>> +	 * See Documentation/mm/mmu_notifier.rst
>>> +	 */
>>> +	page_remove_rmap(&folio->page, vma, folio_test_hugetlb(folio));
>>> +	if (vma->vm_flags & VM_LOCKED)
>>> +		mlock_drain_local();
>>> +	folio_put(folio);
>>> +
>>> +out:
>>> +	return ret;
>>> +}
>>> +
>>>  /*
>>>   * @arg: enum ttu_flags will be passed to this argument
>>>   */
>>> @@ -1506,86 +1608,37 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
>>>  			break;
>>>  		}
>>>  
>>> +		address = pvmw.address;
>>> +		if (folio_test_hugetlb(folio)) {
>>> +			ret = try_to_unmap_one_hugetlb(folio, vma, range,
>>> +							pvmw, address, flags);
>>> +
>>> +			/* no need to loop for hugetlb */
>>> +			page_vma_mapped_walk_done(&pvmw);
>>> +			break;
>>> +		}
>>> +
>>>  		subpage = folio_page(folio,
>>>  					pte_pfn(*pvmw.pte) - folio_pfn(folio));
>>> -		address = pvmw.address;
>>>  		anon_exclusive = folio_test_anon(folio) &&
>>>  				 PageAnonExclusive(subpage);
>>>  
>>> -		if (folio_test_hugetlb(folio)) {
>>> -			bool anon = folio_test_anon(folio);
>>> -
>>> +		flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
>>> +		/* Nuke the page table entry. */
>>> +		if (should_defer_flush(mm, flags)) {
>>>  			/*
>>> -			 * The try_to_unmap() is only passed a hugetlb page
>>> -			 * in the case where the hugetlb page is poisoned.
>>> +			 * We clear the PTE but do not flush so potentially
>>> +			 * a remote CPU could still be writing to the folio.
>>> +			 * If the entry was previously clean then the
>>> +			 * architecture must guarantee that a clear->dirty
>>> +			 * transition on a cached TLB entry is written through
>>> +			 * and traps if the PTE is unmapped.
>>>  			 */
>>> -			VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage);
>>> -			/*
>>> -			 * huge_pmd_unshare may unmap an entire PMD page.
>>> -			 * There is no way of knowing exactly which PMDs may
>>> -			 * be cached for this mm, so we must flush them all.
>>> -			 * start/end were already adjusted above to cover this
>>> -			 * range.
>>> -			 */
>>> -			flush_cache_range(vma, range.start, range.end);
>>> +			pteval = ptep_get_and_clear(mm, address, pvmw.pte);
>>>  
>>> -			/*
>>> -			 * To call huge_pmd_unshare, i_mmap_rwsem must be
>>> -			 * held in write mode.  Caller needs to explicitly
>>> -			 * do this outside rmap routines.
>>> -			 *
>>> -			 * We also must hold hugetlb vma_lock in write mode.
>>> -			 * Lock order dictates acquiring vma_lock BEFORE
>>> -			 * i_mmap_rwsem.  We can only try lock here and fail
>>> -			 * if unsuccessful.
>>> -			 */
>>> -			if (!anon) {
>>> -				VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
>>> -				if (!hugetlb_vma_trylock_write(vma)) {
>>> -					page_vma_mapped_walk_done(&pvmw);
>>> -					ret = false;
>>> -					break;
>>> -				}
>>> -				if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
>>> -					hugetlb_vma_unlock_write(vma);
>>> -					flush_tlb_range(vma,
>>> -						range.start, range.end);
>>> -					mmu_notifier_invalidate_range(mm,
>>> -						range.start, range.end);
>>> -					/*
>>> -					 * The ref count of the PMD page was
>>> -					 * dropped which is part of the way map
>>> -					 * counting is done for shared PMDs.
>>> -					 * Return 'true' here.  When there is
>>> -					 * no other sharing, huge_pmd_unshare
>>> -					 * returns false and we will unmap the
>>> -					 * actual page and drop map count
>>> -					 * to zero.
>>> -					 */
>>> -					page_vma_mapped_walk_done(&pvmw);
>>> -					break;
>>> -				}
>>> -				hugetlb_vma_unlock_write(vma);
>>> -			}
>>> -			pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
>>> +			set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
>>>  		} else {
>>> -			flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
>>> -			/* Nuke the page table entry. */
>>> -			if (should_defer_flush(mm, flags)) {
>>> -				/*
>>> -				 * We clear the PTE but do not flush so potentially
>>> -				 * a remote CPU could still be writing to the folio.
>>> -				 * If the entry was previously clean then the
>>> -				 * architecture must guarantee that a clear->dirty
>>> -				 * transition on a cached TLB entry is written through
>>> -				 * and traps if the PTE is unmapped.
>>> -				 */
>>> -				pteval = ptep_get_and_clear(mm, address, pvmw.pte);
>>> -
>>> -				set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
>>> -			} else {
>>> -				pteval = ptep_clear_flush(vma, address, pvmw.pte);
>>> -			}
>>> +			pteval = ptep_clear_flush(vma, address, pvmw.pte);
>>>  		}
>>>  
>>>  		/*
>>> @@ -1604,14 +1657,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
>>>  
>>>  		if (PageHWPoison(subpage) && !(flags & TTU_IGNORE_HWPOISON)) {
>>>  			pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
>>> -			if (folio_test_hugetlb(folio)) {
>>> -				hugetlb_count_sub(folio_nr_pages(folio), mm);
>>> -				set_huge_pte_at(mm, address, pvmw.pte, pteval);
>>> -			} else {
>>> -				dec_mm_counter(mm, mm_counter(&folio->page));
>>> -				set_pte_at(mm, address, pvmw.pte, pteval);
>>> -			}
>>> -
>>> +			dec_mm_counter(mm, mm_counter(&folio->page));
>>> +			set_pte_at(mm, address, pvmw.pte, pteval);
>>>  		} else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
>>>  			/*
>>>  			 * The guest indicated that the page content is of no
>>> -- 
>>> 2.30.2
Mike Kravetz Feb. 24, 2023, 7:21 p.m. UTC | #6
On 02/24/23 02:51, HORIGUCHI NAOYA(堀口 直也) wrote:
> On Thu, Feb 23, 2023 at 05:28:10PM +0000, Matthew Wilcox wrote:
> > On Thu, Feb 23, 2023 at 04:31:56PM +0800, Yin Fengwei wrote:
> > > +
> > > +	if (folio_test_hwpoison(folio) && !(flags & TTU_IGNORE_HWPOISON)) {
> > > +		pteval = swp_entry_to_pte(make_hwpoison_entry(&folio->page));
> > > +		set_huge_pte_at(mm, address, pvmw.pte, pteval);
> > > +	}
> > > +
> > > +	/*** try_to_unmap_one() called dec_mm_counter for
> > > +	 * (folio_test_hwpoison(folio) && !(flags & TTU_IGNORE_HWPOISON)) not
> > > +	 * true case, looks incorrect. Change it to hugetlb_count_sub() here.
> > > +	 */
> > > +	hugetlb_count_sub(folio_nr_pages(folio), mm);
> 
> I have no objection to this change (moving hugetlb_count_sub() outside the
> if), but I have a question related to this.
> 
> Generally TTU_IGNORE_HWPOISON is used to control the pte-conversion based
> on page dirtiness.  But actually what it depends on is whether data lost
> happens when the page is forcibly dropped.  And for hugetlb pages, that's
> true regardless of PageDirty flag on it.
> So I think we can assume that try_to_unmap_one_hugetlb() is called with
> TTU_IGNORE_HWPOISON clear.  So maybe we don't need the if-check?

Thanks for looking at this Naoya!

In another reply I asked about the possibility of that if statement ever
being false for hugetlb pages.  Looks like that can never happen.  I went
back and looked at the memory failure/poison code just to be sure.

Yin,
Since we NEVER went down the (folio_test_hwpoison(folio) &&
!(flags & TTU_IGNORE_HWPOISON)) not-true path in the current code,
perhaps we do not need the comment about possibly calling dec_mm_counter.
Yin, Fengwei Feb. 26, 2023, 11:44 a.m. UTC | #7
On 2/25/2023 3:21 AM, Mike Kravetz wrote:
> On 02/24/23 02:51, HORIGUCHI NAOYA(堀口 直也) wrote:
>> On Thu, Feb 23, 2023 at 05:28:10PM +0000, Matthew Wilcox wrote:
>>> On Thu, Feb 23, 2023 at 04:31:56PM +0800, Yin Fengwei wrote:
>>>> +
>>>> +	if (folio_test_hwpoison(folio) && !(flags & TTU_IGNORE_HWPOISON)) {
>>>> +		pteval = swp_entry_to_pte(make_hwpoison_entry(&folio->page));
>>>> +		set_huge_pte_at(mm, address, pvmw.pte, pteval);
>>>> +	}
>>>> +
>>>> +	/*** try_to_unmap_one() called dec_mm_counter for
>>>> +	 * (folio_test_hwpoison(folio) && !(flags & TTU_IGNORE_HWPOISON)) not
>>>> +	 * true case, looks incorrect. Change it to hugetlb_count_sub() here.
>>>> +	 */
>>>> +	hugetlb_count_sub(folio_nr_pages(folio), mm);
>>
>> I have no objection to this change (moving hugetlb_count_sub() outside the
>> if), but I have a question related to this.
>>
>> Generally TTU_IGNORE_HWPOISON is used to control the pte-conversion based
>> on page dirtiness.  But actually what it depends on is whether data lost
>> happens when the page is forcibly dropped.  And for hugetlb pages, that's
>> true regardless of PageDirty flag on it.
>> So I think we can assume that try_to_unmap_one_hugetlb() is called with
>> TTU_IGNORE_HWPOISON clear.  So maybe we don't need the if-check?
> 
> Thanks for looking at this Naoya!
> 
> In another reply I asked about the possibility of that if statement ever
> being false for hugetlb pages.  Looks like that can never happen.  I went
> back and looked at the memory failure/poison code just to be sure.
> 
> Yin,
> Since we NEVER took went down the (folio_test_hwpoison(folio) &&
> !(flags & TTU_IGNORE_HWPOISON)) not true path in the current code,
> perhaps we not need the comment possibly calling dec_mm_counter. 
Sure. I am going to remove this line and the comment about the mm counter
as well:
if (folio_test_hwpoison(folio) && !(flags & TTU_IGNORE_HWPOISON)) {

Will send out a new version after a few more days to see whether there
are comments on the other patches in this series.

Thanks all for sharing the comments.


Regards
Yin, Fengwei

Patch

diff --git a/mm/rmap.c b/mm/rmap.c
index 15ae24585fc4..e7aa63b800f7 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1443,6 +1443,108 @@  void page_remove_rmap(struct page *page, struct vm_area_struct *vma,
 	munlock_vma_folio(folio, vma, compound);
 }
 
+static bool try_to_unmap_one_hugetlb(struct folio *folio,
+		struct vm_area_struct *vma, struct mmu_notifier_range range,
+		struct page_vma_mapped_walk pvmw, unsigned long address,
+		enum ttu_flags flags)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pte_t pteval;
+	bool ret = true, anon = folio_test_anon(folio);
+
+	/*
+	 * The try_to_unmap() is only passed a hugetlb page
+	 * in the case where the hugetlb page is poisoned.
+	 */
+	VM_BUG_ON_FOLIO(!folio_test_hwpoison(folio), folio);
+	/*
+	 * huge_pmd_unshare may unmap an entire PMD page.
+	 * There is no way of knowing exactly which PMDs may
+	 * be cached for this mm, so we must flush them all.
+	 * start/end were already adjusted above to cover this
+	 * range.
+	 */
+	flush_cache_range(vma, range.start, range.end);
+
+	/*
+	 * To call huge_pmd_unshare, i_mmap_rwsem must be
+	 * held in write mode.  Caller needs to explicitly
+	 * do this outside rmap routines.
+	 *
+	 * We also must hold hugetlb vma_lock in write mode.
+	 * Lock order dictates acquiring vma_lock BEFORE
+	 * i_mmap_rwsem.  We can only try lock here and fail
+	 * if unsuccessful.
+	 */
+	if (!anon) {
+		VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+		if (!hugetlb_vma_trylock_write(vma)) {
+			ret = false;
+			goto out;
+		}
+		if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+			hugetlb_vma_unlock_write(vma);
+			flush_tlb_range(vma,
+					range.start, range.end);
+			mmu_notifier_invalidate_range(mm,
+					range.start, range.end);
+			/*
+			 * The ref count of the PMD page was
+			 * dropped which is part of the way map
+			 * counting is done for shared PMDs.
+			 * Return 'true' here.  When there is
+			 * no other sharing, huge_pmd_unshare
+			 * returns false and we will unmap the
+			 * actual page and drop map count
+			 * to zero.
+			 */
+			goto out;
+		}
+		hugetlb_vma_unlock_write(vma);
+	}
+	pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
+
+	/*
+	 * Now the pte is cleared. If this pte was uffd-wp armed,
+	 * we may want to replace a none pte with a marker pte if
+	 * it's file-backed, so we don't lose the tracking info.
+	 */
+	pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);
+
+	/* Set the dirty flag on the folio now the pte is gone. */
+	if (pte_dirty(pteval))
+		folio_mark_dirty(folio);
+
+	/* Update high watermark before we lower rss */
+	update_hiwater_rss(mm);
+
+	if (folio_test_hwpoison(folio) && !(flags & TTU_IGNORE_HWPOISON)) {
+		pteval = swp_entry_to_pte(make_hwpoison_entry(&folio->page));
+		set_huge_pte_at(mm, address, pvmw.pte, pteval);
+	}
+
+	/*** try_to_unmap_one() called dec_mm_counter for
+	 * (folio_test_hwpoison(folio) && !(flags & TTU_IGNORE_HWPOISON)) not
+	 * true case, looks incorrect. Change it to hugetlb_count_sub() here.
+	 */
+	hugetlb_count_sub(folio_nr_pages(folio), mm);
+
+	/*
+	 * No need to call mmu_notifier_invalidate_range() it has be
+	 * done above for all cases requiring it to happen under page
+	 * table lock before mmu_notifier_invalidate_range_end()
+	 *
+	 * See Documentation/mm/mmu_notifier.rst
+	 */
+	page_remove_rmap(&folio->page, vma, folio_test_hugetlb(folio));
+	if (vma->vm_flags & VM_LOCKED)
+		mlock_drain_local();
+	folio_put(folio);
+
+out:
+	return ret;
+}
+
 /*
  * @arg: enum ttu_flags will be passed to this argument
  */
@@ -1506,86 +1608,37 @@  static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			break;
 		}
 
+		address = pvmw.address;
+		if (folio_test_hugetlb(folio)) {
+			ret = try_to_unmap_one_hugetlb(folio, vma, range,
+							pvmw, address, flags);
+
+			/* no need to loop for hugetlb */
+			page_vma_mapped_walk_done(&pvmw);
+			break;
+		}
+
 		subpage = folio_page(folio,
 					pte_pfn(*pvmw.pte) - folio_pfn(folio));
-		address = pvmw.address;
 		anon_exclusive = folio_test_anon(folio) &&
 				 PageAnonExclusive(subpage);
 
-		if (folio_test_hugetlb(folio)) {
-			bool anon = folio_test_anon(folio);
-
+		flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
+		/* Nuke the page table entry. */
+		if (should_defer_flush(mm, flags)) {
 			/*
-			 * The try_to_unmap() is only passed a hugetlb page
-			 * in the case where the hugetlb page is poisoned.
+			 * We clear the PTE but do not flush so potentially
+			 * a remote CPU could still be writing to the folio.
+			 * If the entry was previously clean then the
+			 * architecture must guarantee that a clear->dirty
+			 * transition on a cached TLB entry is written through
+			 * and traps if the PTE is unmapped.
 			 */
-			VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage);
-			/*
-			 * huge_pmd_unshare may unmap an entire PMD page.
-			 * There is no way of knowing exactly which PMDs may
-			 * be cached for this mm, so we must flush them all.
-			 * start/end were already adjusted above to cover this
-			 * range.
-			 */
-			flush_cache_range(vma, range.start, range.end);
+			pteval = ptep_get_and_clear(mm, address, pvmw.pte);
 
-			/*
-			 * To call huge_pmd_unshare, i_mmap_rwsem must be
-			 * held in write mode.  Caller needs to explicitly
-			 * do this outside rmap routines.
-			 *
-			 * We also must hold hugetlb vma_lock in write mode.
-			 * Lock order dictates acquiring vma_lock BEFORE
-			 * i_mmap_rwsem.  We can only try lock here and fail
-			 * if unsuccessful.
-			 */
-			if (!anon) {
-				VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
-				if (!hugetlb_vma_trylock_write(vma)) {
-					page_vma_mapped_walk_done(&pvmw);
-					ret = false;
-					break;
-				}
-				if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
-					hugetlb_vma_unlock_write(vma);
-					flush_tlb_range(vma,
-						range.start, range.end);
-					mmu_notifier_invalidate_range(mm,
-						range.start, range.end);
-					/*
-					 * The ref count of the PMD page was
-					 * dropped which is part of the way map
-					 * counting is done for shared PMDs.
-					 * Return 'true' here.  When there is
-					 * no other sharing, huge_pmd_unshare
-					 * returns false and we will unmap the
-					 * actual page and drop map count
-					 * to zero.
-					 */
-					page_vma_mapped_walk_done(&pvmw);
-					break;
-				}
-				hugetlb_vma_unlock_write(vma);
-			}
-			pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
+			set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
 		} else {
-			flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
-			/* Nuke the page table entry. */
-			if (should_defer_flush(mm, flags)) {
-				/*
-				 * We clear the PTE but do not flush so potentially
-				 * a remote CPU could still be writing to the folio.
-				 * If the entry was previously clean then the
-				 * architecture must guarantee that a clear->dirty
-				 * transition on a cached TLB entry is written through
-				 * and traps if the PTE is unmapped.
-				 */
-				pteval = ptep_get_and_clear(mm, address, pvmw.pte);
-
-				set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
-			} else {
-				pteval = ptep_clear_flush(vma, address, pvmw.pte);
-			}
+			pteval = ptep_clear_flush(vma, address, pvmw.pte);
 		}
 
 		/*
@@ -1604,14 +1657,8 @@  static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 
 		if (PageHWPoison(subpage) && !(flags & TTU_IGNORE_HWPOISON)) {
 			pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
-			if (folio_test_hugetlb(folio)) {
-				hugetlb_count_sub(folio_nr_pages(folio), mm);
-				set_huge_pte_at(mm, address, pvmw.pte, pteval);
-			} else {
-				dec_mm_counter(mm, mm_counter(&folio->page));
-				set_pte_at(mm, address, pvmw.pte, pteval);
-			}
-
+			dec_mm_counter(mm, mm_counter(&folio->page));
+			set_pte_at(mm, address, pvmw.pte, pteval);
 		} else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
 			/*
 			 * The guest indicated that the page content is of no