thp: Simplify splitting PMD mapping huge zero page
diff mbox series

Message ID 20200327170353.17734-1-kirill.shutemov@linux.intel.com
State New
Headers show
Series
  • thp: Simplify splitting PMD mapping huge zero page
Related show

Commit Message

Kirill A. Shutemov March 27, 2020, 5:03 p.m. UTC
Splitting PMD mapping huge zero page can be simplified a lot: we can
just unmap it and fall back to PTE handling.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 mm/huge_memory.c | 57 ++++--------------------------------------------
 1 file changed, 4 insertions(+), 53 deletions(-)

Comments

Zi Yan March 27, 2020, 5:23 p.m. UTC | #1
On 27 Mar 2020, at 13:03, Kirill A. Shutemov wrote:

> Splitting PMD mapping huge zero page can be simplified a lot: we can
> just unmap it and fallback to PTE handling.

So we will have an extra page fault for the first read to each subpage, but nothing changes if the first access to a subpage is a write, right? BTW, what is the motivation for this code simplification?

Thanks.

>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>  mm/huge_memory.c | 57 ++++--------------------------------------------
>  1 file changed, 4 insertions(+), 53 deletions(-)
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 42407e16bd80..ef6a6bcb291f 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2114,40 +2114,6 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
>  }
>  #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
>
> -static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
> -		unsigned long haddr, pmd_t *pmd)
> -{
> -	struct mm_struct *mm = vma->vm_mm;
> -	pgtable_t pgtable;
> -	pmd_t _pmd;
> -	int i;
> -
> -	/*
> -	 * Leave pmd empty until pte is filled note that it is fine to delay
> -	 * notification until mmu_notifier_invalidate_range_end() as we are
> -	 * replacing a zero pmd write protected page with a zero pte write
> -	 * protected page.
> -	 *
> -	 * See Documentation/vm/mmu_notifier.rst
> -	 */
> -	pmdp_huge_clear_flush(vma, haddr, pmd);
> -
> -	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
> -	pmd_populate(mm, &_pmd, pgtable);
> -
> -	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
> -		pte_t *pte, entry;
> -		entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
> -		entry = pte_mkspecial(entry);
> -		pte = pte_offset_map(&_pmd, haddr);
> -		VM_BUG_ON(!pte_none(*pte));
> -		set_pte_at(mm, haddr, pte, entry);
> -		pte_unmap(pte);
> -	}
> -	smp_wmb(); /* make pte visible before pmd */
> -	pmd_populate(mm, pmd, pgtable);
> -}
> -
>  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  		unsigned long haddr, bool freeze)
>  {
> @@ -2167,7 +2133,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>
>  	count_vm_event(THP_SPLIT_PMD);
>
> -	if (!vma_is_anonymous(vma)) {
> +	if (!vma_is_anonymous(vma) || is_huge_zero_pmd(*pmd)) {
>  		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
>  		/*
>  		 * We are going to unmap this huge page. So
> @@ -2175,7 +2141,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  		 */
>  		if (arch_needs_pgtable_deposit())
>  			zap_deposited_table(mm, pmd);
> -		if (vma_is_dax(vma))
> +		if (vma_is_dax(vma) || is_huge_zero_pmd(*pmd))
>  			return;
>  		page = pmd_page(_pmd);
>  		if (!PageDirty(page) && pmd_dirty(_pmd))
> @@ -2186,17 +2152,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  		put_page(page);
>  		add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
>  		return;
> -	} else if (is_huge_zero_pmd(*pmd)) {
> -		/*
> -		 * FIXME: Do we want to invalidate secondary mmu by calling
> -		 * mmu_notifier_invalidate_range() see comments below inside
> -		 * __split_huge_pmd() ?
> -		 *
> -		 * We are going from a zero huge page write protected to zero
> -		 * small page also write protected so it does not seems useful
> -		 * to invalidate secondary mmu at this time.
> -		 */
> -		return __split_huge_zero_page_pmd(vma, haddr, pmd);
>  	}
>
>  	/*
> @@ -2339,13 +2294,9 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>  	spin_unlock(ptl);
>  	/*
>  	 * No need to double call mmu_notifier->invalidate_range() callback.
> -	 * They are 3 cases to consider inside __split_huge_pmd_locked():
> +	 * They are 2 cases to consider inside __split_huge_pmd_locked():
>  	 *  1) pmdp_huge_clear_flush_notify() call invalidate_range() obvious
> -	 *  2) __split_huge_zero_page_pmd() read only zero page and any write
> -	 *    fault will trigger a flush_notify before pointing to a new page
> -	 *    (it is fine if the secondary mmu keeps pointing to the old zero
> -	 *    page in the meantime)
> -	 *  3) Split a huge pmd into pte pointing to the same page. No need
> +	 *  2) Split a huge pmd into pte pointing to the same page. No need
>  	 *     to invalidate secondary tlb entry they are all still valid.
>  	 *     any further changes to individual pte will notify. So no need
>  	 *     to call mmu_notifier->invalidate_range()
> -- 
> 2.26.0


—
Best Regards,
Yan Zi
Kirill A. Shutemov March 28, 2020, 12:19 a.m. UTC | #2
On Fri, Mar 27, 2020 at 01:23:07PM -0400, Zi Yan wrote:
> On 27 Mar 2020, at 13:03, Kirill A. Shutemov wrote:
> 
> > Splitting PMD mapping huge zero page can be simplified a lot: we can
> > just unmap it and fallback to PTE handling.
> 
> So we will have an extra page fault for the first read to each subpage,
> but nothing changes if the first access to a subpage is a write, right?
> BTW, what is the motivation for this code simplification?

Match what we do for file-THP.

I found a problem with the patch. Ignore it for now.

Patch
diff mbox series

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 42407e16bd80..ef6a6bcb291f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2114,40 +2114,6 @@  void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
 }
 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
-static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
-		unsigned long haddr, pmd_t *pmd)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	pgtable_t pgtable;
-	pmd_t _pmd;
-	int i;
-
-	/*
-	 * Leave pmd empty until pte is filled note that it is fine to delay
-	 * notification until mmu_notifier_invalidate_range_end() as we are
-	 * replacing a zero pmd write protected page with a zero pte write
-	 * protected page.
-	 *
-	 * See Documentation/vm/mmu_notifier.rst
-	 */
-	pmdp_huge_clear_flush(vma, haddr, pmd);
-
-	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
-	pmd_populate(mm, &_pmd, pgtable);
-
-	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
-		pte_t *pte, entry;
-		entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
-		entry = pte_mkspecial(entry);
-		pte = pte_offset_map(&_pmd, haddr);
-		VM_BUG_ON(!pte_none(*pte));
-		set_pte_at(mm, haddr, pte, entry);
-		pte_unmap(pte);
-	}
-	smp_wmb(); /* make pte visible before pmd */
-	pmd_populate(mm, pmd, pgtable);
-}
-
 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long haddr, bool freeze)
 {
@@ -2167,7 +2133,7 @@  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 
 	count_vm_event(THP_SPLIT_PMD);
 
-	if (!vma_is_anonymous(vma)) {
+	if (!vma_is_anonymous(vma) || is_huge_zero_pmd(*pmd)) {
 		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
 		/*
 		 * We are going to unmap this huge page. So
@@ -2175,7 +2141,7 @@  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		 */
 		if (arch_needs_pgtable_deposit())
 			zap_deposited_table(mm, pmd);
-		if (vma_is_dax(vma))
+		if (vma_is_dax(vma) || is_huge_zero_pmd(*pmd))
 			return;
 		page = pmd_page(_pmd);
 		if (!PageDirty(page) && pmd_dirty(_pmd))
@@ -2186,17 +2152,6 @@  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		put_page(page);
 		add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
 		return;
-	} else if (is_huge_zero_pmd(*pmd)) {
-		/*
-		 * FIXME: Do we want to invalidate secondary mmu by calling
-		 * mmu_notifier_invalidate_range() see comments below inside
-		 * __split_huge_pmd() ?
-		 *
-		 * We are going from a zero huge page write protected to zero
-		 * small page also write protected so it does not seems useful
-		 * to invalidate secondary mmu at this time.
-		 */
-		return __split_huge_zero_page_pmd(vma, haddr, pmd);
 	}
 
 	/*
@@ -2339,13 +2294,9 @@  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 	spin_unlock(ptl);
 	/*
 	 * No need to double call mmu_notifier->invalidate_range() callback.
-	 * They are 3 cases to consider inside __split_huge_pmd_locked():
+	 * They are 2 cases to consider inside __split_huge_pmd_locked():
 	 *  1) pmdp_huge_clear_flush_notify() call invalidate_range() obvious
-	 *  2) __split_huge_zero_page_pmd() read only zero page and any write
-	 *    fault will trigger a flush_notify before pointing to a new page
-	 *    (it is fine if the secondary mmu keeps pointing to the old zero
-	 *    page in the meantime)
-	 *  3) Split a huge pmd into pte pointing to the same page. No need
+	 *  2) Split a huge pmd into pte pointing to the same page. No need
 	 *     to invalidate secondary tlb entry they are all still valid.
 	 *     any further changes to individual pte will notify. So no need
 	 *     to call mmu_notifier->invalidate_range()