
[v3,3/3] mm: optimise pte dirty/accessed bit setting by demand based pte insertion

Message ID 20201220045535.848591-4-npiggin@gmail.com (mailing list archive)
State New, archived
Series mm: improve pte updates and dirty/accessed

Commit Message

Nicholas Piggin Dec. 20, 2020, 4:55 a.m. UTC
Similarly to the previous patch, this tries to optimise dirty/accessed
bits in ptes to avoid access costs of hardware setting them.

This tidies up a few last cases where dirty/accessed faults can be seen,
and subsumes the pte_sw_mkyoung helper -- it's not just architectures
with explicit software dirty/accessed bits that take expensive faults to
modify ptes.

The vast majority of the remaining dirty/accessed faults on kbuild
workloads after this patch are from NUMA migration, due to
remove_migration_pte inserting old/clean ptes.
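
The pattern applied throughout is to construct the pte with the accessed
bit -- and, when the mapping is immediately writable, the dirty bit --
already set before it is installed, rather than leaving them clear and
taking a micro-fault on first access. A minimal sketch of that shape
(the function name and fault-handler plumbing are illustrative, not
lifted from this patch):

#include <linux/mm.h>	/* assumed kernel context for the sketch */

/*
 * Sketch only: install a freshly faulted-in anonymous page with the
 * accessed (and, for writable VMAs, dirty) bits pre-set, so the first
 * touch does not have to update the pte again.
 */
static void example_install_pte(struct vm_fault *vmf, struct page *page)
{
	struct vm_area_struct *vma = vmf->vma;
	pte_t entry;

	entry = mk_pte(page, vma->vm_page_prot);
	entry = pte_mkyoung(entry);		/* accessed up front */
	if (vma->vm_flags & VM_WRITE)
		entry = pte_mkwrite(pte_mkdirty(entry));

	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
	update_mmu_cache(vma, vmf->address, vmf->pte);
}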

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/mips/include/asm/pgtable.h |  2 --
 include/linux/pgtable.h         | 16 ----------------
 mm/huge_memory.c                |  4 ++--
 mm/memory.c                     | 14 +++++++-------
 mm/migrate.c                    |  1 +
 mm/shmem.c                      |  1 +
 mm/userfaultfd.c                |  2 +-
 7 files changed, 12 insertions(+), 28 deletions(-)

Comments

Hugh Dickins Dec. 21, 2020, 6:21 p.m. UTC | #1
Hi Nick,

On Sun, 20 Dec 2020, Nicholas Piggin wrote:

> Similarly to the previous patch, this tries to optimise dirty/accessed
> bits in ptes to avoid access costs of hardware setting them.
> 
> This tidies up a few last cases where dirty/accessed faults can be seen,
> and subsumes the pte_sw_mkyoung helper -- it's not just architectures
> with explicit software dirty/accessed bits that take expensive faults to
> modify ptes.
> 
> The vast majority of the remaining dirty/accessed faults on kbuild
> workloads after this patch are from NUMA migration, due to
> remove_migration_pte inserting old/clean ptes.

Are you sure about this patch? It looks wrong to me: because isn't
_PAGE_ACCESSED (young) already included in the vm_page_prot __S001 etc?

I haven't checked all instances below, but in general, that's why the
existing code tends not to bother to mkyoung, but does sometimes mkold
(admittedly confusing).

A quick check on x86 and powerpc looks like they do have _PAGE_ACCESSED
in there; and IIRC that's true, or ought to be true, of all architectures.
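
To illustrate the point (simplified, not a verbatim quote of the
headers, and macro spellings vary between architectures and kernel
versions): the vm_page_prot that mk_pte() uses is looked up from
protection_map[], and on x86 the entries behind __S001 etc. resolve to
PAGE_* values that already contain _PAGE_ACCESSED, so the resulting pte
is young without any explicit pte_mkyoung():

/* x86-style read-only protection, accessed bit already present */
#define EXAMPLE_PAGE_READONLY \
	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)

/* roughly how mm/mmap.c derives vma->vm_page_prot from the vma flags */
static pgprot_t example_vm_get_page_prot(unsigned long vm_flags)
{
	return protection_map[vm_flags &
			      (VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)];
}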

Maybe not mips, which I see you have singled out: I remember going round
this loop a few months ago with mips, where strange changes were being
proposed to compensate for not having that bit in their vm_page_prot.
I didn't follow through to see how that ended up, but I did suggest
mips needed to follow the same convention as the other architectures.

Hugh

> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>  arch/mips/include/asm/pgtable.h |  2 --
>  include/linux/pgtable.h         | 16 ----------------
>  mm/huge_memory.c                |  4 ++--
>  mm/memory.c                     | 14 +++++++-------
>  mm/migrate.c                    |  1 +
>  mm/shmem.c                      |  1 +
>  mm/userfaultfd.c                |  2 +-
>  7 files changed, 12 insertions(+), 28 deletions(-)
> 
> diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
> index 4f9c37616d42..3275495adccb 100644
> --- a/arch/mips/include/asm/pgtable.h
> +++ b/arch/mips/include/asm/pgtable.h
> @@ -406,8 +406,6 @@ static inline pte_t pte_mkyoung(pte_t pte)
>  	return pte;
>  }
>  
> -#define pte_sw_mkyoung	pte_mkyoung
> -
>  #ifdef CONFIG_MIPS_HUGE_TLB_SUPPORT
>  static inline int pte_huge(pte_t pte)	{ return pte_val(pte) & _PAGE_HUGE; }
>  
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index 8fcdfa52eb4b..70d04931dff4 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -424,22 +424,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
>  }
>  #endif
>  
> -/*
> - * On some architectures hardware does not set page access bit when accessing
> - * memory page, it is responsibilty of software setting this bit. It brings
> - * out extra page fault penalty to track page access bit. For optimization page
> - * access bit can be set during all page fault flow on these arches.
> - * To be differentiate with macro pte_mkyoung, this macro is used on platforms
> - * where software maintains page access bit.
> - */
> -#ifndef pte_sw_mkyoung
> -static inline pte_t pte_sw_mkyoung(pte_t pte)
> -{
> -	return pte;
> -}
> -#define pte_sw_mkyoung	pte_sw_mkyoung
> -#endif
> -
>  #ifndef pte_savedwrite
>  #define pte_savedwrite pte_write
>  #endif
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index f2ca0326b5af..f6719312dc27 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2151,8 +2151,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  			entry = maybe_mkwrite(entry, vma);
>  			if (!write)
>  				entry = pte_wrprotect(entry);
> -			if (!young)
> -				entry = pte_mkold(entry);
> +			if (young)
> +				entry = pte_mkyoung(entry);
>  			if (soft_dirty)
>  				entry = pte_mksoft_dirty(entry);
>  			if (uffd_wp)
> diff --git a/mm/memory.c b/mm/memory.c
> index dd1f364d8ca3..4cebba596660 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1639,7 +1639,7 @@ static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
>  	get_page(page);
>  	inc_mm_counter_fast(mm, mm_counter_file(page));
>  	page_add_file_rmap(page, false);
> -	set_pte_at(mm, addr, pte, mk_pte(page, prot));
> +	set_pte_at(mm, addr, pte, pte_mkyoung(mk_pte(page, prot)));
>  	return 0;
>  }
>  
> @@ -1954,10 +1954,9 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
>  	else
>  		entry = pte_mkspecial(pfn_t_pte(pfn, prot));
>  
> -	if (mkwrite) {
> -		entry = pte_mkyoung(entry);
> +	entry = pte_mkyoung(entry);
> +	if (mkwrite)
>  		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> -	}
>  
>  	set_pte_at(mm, addr, pte, entry);
>  	update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
> @@ -2889,7 +2888,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
>  		}
>  		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
>  		entry = mk_pte(new_page, vma->vm_page_prot);
> -		entry = pte_sw_mkyoung(entry);
> +		entry = pte_mkyoung(entry);
>  		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>  		/*
>  		 * Clear the pte entry and flush it first, before updating the
> @@ -3402,6 +3401,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
>  	dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
>  	pte = mk_pte(page, vma->vm_page_prot);
> +	pte = pte_mkyoung(pte);
>  	if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
>  		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
>  		vmf->flags &= ~FAULT_FLAG_WRITE;
> @@ -3545,7 +3545,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>  	__SetPageUptodate(page);
>  
>  	entry = mk_pte(page, vma->vm_page_prot);
> -	entry = pte_sw_mkyoung(entry);
> +	entry = pte_mkyoung(entry);
>  	if (vma->vm_flags & VM_WRITE)
>  		entry = pte_mkwrite(pte_mkdirty(entry));
>  
> @@ -3821,7 +3821,7 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
>  
>  	flush_icache_page(vma, page);
>  	entry = mk_pte(page, vma->vm_page_prot);
> -	entry = pte_sw_mkyoung(entry);
> +	entry = pte_mkyoung(entry);
>  	if (write)
>  		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>  	/* copy-on-write page */
> diff --git a/mm/migrate.c b/mm/migrate.c
> index ee5e612b4cd8..d33b2bfc846b 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -2963,6 +2963,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
>  		}
>  	} else {
>  		entry = mk_pte(page, vma->vm_page_prot);
> +		entry = pte_mkyoung(entry);
>  		if (vma->vm_flags & VM_WRITE)
>  			entry = pte_mkwrite(pte_mkdirty(entry));
>  	}
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 7c6b6d8f6c39..4f23b16d6baf 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -2420,6 +2420,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
>  		goto out_release;
>  
>  	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
> +	_dst_pte = pte_mkyoung(_dst_pte);
>  	if (dst_vma->vm_flags & VM_WRITE)
>  		_dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
>  	else {
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index 9a3d451402d7..56c44aa06a7e 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -99,7 +99,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
>  	if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL))
>  		goto out_release;
>  
> -	_dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot));
> +	_dst_pte = pte_mkdirty(pte_mkyoung(mk_pte(page, dst_vma->vm_page_prot)));
>  	if (dst_vma->vm_flags & VM_WRITE) {
>  		if (wp_copy)
>  			_dst_pte = pte_mkuffd_wp(_dst_pte);
> -- 
> 2.23.0
Nicholas Piggin Dec. 22, 2020, 3:24 a.m. UTC | #2
Excerpts from Hugh Dickins's message of December 22, 2020 4:21 am:
> Hi Nick,
> 
> On Sun, 20 Dec 2020, Nicholas Piggin wrote:
> 
>> Similarly to the previous patch, this tries to optimise dirty/accessed
>> bits in ptes to avoid access costs of hardware setting them.
>> 
>> This tidies up a few last cases where dirty/accessed faults can be seen,
>> and subsumes the pte_sw_mkyoung helper -- it's not just architectures
>> with explicit software dirty/accessed bits that take expensive faults to
>> modify ptes.
>> 
>> The vast majority of the remaining dirty/accessed faults on kbuild
>> workloads after this patch are from NUMA migration, due to
>> remove_migration_pte inserting old/clean ptes.
> 
> Are you sure about this patch? It looks wrong to me: because isn't
> _PAGE_ACCESSED (young) already included in the vm_page_prot __S001 etc?
> 
> I haven't checked all instances below, but in general, that's why the
> existing code tends not to bother to mkyoung, but does sometimes mkold
> (admittedly confusing).

There might have been one or two cases where it didn't come directly
from vm_page_prot, but it was a few rebases and updates ago. I did see
one or two places where powerpc was taking faults. Good point though, I
can test again and see, and I might split the patch.

> 
> A quick check on x86 and powerpc looks like they do have _PAGE_ACCESSED
> in there; and IIRC that's true, or ought to be true, of all architectures.
> 
> Maybe not mips, which I see you have singled out: I remember going round
> this loop a few months ago with mips, where strange changes were being
> proposed to compensate for not having that bit in their vm_page_prot.
> I didn't follow through to see how that ended up, but I did suggest
> mips needed to follow the same convention as the other architectures.

Yeah the main thing is to try to get all architectures doing the same thing
and get rid of that sw_mkyoung too. Given the (Intel) x86 result of the
heavy micro-fault, I don't think anybody is special and we should 
require them to follow the same behaviour unless it's proven that one
needs something different.

If we can get all arch to set accessed in vm_page_prot and get rid of 
most of these mkyoung()s then all the better. And it actually looks like
MIPS may be changing direction:

https://lore.kernel.org/linux-arch/20200919074731.22372-1-huangpei@loongson.cn/

What are the chances of that going upstream for the next merge window? It
seems like the right thing to do.

Thanks,
Nick
Huang Pei Dec. 23, 2020, 12:56 a.m. UTC | #3
Hi, 
On Tue, Dec 22, 2020 at 01:24:53PM +1000, Nicholas Piggin wrote:
> Excerpts from Hugh Dickins's message of December 22, 2020 4:21 am:
> > Hi Nick,
> > 
> > On Sun, 20 Dec 2020, Nicholas Piggin wrote:
> > 
> >> Similarly to the previous patch, this tries to optimise dirty/accessed
> >> bits in ptes to avoid access costs of hardware setting them.
> >> 
> >> This tidies up a few last cases where dirty/accessed faults can be seen,
> >> and subsumes the pte_sw_mkyoung helper -- it's not just architectures
> >> with explicit software dirty/accessed bits that take expensive faults to
> >> modify ptes.
> >> 
> >> The vast majority of the remaining dirty/accessed faults on kbuild
> >> workloads after this patch are from NUMA migration, due to
> >> remove_migration_pte inserting old/clean ptes.
> > 
> > Are you sure about this patch? It looks wrong to me: because isn't
> > _PAGE_ACCESSED (young) already included in the vm_page_prot __S001 etc?
> > 
> > I haven't checked all instances below, but in general, that's why the
> > existing code tends not to bother to mkyoung, but does sometimes mkold
> > (admittedly confusing).
> 
> There might have been one or two cases where it didn't come directly
> from vm_page_prot, but it was a few rebases and updates ago. I did see
> one or two places where powerpc was taking faults. Good point though, I
> can test again and see, and I might split the patch.
> 
> > 
> > A quick check on x86 and powerpc looks like they do have _PAGE_ACCESSED
> > in there; and IIRC that's true, or ought to be true, of all architectures.
> > 
> > Maybe not mips, which I see you have singled out: I remember going round
> > this loop a few months ago with mips, where strange changes were being
> > proposed to compensate for not having that bit in their vm_page_prot.
> > I didn't follow through to see how that ended up, but I did suggest
> > mips needed to follow the same convention as the other architectures.
> 
> Yeah the main thing is to try to get all architectures doing the same thing
> and get rid of that sw_mkyoung too. Given the (Intel) x86 result of the
> heavy micro-fault, I don't think anybody is special and we should 
> require them to follow the same behaviour unless it's proven that one
> needs something different.
> 
> If we can get all arch to set accessed in vm_page_prot and get rid of 
> most of these mkyoung()s then all the better. And it actually looks like
> MIPS may be changing direction:
> 
> https://lore.kernel.org/linux-arch/20200919074731.22372-1-huangpei@loongson.cn/
> 
> What are the chances of that going upstream for the next merge window? It
> seems like the right thing to do.

Any comment on V4:

https://lore.kernel.org/linux-arch/20201019081257.32127-1-huangpei@loongson.cn/

> 
> Thanks,
> Nick

Thanks,
Huang Pei

Patch

diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 4f9c37616d42..3275495adccb 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -406,8 +406,6 @@  static inline pte_t pte_mkyoung(pte_t pte)
 	return pte;
 }
 
-#define pte_sw_mkyoung	pte_mkyoung
-
 #ifdef CONFIG_MIPS_HUGE_TLB_SUPPORT
 static inline int pte_huge(pte_t pte)	{ return pte_val(pte) & _PAGE_HUGE; }
 
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 8fcdfa52eb4b..70d04931dff4 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -424,22 +424,6 @@  static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
 }
 #endif
 
-/*
- * On some architectures hardware does not set page access bit when accessing
- * memory page, it is responsibilty of software setting this bit. It brings
- * out extra page fault penalty to track page access bit. For optimization page
- * access bit can be set during all page fault flow on these arches.
- * To be differentiate with macro pte_mkyoung, this macro is used on platforms
- * where software maintains page access bit.
- */
-#ifndef pte_sw_mkyoung
-static inline pte_t pte_sw_mkyoung(pte_t pte)
-{
-	return pte;
-}
-#define pte_sw_mkyoung	pte_sw_mkyoung
-#endif
-
 #ifndef pte_savedwrite
 #define pte_savedwrite pte_write
 #endif
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f2ca0326b5af..f6719312dc27 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2151,8 +2151,8 @@  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 			entry = maybe_mkwrite(entry, vma);
 			if (!write)
 				entry = pte_wrprotect(entry);
-			if (!young)
-				entry = pte_mkold(entry);
+			if (young)
+				entry = pte_mkyoung(entry);
 			if (soft_dirty)
 				entry = pte_mksoft_dirty(entry);
 			if (uffd_wp)
diff --git a/mm/memory.c b/mm/memory.c
index dd1f364d8ca3..4cebba596660 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1639,7 +1639,7 @@  static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
 	get_page(page);
 	inc_mm_counter_fast(mm, mm_counter_file(page));
 	page_add_file_rmap(page, false);
-	set_pte_at(mm, addr, pte, mk_pte(page, prot));
+	set_pte_at(mm, addr, pte, pte_mkyoung(mk_pte(page, prot)));
 	return 0;
 }
 
@@ -1954,10 +1954,9 @@  static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 	else
 		entry = pte_mkspecial(pfn_t_pte(pfn, prot));
 
-	if (mkwrite) {
-		entry = pte_mkyoung(entry);
+	entry = pte_mkyoung(entry);
+	if (mkwrite)
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-	}
 
 	set_pte_at(mm, addr, pte, entry);
 	update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
@@ -2889,7 +2888,7 @@  static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 		}
 		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
 		entry = mk_pte(new_page, vma->vm_page_prot);
-		entry = pte_sw_mkyoung(entry);
+		entry = pte_mkyoung(entry);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		/*
 		 * Clear the pte entry and flush it first, before updating the
@@ -3402,6 +3401,7 @@  vm_fault_t do_swap_page(struct vm_fault *vmf)
 	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
 	dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
 	pte = mk_pte(page, vma->vm_page_prot);
+	pte = pte_mkyoung(pte);
 	if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
 		vmf->flags &= ~FAULT_FLAG_WRITE;
@@ -3545,7 +3545,7 @@  static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	__SetPageUptodate(page);
 
 	entry = mk_pte(page, vma->vm_page_prot);
-	entry = pte_sw_mkyoung(entry);
+	entry = pte_mkyoung(entry);
 	if (vma->vm_flags & VM_WRITE)
 		entry = pte_mkwrite(pte_mkdirty(entry));
 
@@ -3821,7 +3821,7 @@  vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
 
 	flush_icache_page(vma, page);
 	entry = mk_pte(page, vma->vm_page_prot);
-	entry = pte_sw_mkyoung(entry);
+	entry = pte_mkyoung(entry);
 	if (write)
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 	/* copy-on-write page */
diff --git a/mm/migrate.c b/mm/migrate.c
index ee5e612b4cd8..d33b2bfc846b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2963,6 +2963,7 @@  static void migrate_vma_insert_page(struct migrate_vma *migrate,
 		}
 	} else {
 		entry = mk_pte(page, vma->vm_page_prot);
+		entry = pte_mkyoung(entry);
 		if (vma->vm_flags & VM_WRITE)
 			entry = pte_mkwrite(pte_mkdirty(entry));
 	}
diff --git a/mm/shmem.c b/mm/shmem.c
index 7c6b6d8f6c39..4f23b16d6baf 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2420,6 +2420,7 @@  static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
 		goto out_release;
 
 	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
+	_dst_pte = pte_mkyoung(_dst_pte);
 	if (dst_vma->vm_flags & VM_WRITE)
 		_dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
 	else {
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 9a3d451402d7..56c44aa06a7e 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -99,7 +99,7 @@  static int mcopy_atomic_pte(struct mm_struct *dst_mm,
 	if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL))
 		goto out_release;
 
-	_dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot));
+	_dst_pte = pte_mkdirty(pte_mkyoung(mk_pte(page, dst_vma->vm_page_prot)));
 	if (dst_vma->vm_flags & VM_WRITE) {
 		if (wp_copy)
 			_dst_pte = pte_mkuffd_wp(_dst_pte);