
[v2] mm: memory: check userfaultfd_wp() in vmf_orig_pte_uffd_wp()

Message ID 20240418120641.2653165-1-wangkefeng.wang@huawei.com (mailing list archive)
State New
Series: [v2] mm: memory: check userfaultfd_wp() in vmf_orig_pte_uffd_wp()

Commit Message

Kefeng Wang April 18, 2024, 12:06 p.m. UTC
Add a userfaultfd_wp() check in vmf_orig_pte_uffd_wp() to avoid the
unnecessary pte_marker_entry_uffd_wp() check in most page faults; the
difference shows up in the lat_pagefault perf data below.  Note that
vmf_orig_pte_uffd_wp() is not inlined in either kernel version.

  perf report -i perf.data.before | grep vmf
     0.17%     0.13%  lat_pagefault  [kernel.kallsyms]      [k] vmf_orig_pte_uffd_wp.part.0.isra.0
  perf report -i perf.data.after  | grep vmf

In addition, call vmf_orig_pte_uffd_wp() directly in do_anonymous_page()
and set_pte_range() to drop the local uffd_wp variable.

Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
v2: update changelog

 mm/memory.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

Comments

Peter Xu April 18, 2024, 4:32 p.m. UTC | #1
Hi, Kefeng,

On Thu, Apr 18, 2024 at 08:06:41PM +0800, Kefeng Wang wrote:
> Add userfaultfd_wp() check in vmf_orig_pte_uffd_wp() to avoid the 
> unnecessary pte_marker_entry_uffd_wp() in most pagefault, difference
> as shows below from perf data of lat_pagefault, note, the function
> vmf_orig_pte_uffd_wp() is not inlined in the two kernel versions.
> 
>   perf report -i perf.data.before | grep vmf
>      0.17%     0.13%  lat_pagefault  [kernel.kallsyms]      [k] vmf_orig_pte_uffd_wp.part.0.isra.0
>   perf report -i perf.data.after  | grep vmf

Any real numbers to share besides the perf greps?  I mean, even if perf
report no longer shows that function, it doesn't mean it'll be faster,
and by how much does it improve?

Now we're switching from a pte_marker_uffd_wp() check to a vma flag check.
I think it makes more sense to compare the numbers rather than the perf
reports, as the vma flag check instructions will be buried under other
entries IIUC.
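
For reference, userfaultfd_wp() boils down to a single vm_flags bit test,
so the new early return is about as cheap as a check can get.  A simplified
sketch (the in-tree helper carries extra CONFIG_* guards, so treat this as
illustrative only):

   static inline bool userfaultfd_wp(struct vm_area_struct *vma)
   {
           /* one flag test, no pte decoding */
           return vma->vm_flags & VM_UFFD_WP;
   }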

Thanks,

> 
> In addition, directly call vmf_orig_pte_uffd_wp() in do_anonymous_page()
> and set_pte_range() to save a uffd_wp variable.
> 
> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
> ---
> v2: update changelog
> 
>  mm/memory.c | 8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/mm/memory.c b/mm/memory.c
> index 5ae2409d3cb9..2cf54def3995 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -116,6 +116,8 @@ static bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
>  {
>  	if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
>  		return false;
> +	if (!userfaultfd_wp(vmf->vma))
> +		return false;
>  
>  	return pte_marker_uffd_wp(vmf->orig_pte);
>  }
> @@ -4388,7 +4390,6 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
>   */
>  static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>  {
> -	bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
>  	struct vm_area_struct *vma = vmf->vma;
>  	unsigned long addr = vmf->address;
>  	struct folio *folio;
> @@ -4488,7 +4489,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>  	folio_add_new_anon_rmap(folio, vma, addr);
>  	folio_add_lru_vma(folio, vma);
>  setpte:
> -	if (uffd_wp)
> +	if (vmf_orig_pte_uffd_wp(vmf))
>  		entry = pte_mkuffd_wp(entry);
>  	set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages);
>  
> @@ -4663,7 +4664,6 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
>  		struct page *page, unsigned int nr, unsigned long addr)
>  {
>  	struct vm_area_struct *vma = vmf->vma;
> -	bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
>  	bool write = vmf->flags & FAULT_FLAG_WRITE;
>  	bool prefault = in_range(vmf->address, addr, nr * PAGE_SIZE);
>  	pte_t entry;
> @@ -4678,7 +4678,7 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
>  
>  	if (write)
>  		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> -	if (unlikely(uffd_wp))
> +	if (unlikely(vmf_orig_pte_uffd_wp(vmf)))
>  		entry = pte_mkuffd_wp(entry);
>  	/* copy-on-write page */
>  	if (write && !(vma->vm_flags & VM_SHARED)) {
> -- 
> 2.27.0
>
Kefeng Wang April 19, 2024, 3 a.m. UTC | #2
On 2024/4/19 0:32, Peter Xu wrote:
> Hi, Kefeng,
> 
> On Thu, Apr 18, 2024 at 08:06:41PM +0800, Kefeng Wang wrote:
>> Add userfaultfd_wp() check in vmf_orig_pte_uffd_wp() to avoid the
>> unnecessary pte_marker_entry_uffd_wp() in most pagefault, difference
>> as shows below from perf data of lat_pagefault, note, the function
>> vmf_orig_pte_uffd_wp() is not inlined in the two kernel versions.
>>
>>    perf report -i perf.data.before | grep vmf
>>       0.17%     0.13%  lat_pagefault  [kernel.kallsyms]      [k] vmf_orig_pte_uffd_wp.part.0.isra.0
>>    perf report -i perf.data.after  | grep vmf
> 
> Any real number to share too besides the perf greps?  I meant, even if perf
> report will not report such function anymore, it doesn't mean it'll be
> faster, and how much it improves?

dd if=/dev/zero of=/tmp/XXX bs=512M count=1
./lat_pagefault -W 5 -N 5 /tmp/XXX

	before		after	
1	0.2623		0.2605	
2	0.2622		0.2598	
3	0.2621		0.2595	
4	0.2622		0.2600	
5	0.2651		0.2598	
6	0.2624		0.2594	
7	0.2624		0.2605	
8	0.2627		0.2608	
average	0.262675	0.2600375	-0.0026375

The lat_pagefault numbers do show some improvement (I also rebooted and
retested; the results are the same).

> 
> Now we're switching from pte_marker_uffd_wp() check into a vma flag check.
> I think it makes more sense to compare the number rather than the perf
> reports, as the vma flag check instructions will be buried under other
> entries IIUC.
> 
> Thanks,
>
Peter Xu April 19, 2024, 3:17 p.m. UTC | #3
On Fri, Apr 19, 2024 at 11:00:46AM +0800, Kefeng Wang wrote:
> 
> 
> On 2024/4/19 0:32, Peter Xu wrote:
> > Hi, Kefeng,
> > 
> > On Thu, Apr 18, 2024 at 08:06:41PM +0800, Kefeng Wang wrote:
> > > Add userfaultfd_wp() check in vmf_orig_pte_uffd_wp() to avoid the
> > > unnecessary pte_marker_entry_uffd_wp() in most pagefault, difference
> > > as shows below from perf data of lat_pagefault, note, the function
> > > vmf_orig_pte_uffd_wp() is not inlined in the two kernel versions.
> > > 
> > >    perf report -i perf.data.before | grep vmf
> > >       0.17%     0.13%  lat_pagefault  [kernel.kallsyms]      [k] vmf_orig_pte_uffd_wp.part.0.isra.0
> > >    perf report -i perf.data.after  | grep vmf
> > 
> > Any real number to share too besides the perf greps?  I meant, even if perf
> > report will not report such function anymore, it doesn't mean it'll be
> > faster, and how much it improves?
> 
> dd if=/dev/zero of=/tmp/XXX bs=512M count=1
> ./lat_pagefault -W 5 -N 5 /tmp/XXX
> 
> 	before		after	
> 1	0.2623		0.2605	
> 2	0.2622		0.2598	
> 3	0.2621		0.2595	
> 4	0.2622		0.2600	
> 5	0.2651		0.2598	
> 6	0.2624		0.2594	
> 7	0.2624		0.2605	
> 8	0.2627		0.2608	
> average	0.262675	0.2600375	-0.0026375
> 
> The lat_pagefault does show some improvement(also I reboot and retest,
> the results are same).

Thanks. Could you replace the perf report with these real data?  Or just
append to it.

I had a look at the asm, and indeed the current code generates two
jumps without this patch, and I don't know why..

   0x0000000000006ac4 <+52>:    test   $0x8,%ah                 <---- check FAULT_FLAG_ORIG_PTE_VALID
   0x0000000000006ac7 <+55>:    jne    0x6bcf <set_pte_range+319>
   0x0000000000006acd <+61>:    mov    0x18(%rbp),%rsi

   ...

   0x0000000000006bcf <+319>:   mov    0x40(%rdi),%rdi
   0x0000000000006bd3 <+323>:   test   $0xffffffffffffff9f,%rdi <---- pte_none() check
   0x0000000000006bda <+330>:   je     0x6acd <set_pte_range+61>
   0x0000000000006be0 <+336>:   test   $0x101,%edi              <---- pte_present() check
   0x0000000000006be6 <+342>:   jne    0x6acd <set_pte_range+61>
   0x0000000000006bec <+348>:   call   0x1c50 <pte_to_swp_entry>
   0x0000000000006bf1 <+353>:   mov    0x0(%rip),%rdx        # 0x6bf8 <set_pte_range+360>
   0x0000000000006bf8 <+360>:   mov    %rax,%r15
   0x0000000000006bfb <+363>:   shr    $0x3a,%rax
   0x0000000000006bff <+367>:   cmp    $0x1f,%rax
   0x0000000000006c03 <+371>:   mov    $0x0,%eax
   0x0000000000006c08 <+376>:   cmovne %rax,%r15
   0x0000000000006c0c <+380>:   mov    0x28(%rbx),%eax
   0x0000000000006c0f <+383>:   and    $0x1,%r15d
   0x0000000000006c13 <+387>:   jmp    0x6acd <set_pte_range+61>
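
The two extra jumps match the pte-marker helpers that the old path has to
walk before it can give up.  Roughly (simplified; the real definitions sit
behind CONFIG_PTE_MARKER_UFFD_WP guards), pte_marker_uffd_wp(vmf->orig_pte)
expands to something like:

   static inline int is_swap_pte(pte_t pte)
   {
           /* the two tests/jumps annotated above */
           return !pte_none(pte) && !pte_present(pte);
   }

   static inline bool pte_marker_uffd_wp(pte_t pte)
   {
           if (!is_swap_pte(pte))
                   return false;

           /* only reached for swap ptes, hence the pte_to_swp_entry() call */
           return pte_marker_entry_uffd_wp(pte_to_swp_entry(pte));
   }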

I also don't know why the compiler cannot already merge the none+present
check into one shot; I thought it could.  It also surprised me that
pte_to_swp_entry() is a function call.. though it is not involved in this
context.

So I think I was right that it should bypass this when it sees pte_none;
however, that includes two jumps.

And with your patch applied the two jumps are not there:

   0x0000000000006b0c <+124>:   testb  $0x8,0x29(%r14)           <--- FAULT_FLAG_ORIG_PTE_VALID
   0x0000000000006b11 <+129>:   je     0x6b6a <set_pte_range+218>
   0x0000000000006b13 <+131>:   mov    (%r14),%rax
   0x0000000000006b16 <+134>:   testb  $0x10,0x21(%rax)          <--- userfaultfd_wp(vmf->vma) check
   0x0000000000006b1a <+138>:   je     0x6b6a <set_pte_range+218>

Maybe that's what contributes to that 0.x% extra time of a fault.

So if we do care about this 0.x% and we're doing this anyway, perhaps move
the vma check up?  Because afaict FAULT_FLAG_ORIG_PTE_VALID should
always hit in set_pte_range(), so we can avoid two more insts in the common
paths.

I'll leave that to you too, in case you want to mention some of the details
above and add them to the commit log.

Thanks,
Kefeng Wang April 20, 2024, 4:05 a.m. UTC | #4
On 2024/4/19 23:17, Peter Xu wrote:
> On Fri, Apr 19, 2024 at 11:00:46AM +0800, Kefeng Wang wrote:
>>
>>
>> On 2024/4/19 0:32, Peter Xu wrote:
>>> Hi, Kefeng,
>>>
>>> On Thu, Apr 18, 2024 at 08:06:41PM +0800, Kefeng Wang wrote:
>>>> Add userfaultfd_wp() check in vmf_orig_pte_uffd_wp() to avoid the
>>>> unnecessary pte_marker_entry_uffd_wp() in most pagefault, difference
>>>> as shows below from perf data of lat_pagefault, note, the function
>>>> vmf_orig_pte_uffd_wp() is not inlined in the two kernel versions.
>>>>
>>>>     perf report -i perf.data.before | grep vmf
>>>>        0.17%     0.13%  lat_pagefault  [kernel.kallsyms]      [k] vmf_orig_pte_uffd_wp.part.0.isra.0
>>>>     perf report -i perf.data.after  | grep vmf
>>>
>>> Any real number to share too besides the perf greps?  I meant, even if perf
>>> report will not report such function anymore, it doesn't mean it'll be
>>> faster, and how much it improves?
>>
>> dd if=/dev/zero of=/tmp/XXX bs=512M count=1
>> ./lat_pagefault -W 5 -N 5 /tmp/XXX
>>
>> 	before		after	
>> 1	0.2623		0.2605	
>> 2	0.2622		0.2598	
>> 3	0.2621		0.2595	
>> 4	0.2622		0.2600	
>> 5	0.2651		0.2598	
>> 6	0.2624		0.2594	
>> 7	0.2624		0.2605	
>> 8	0.2627		0.2608	
>> average	0.262675	0.2600375	-0.0026375
>>
>> The lat_pagefault does show some improvement(also I reboot and retest,
>> the results are same).
> 
> Thanks. Could you replace the perf report with these real data?  Or just
> append to it.

Sure, will append it.

> 
> I had a look at the asm and indeed the current code will generate two
> jumps when without this patch, and I don't know why..
> 
>     0x0000000000006ac4 <+52>:    test   $0x8,%ah                 <---- check FAULT_FLAG_ORIG_PTE_VALID
>     0x0000000000006ac7 <+55>:    jne    0x6bcf <set_pte_range+319>
>     0x0000000000006acd <+61>:    mov    0x18(%rbp),%rsi
> 
>     ...
> 
>     0x0000000000006bcf <+319>:   mov    0x40(%rdi),%rdi
>     0x0000000000006bd3 <+323>:   test   $0xffffffffffffff9f,%rdi <---- pte_none() check
>     0x0000000000006bda <+330>:   je     0x6acd <set_pte_range+61>
>     0x0000000000006be0 <+336>:   test   $0x101,%edi              <---- pte_present() check
>     0x0000000000006be6 <+342>:   jne    0x6acd <set_pte_range+61>
>     0x0000000000006bec <+348>:   call   0x1c50 <pte_to_swp_entry>
>     0x0000000000006bf1 <+353>:   mov    0x0(%rip),%rdx        # 0x6bf8 <set_pte_range+360>
>     0x0000000000006bf8 <+360>:   mov    %rax,%r15
>     0x0000000000006bfb <+363>:   shr    $0x3a,%rax
>     0x0000000000006bff <+367>:   cmp    $0x1f,%rax
>     0x0000000000006c03 <+371>:   mov    $0x0,%eax
>     0x0000000000006c08 <+376>:   cmovne %rax,%r15
>     0x0000000000006c0c <+380>:   mov    0x28(%rbx),%eax
>     0x0000000000006c0f <+383>:   and    $0x1,%r15d
>     0x0000000000006c13 <+387>:   jmp    0x6acd <set_pte_range+61>
> 
> I also don't know why the compiler cannot already merge the none+present
> check into one shot, I thought it could.  Also surprised me that
> pte_to_swp_entry() is a function call.. but not involved in this context.
> 
> So I think I was right it should bypass this when seeing it pte_none,
> however that includes two jumps.
> 
> And with your patch applied the two jumps are not there:
> 
>     0x0000000000006b0c <+124>:   testb  $0x8,0x29(%r14)           <--- FAULT_FLAG_ORIG_PTE_VALID
>     0x0000000000006b11 <+129>:   je     0x6b6a <set_pte_range+218>
>     0x0000000000006b13 <+131>:   mov    (%r14),%rax
>     0x0000000000006b16 <+134>:   testb  $0x10,0x21(%rax)          <--- userfaultfd_wp(vmf->vma) check
>     0x0000000000006b1a <+138>:   je     0x6b6a <set_pte_range+218>
> 
> Maybe that's what contributes to that 0.x% extra time of a fault.
> 
> So if we do care about this 0.x% and we're doing this anyway, perhaps move

The latency of lat_pagefault increased a lot compared with the old kernel
(vs 5.10); apart from the mm counter updating, the other obvious difference
shown in the perf graph is the new vmf_orig_pte_uffd_wp().

> the vma check upper?  Because afaict FAULT_FLAG_ORIG_PTE_VALID should
> always hit in set_pte_range(), so we can avoid two more insts in the common
> paths.

Moving it up is better, and maybe add __always_inline to
vmf_orig_pte_uffd_wp() so that set_pte_range() only checks VM_UFFD_WP
from vm_flags?
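
Put together, a rough sketch of the reordering plus __always_inline could
look like the following (illustrative only, not the actual follow-up patch):

   static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
   {
           /* cheap vma flag test first, so non-uffd-wp faults bail out early */
           if (!userfaultfd_wp(vmf->vma))
                   return false;
           if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
                   return false;

           return pte_marker_uffd_wp(vmf->orig_pte);
   }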

> 
> I'll leave that to you too if you want to mention some details in above and
> add that into the commit log.

Will update the changelog, thanks.
> 
> Thanks,
>
Peter Xu April 21, 2024, 1:53 p.m. UTC | #5
On Sat, Apr 20, 2024 at 12:05:04PM +0800, Kefeng Wang wrote:
> The latency of lat_pagefault increased a lot than the old kernel(vs 5.10),
> except mm counter updating, the another obvious difference
> shown from perf graph is the new vmf_orig_pte_uffd_wp().

Curious how different it is.

I wanted to give it a quick shot with lmbench but it fails on a missing
rpc.h, at least for the Intel repo.  I think it's because of the libc
change that dropped it.  Are you using a separate repo that has all of this
fixed up?

> Moving it upper is better, and maybe add __always_inline to
> vmf_orig_pte_uffd_wp() to make set_pte_range() only check VM_UFFD_WP from
> vm_flags?

Sounds good here, thanks.
Kefeng Wang April 22, 2024, 2:13 a.m. UTC | #6
On 2024/4/21 21:53, Peter Xu wrote:
> On Sat, Apr 20, 2024 at 12:05:04PM +0800, Kefeng Wang wrote:
>> The latency of lat_pagefault increased a lot than the old kernel(vs 5.10),
>> except mm counter updating, the another obvious difference
>> shown from perf graph is the new vmf_orig_pte_uffd_wp().
> 
> Curious how different it is.
> 

It is not big, as shown in perf, only 0.1x% of the whole test, but it is
newly added compared with the old kernel.

Please check the attached perf_0417_pagefault_x86.svg [with batch mm counter].


> I wanted to give it a quick shot over lmbench but fails on missing rpc.h,
> at least for the Intel repo.  I think it's because of the libc change to
> drop that.  Are you using a separate repo that fixed all things up?

After yum install libtirpc-devel, I can build it.

> 
>> Moving it upper is better, and maybe add __always_inline to
>> vmf_orig_pte_uffd_wp() to make set_pte_range() only check VM_UFFD_WP from
>> vm_flags?
> 
> Sounds good here, thanks.

OK, will update it too.

Thanks.
>

Patch

diff --git a/mm/memory.c b/mm/memory.c
index 5ae2409d3cb9..2cf54def3995 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -116,6 +116,8 @@  static bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
 {
 	if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
 		return false;
+	if (!userfaultfd_wp(vmf->vma))
+		return false;
 
 	return pte_marker_uffd_wp(vmf->orig_pte);
 }
@@ -4388,7 +4390,6 @@  static struct folio *alloc_anon_folio(struct vm_fault *vmf)
  */
 static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 {
-	bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
 	struct vm_area_struct *vma = vmf->vma;
 	unsigned long addr = vmf->address;
 	struct folio *folio;
@@ -4488,7 +4489,7 @@  static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	folio_add_new_anon_rmap(folio, vma, addr);
 	folio_add_lru_vma(folio, vma);
 setpte:
-	if (uffd_wp)
+	if (vmf_orig_pte_uffd_wp(vmf))
 		entry = pte_mkuffd_wp(entry);
 	set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages);
 
@@ -4663,7 +4664,6 @@  void set_pte_range(struct vm_fault *vmf, struct folio *folio,
 		struct page *page, unsigned int nr, unsigned long addr)
 {
 	struct vm_area_struct *vma = vmf->vma;
-	bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
 	bool prefault = in_range(vmf->address, addr, nr * PAGE_SIZE);
 	pte_t entry;
@@ -4678,7 +4678,7 @@  void set_pte_range(struct vm_fault *vmf, struct folio *folio,
 
 	if (write)
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-	if (unlikely(uffd_wp))
+	if (unlikely(vmf_orig_pte_uffd_wp(vmf)))
 		entry = pte_mkuffd_wp(entry);
 	/* copy-on-write page */
 	if (write && !(vma->vm_flags & VM_SHARED)) {