
[RFC,01/14] userfaultfd: set dirty and young on writeprotect

Message ID 20220718120212.3180-2-namit@vmware.com (mailing list archive)
State New
Series [RFC,01/14] userfaultfd: set dirty and young on writeprotect

Commit Message

Nadav Amit July 18, 2022, 12:01 p.m. UTC
From: Nadav Amit <namit@vmware.com>

When userfaultfd makes a PTE writable, it can now change the PTE
directly, in some cases, without triggering a page-fault first. Yet,
doing so might leave the PTE that was write-unprotected as old and
clean. At least on x86, this would cause an overhead of more than 500
cycles when the PTE is first accessed.

Use MM_CP_WILL_NEED to set the PTE as young and dirty when userfaultfd
gets a hint that the page is likely to be used. Avoid making the PTE
young and dirty in other cases, so as not to cause excessive writeback
or interfere with the page-reclamation logic.

Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Nick Piggin <npiggin@gmail.com>
---
 include/linux/mm.h | 2 ++
 mm/mprotect.c      | 9 ++++++++-
 mm/userfaultfd.c   | 8 ++++++--
 3 files changed, 16 insertions(+), 3 deletions(-)
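
For context, userspace reaches this path through the UFFDIO_WRITEPROTECT
ioctl. A minimal sketch of how a monitor might pass the new hint when
write-unprotecting a range follows; UFFDIO_WRITEPROTECT_MODE_WRITE_LIKELY
is an assumed uAPI name mirroring the internal UFFD_FLAGS_WRITE_LIKELY
flag, and the series defines the actual bit:

    #include <linux/userfaultfd.h>
    #include <sys/ioctl.h>

    /* Write-unprotect [start, start + len), hinting that writes follow. */
    static int uffd_write_unprotect(int uffd, unsigned long start,
                                    unsigned long len)
    {
            struct uffdio_writeprotect wp = {
                    .range = { .start = start, .len = len },
                    .mode  = 0,     /* no _MODE_WP bit: resolve write-protection */
            };

    #ifdef UFFDIO_WRITEPROTECT_MODE_WRITE_LIKELY    /* assumed name, see above */
            wp.mode |= UFFDIO_WRITEPROTECT_MODE_WRITE_LIKELY;
    #endif
            return ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
    }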

Comments

Peter Xu July 19, 2022, 8:47 p.m. UTC | #1
On Mon, Jul 18, 2022 at 05:01:59AM -0700, Nadav Amit wrote:
> [...]
> @@ -172,6 +173,9 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
>  				ptent = pte_clear_uffd_wp(ptent);
>  			}
>  
> +			if (will_need)
> +				ptent = pte_mkyoung(ptent);

For the uffd path, UFFD_FLAGS_ACCESS_LIKELY|UFFD_FLAGS_WRITE_LIKELY are new
internal flags used with or without the new feature bit set.  It means even
with !ACCESS_HINT we'll start to set the young bit while we used not to?  Is
that some kind of a light ABI change?

I'd suggest we only set will_need if ACCESS_HINT is set.
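
Something like the following, roughly (UFFD_FLAGS_ACCESS_HINT here stands
in for whatever bit the new feature sets; the name is an assumption):

	/* only honor the hints when userspace enabled the new feature */
	cp_flags = enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE;
	if ((uffd_flags & UFFD_FLAGS_ACCESS_HINT) &&
	    (uffd_flags & (UFFD_FLAGS_ACCESS_LIKELY | UFFD_FLAGS_WRITE_LIKELY)))
		cp_flags |= MM_CP_WILL_NEED;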

> +
>  			/*
>  			 * In some writable, shared mappings, we might want
>  			 * to catch actual write access -- see
> @@ -187,8 +191,11 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
>  			 */
>  			if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
>  			    !pte_write(ptent) &&
> -			    can_change_pte_writable(vma, addr, ptent))
> +			    can_change_pte_writable(vma, addr, ptent)) {
>  				ptent = pte_mkwrite(ptent);
> +				if (will_need)
> +					ptent = pte_mkdirty(ptent);

Can we make this unconditional?  IOW to cover both:

  (1) When will_need is not set, or
  (2) mprotect() too

David's patch is good in that we merged the unprotect and CoW.  However
that's not complete because the dirty bit ops are missing.

Here IMHO we should have a standalone patch to just add the dirty bit into
this logic when we grant the write bit.  IMHO it'll make the write+dirty
bits coherent again in all paths.
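
I.e., roughly (an untested sketch of that standalone change):

			if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
			    !pte_write(ptent) &&
			    can_change_pte_writable(vma, addr, ptent)) {
				ptent = pte_mkwrite(ptent);
				/* keep write and dirty coherent when granting write */
				ptent = pte_mkdirty(ptent);
			}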

> +			}
>  
>  			ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
>  			if (pte_needs_flush(oldpte, ptent))
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index 954c6980b29f..e0492f5f06a0 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -749,6 +749,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
>  	bool enable_wp = uffd_flags & UFFD_FLAGS_WP;
>  	struct vm_area_struct *dst_vma;
>  	unsigned long page_mask;
> +	unsigned long cp_flags;
>  	struct mmu_gather tlb;
>  	pgprot_t newprot;
>  	int err;
> @@ -795,9 +796,12 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
>  	else
>  		newprot = vm_get_page_prot(dst_vma->vm_flags);
>  
> +	cp_flags = enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE;
> +	if (uffd_flags & (UFFD_FLAGS_ACCESS_LIKELY|UFFD_FLAGS_WRITE_LIKELY))
> +		cp_flags |= MM_CP_WILL_NEED;
> +
>  	tlb_gather_mmu(&tlb, dst_mm);
> -	change_protection(&tlb, dst_vma, start, start + len, newprot,
> -			  enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE);
> +	change_protection(&tlb, dst_vma, start, start + len, newprot, cp_flags);
>  	tlb_finish_mmu(&tlb);
>  
>  	err = 0;
David Hildenbrand July 20, 2022, 9:39 a.m. UTC | #2
On 19.07.22 22:47, Peter Xu wrote:
> On Mon, Jul 18, 2022 at 05:01:59AM -0700, Nadav Amit wrote:
>> [...]
>> @@ -187,8 +191,11 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
>>  			 */
>>  			if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
>>  			    !pte_write(ptent) &&
>> -			    can_change_pte_writable(vma, addr, ptent))
>> +			    can_change_pte_writable(vma, addr, ptent)) {
>>  				ptent = pte_mkwrite(ptent);
>> +				if (will_need)
>> +					ptent = pte_mkdirty(ptent);
> 
> Can we make this unconditional?  IOW to cover both:
> 
>   (1) When will_need is not set, or
>   (2) mprotect() too
> 
> David's patch is good in that we merged the unprotect and CoW.  However
> that's not complete because the dirty bit ops are missing.
> 
> Here IMHO we should have a standalone patch to just add the dirty bit into
> this logic when we'll grant write bit.  IMHO it'll make the write+dirty
> bits coherent again in all paths.

I'm not sure I follow.

We *surely* don't want to dirty random pages (especially once in the
pagecache/swapcache) simply because we change protection.

Just like we don't set all pages write+dirty in a writable VMA on a read
fault.
David Hildenbrand July 20, 2022, 9:42 a.m. UTC | #3
On 18.07.22 14:01, Nadav Amit wrote:
> [...]
> @@ -187,8 +191,11 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
>  			 */
>  			if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
>  			    !pte_write(ptent) &&


Why would we want to check if we can set something writable if it
already *is* writable? That doesn't make sense to me.

> -			    can_change_pte_writable(vma, addr, ptent))
> +			    can_change_pte_writable(vma, addr, ptent)) {
>  				ptent = pte_mkwrite(ptent);
> +				if (will_need)
> +					ptent = pte_mkdirty(ptent);
> +			}
Peter Xu July 20, 2022, 1:10 p.m. UTC | #4
On Wed, Jul 20, 2022 at 11:39:23AM +0200, David Hildenbrand wrote:
> On 19.07.22 22:47, Peter Xu wrote:
> > On Mon, Jul 18, 2022 at 05:01:59AM -0700, Nadav Amit wrote:
> >> [...]
> >> @@ -187,8 +191,11 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
> >>  			 */
> >>  			if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
> >>  			    !pte_write(ptent) &&
> >> -			    can_change_pte_writable(vma, addr, ptent))
> >> +			    can_change_pte_writable(vma, addr, ptent)) {
> >>  				ptent = pte_mkwrite(ptent);
> >> +				if (will_need)
> >> +					ptent = pte_mkdirty(ptent);
> > 
> > Can we make this unconditional?  IOW to cover both:
> > 
> >   (1) When will_need is not set, or
> >   (2) mprotect() too
> > 
> > David's patch is good in that we merged the unprotect and CoW.  However
> > that's not complete because the dirty bit ops are missing.
> > 
> > Here IMHO we should have a standalone patch to just add the dirty bit into
> > this logic when we grant the write bit.  IMHO it'll make the write+dirty
> > bits coherent again in all paths.
> 
> I'm not sure I follow.
> 
> We *surely* don't want to dirty random pages (especially once in the
> pagecache/swapcache) simply because we change protection.
> 
> Just like we don't set all pages write+dirty in a writable VMA on a read
> fault.

IMO unprotect (in generic mprotect form or uffd form) is a stronger sign
of the page being written, unlike read faults, as many of them happen
because the page is being written and a message was generated.

But yeah you have a point too, maybe we shouldn't assume such a condition.
Especially as long as we won't set write=1 without soft-dirty tracking
enabled, I think it should be safe.
David Hildenbrand July 20, 2022, 3:10 p.m. UTC | #5
On 20.07.22 15:10, Peter Xu wrote:
> On Wed, Jul 20, 2022 at 11:39:23AM +0200, David Hildenbrand wrote:
>> [...]
>> I'm not sure I follow.
>>
>> We *surely* don't want to dirty random pages (especially once in the
>> pagecache/swapcache) simply because we change protection.
>>
>> Just like we don't set all pages write+dirty in a writable VMA on a read
>> fault.
> 
> IMO unprotect (in generic mprotect form or uffd form) is a stronger sign
> of the page being written, unlike read faults, as many of them happen
> because the page is being written and a message was generated.

I'm sorry, but I am very skeptical about such statements. I don't buy it.

> 
> But yeah you have a point too, maybe we shouldn't assume such a condition.
> Especially as long as we won't set write=1 without soft-dirty tracking
> enabled, I think it should be safe.

For pagecache pages it may as well be *plain wrong* to bypass the write
fault handler and simply mark pages dirty+map them writable.

Please, let's keep this protection change handler here as simple as
possible and *not* try to replicate the whole write fault handler logic
in here by relying on statements as above.

If we try optimizing for corner cases by making the implementation here
overly complicated, then we are clearly doing something wrong.
Nadav Amit July 20, 2022, 5:36 p.m. UTC | #6
On Jul 20, 2022, at 2:42 AM, David Hildenbrand <david@redhat.com> wrote:

> On 18.07.22 14:01, Nadav Amit wrote:
>> [...]
>> @@ -187,8 +191,11 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
>> 			 */
>> 			if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
>> 			    !pte_write(ptent) &&
> 
> 
> Why would we want to check if we can set something writable if it
> already *is* writable? That doesn't make sense to me.

We check !pte_write(). What am I missing in your question?

Having said that, I do notice now that pte_mkdirty() should not be done
only when this condition is fulfilled. Instead we should just have
something like:

                       if (will_need) {
                               ptent = pte_mkyoung(ptent);
                               if (pte_write(ptent))
                                       ptent = pte_mkdirty(ptent);
                       }

But I do not think this answers your question, which I did not understand.
David Hildenbrand July 20, 2022, 6 p.m. UTC | #7
On 20.07.22 19:36, Nadav Amit wrote:
> On Jul 20, 2022, at 2:42 AM, David Hildenbrand <david@redhat.com> wrote:
>> [...]
>> Why would we want to check if we can set something writable if it
>> already *is* writable? That doesn't make sense to me.
> 
> We check !pte_write(). What am I missing in your question?

My patch review skills have seen better days. I thought you'd be
removing the pte_write() check ... :( Tired eyes ...

> 
> Having said that, I do notice now that pte_mkdirty() should not be done
> only when this condition is fulfilled. Instead we should just have
> something like:
> 
>                        if (will_need) {
>                                ptent = pte_mkyoung(ptent);
>                                if (pte_write(ptent))
>                                        ptent = pte_mkdirty(ptent);
>                        }

As can_change_pte_writable() will fail if it stumbles over a !pte_dirty
page in current code ... so I assume you would have that code before the
actual pte_mkwrite() logic, correct?
Nadav Amit July 20, 2022, 6:09 p.m. UTC | #8
On Jul 20, 2022, at 11:00 AM, David Hildenbrand <david@redhat.com> wrote:

> My patch review skills have seen better days. I thought you'd be
> removing the pte_write() check ... :( Tired eyes ...
> 
>> Having said that, I do notice now that pte_mkdirty() should not be done
>> only when this condition is fulfilled. Instead we should just have
>> something like:
>> 
>> if (will_need) {
>> 	ptent = pte_mkyoung(ptent);
>> 	if (pte_write(ptent))
>> 		ptent = pte_mkdirty(ptent);
>> }
> 
> As can_change_pte_writable() will fail if it stumbles over a !pte_dirty
> page in current code ... so I assume you would have that code before the
> actual pte_mkwrite() logic, correct?

No, I thought this should go after for 2 reasons:

1. You want to allow the PTE to be made writable following the
can_change_pte_writable().

2. You do not want to set a non-writable PTE as dirty, especially since it
might then be used to determine that the PTE can become writable. Doing so
would circumvent cases in which set_page_dirty() needs to be called and
break things.
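
Taken together with the snippet above, the resulting order would roughly
be (an untested sketch):

			if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
			    !pte_write(ptent) &&
			    can_change_pte_writable(vma, addr, ptent))
				ptent = pte_mkwrite(ptent);

			/* after the write-bit decision: dirty only writable PTEs */
			if (will_need) {
				ptent = pte_mkyoung(ptent);
				if (pte_write(ptent))
					ptent = pte_mkdirty(ptent);
			}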
David Hildenbrand July 20, 2022, 6:11 p.m. UTC | #9
On 20.07.22 20:09, Nadav Amit wrote:
> On Jul 20, 2022, at 11:00 AM, David Hildenbrand <david@redhat.com> wrote:
> 
>> My patch review skills have seen better days. I thought you'd be
>> removing the pte_write() check ... :( Tired eyes ...
>>
>>> Having said that, I do notice now that pte_mkdirty() should not be done
>>> only when this condition is fulfilled. Instead we should just have
>>> something like:
>>>
>>> if (will_need) {
>>> 	ptent = pte_mkyoung(ptent);
>>> 	if (pte_write(ptent))
>>> 		ptent = pte_mkdirty(ptent);
>>> }
>>
>> As can_change_pte_writable() will fail if it stumbles over a !pte_dirty
>> page in current code ... so I assume you would have that code before the
>> actual pte_mkwrite() logic, correct?
> 
> No, I thought this should go after for 2 reasons:
> 
> 1. You want to allow the PTE to be made writable following the
> can_change_pte_writable().
> 
> 2. You do not want to set a non-writable PTE as dirty, especially since it
> might then be used to determine that the PTE can become writable. Doing so
> would circumvent cases in which set_page_dirty() needs to be called and
> break things.

Then I'm confused how can_change_pte_writable() would ever allow for
that. Best to show me the code :)
Peter Xu July 20, 2022, 7:15 p.m. UTC | #10
On Wed, Jul 20, 2022 at 05:10:37PM +0200, David Hildenbrand wrote:
> For pagecache pages it may as well be *plain wrong* to bypass the write
> fault handler and simply mark pages dirty+map them writable.

Could you elaborate?
David Hildenbrand July 20, 2022, 7:33 p.m. UTC | #11
On 20.07.22 21:15, Peter Xu wrote:
> On Wed, Jul 20, 2022 at 05:10:37PM +0200, David Hildenbrand wrote:
>> For pagecache pages it may as well be *plain wrong* to bypass the write
>> fault handler and simply mark pages dirty+map them writable.
> 
> Could you elaborate?

Write-fault handling for some filesystems (that even require this
"slow path") is a bit special.

For example, do_shared_fault() might have to call page_mkwrite().

AFAIK file systems use that for lazy allocation of disk blocks.
If you simply go ahead and map a !dirty pagecache page writable
and mark it dirty, it will not trigger page_mkwrite() and you might
end up corrupting data.

That's why the old change_pte_range() code never touched
anything if the pte wasn't already dirty. Because as long as it's not
writable, the FS might have to be informed about the write-unprotect.

And we end up in the case here for VM_SHARED with vma_wants_writenotify().
Where we, for example, check

/* The backer wishes to know when pages are first written to? */
if (vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite))
	return 1;


Long story short, we should be really careful with write-fault handler
bypasses, in particular when deciding to set dirty bits. For pagecache
pages, we have to be especially careful.

For exclusive anon pages it's mostly ok, because wp_page_reuse()
doesn't really contain that much magic. The only thing to consider for ordinary
mprotect() is that there is -- IMHO -- absolutely no guarantee that someone will
write to a specific write-unprotected page soon. For uffd-wp-unprotect it might be
easier to guess, especially if we un-protect only a single page.
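
As an illustration, the kind of hook such a bypass would skip
(hypothetical code; myfs_reserve_blocks() is made up):

static int myfs_reserve_blocks(struct folio *folio);

static vm_fault_t myfs_page_mkwrite(struct vm_fault *vmf)
{
	struct folio *folio = page_folio(vmf->page);

	folio_lock(folio);
	/* reserve disk blocks now so that later writeback cannot fail */
	if (myfs_reserve_blocks(folio)) {
		folio_unlock(folio);
		return VM_FAULT_SIGBUS;
	}
	/* page stays locked; the caller dirties it and maps it writable */
	return VM_FAULT_LOCKED;
}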
Peter Xu July 20, 2022, 7:48 p.m. UTC | #12
On Wed, Jul 20, 2022 at 09:33:35PM +0200, David Hildenbrand wrote:
> On 20.07.22 21:15, Peter Xu wrote:
> > On Wed, Jul 20, 2022 at 05:10:37PM +0200, David Hildenbrand wrote:
> >> For pagecache pages it may as well be *plain wrong* to bypass the write
> >> fault handler and simply mark pages dirty+map them writable.
> > 
> > Could you elaborate?
> 
> Write-fault handling for some filesystems (that even require this
> "slow path") is a bit special.
> 
> For example, do_shared_fault() might have to call page_mkwrite().
> 
> AFAIK file systems use that for lazy allocation of disk blocks.
> If you simply go ahead and map a !dirty pagecache page writable
> and mark it dirty, it will not trigger page_mkwrite() and you might
> end up corrupting data.
> 
> That's why the old change_pte_range() code never touched
> anything if the pte wasn't already dirty.

I don't think that pte_dirty() check was for the pagecache code. For any fs
that has page_mkwrite() defined, it'll already have vma_wants_writenotify()
return 1, so we'll never try to add write bit, hence we'll never even try
to check pte_dirty().

> Because as long as it's not writable,
> the FS might have to be informed about the write-unprotect.
> 
> And we end up in the case here for VM_SHARED with vma_wants_writenotify().
> Where we, for example, check
> 
> /* The backer wishes to know when pages are first written to? */
> if (vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite))
> 	return 1;
> 
> 
> Long story short, we should be really careful with write-fault handler bypasses,
> especially when deciding to set dirty bits. For pagecache pages, we have to be
> especially careful.

Since you mentioned page_mkwrite, IMHO it's really the write bit, not the
dirty bit, that matters here (and IMHO that's why it's called page_mkwrite(),
not page_mkdirty()).  Here Nadav's patch adds pte_mkdirty() only if
pte_mkwrite() happens.  So I'm a bit confused about what your worry is, and
what you're against doing.

Say, even with my original proposal to set dirty unconditionally, it would
still be after the pte_mkwrite().  I don't see how that could affect
page_mkwrite(), not to mention that it wouldn't even reach there.

> 
> For exclusive anon pages it's mostly ok, because wp_page_reuse()
> doesn't really contain that much magic. The only thing to consider for ordinary
> mprotect() is that there is -- IMHO -- absolutely no guarantee that someone will
> write to a specific write-unprotected page soon. For uffd-wp-unprotect it might be
> easier to guess, especially, if we un-protect only a single page.

Yeah, as mentioned, I think that's a valid point - looks good to me to attach
the dirty bit only when the hint is given.

Thanks,
David Hildenbrand July 20, 2022, 7:55 p.m. UTC | #13
On 20.07.22 21:48, Peter Xu wrote:
> On Wed, Jul 20, 2022 at 09:33:35PM +0200, David Hildenbrand wrote:
>> On 20.07.22 21:15, Peter Xu wrote:
>>> On Wed, Jul 20, 2022 at 05:10:37PM +0200, David Hildenbrand wrote:
>>>> For pagecache pages it may as well be *plain wrong* to bypass the write
>>>> fault handler and simply mark pages dirty+map them writable.
>>>
>>> Could you elaborate?
>>
>> Write-fault handling for some filesystems (that even require this
>> "slow path") is a bit special.
>>
>> For example, do_shared_fault() might have to call page_mkwrite().
>>
>> AFAIK file systems use that for lazy allocation of disk blocks.
>> If you simply go ahead and map a !dirty pagecache page writable
>> and mark it dirty, it will not trigger page_mkwrite() and you might
>> end up corrupting data.
>>
>> That's why the old change_pte_range() code never touched
>> anything if the pte wasn't already dirty.
> 
> I don't think that pte_dirty() check was for the pagecache code. For any fs
> that has page_mkwrite() defined, it'll already have vma_wants_writenotify()
> return 1, so we'll never try to add write bit, hence we'll never even try
> to check pte_dirty().
> 

I might be too tired, but the whole reason we had this magic in place
before my commit was only for the pagecache.

With vma_wants_writenotify()=0 you can directly map the pages writable
and don't have to do these advanced checks here. In a writable
MAP_SHARED VMA you'll already have pte_write().

We only get !pte_write() in case we have vma_wants_writenotify()=1 ...

  try_change_writable = vma_wants_writenotify(vma, vma->vm_page_prot);

and that's the code that checked the dirty bit after all to decide --
amongst other things -- if we can simply map it writable without going
via the write fault handler and triggering do_shared_fault() .

See crazy/ugly FOLL_FORCE code in GUP that similarly checks the dirty bit.

But yeah, it's all confusing so I might just be wrong regarding
pagecache pages.
Nadav Amit July 20, 2022, 8:22 p.m. UTC | #14
On Jul 20, 2022, at 12:55 PM, David Hildenbrand <david@redhat.com> wrote:

> [...]
> I might be too tired, but the whole reason we had this magic in place
> before my commit was only for the pagecache.
> 
> With vma_wants_writenotify()=0 you can directly map the pages writable
> and don't have to do these advanced checks here. In a writable
> MAP_SHARED VMA you'll already have pte_write().
> 
> We only get !pte_write() in case we have vma_wants_writenotify()=1 ...
> 
>  try_change_writable = vma_wants_writenotify(vma, vma->vm_page_prot);
> 
> and that's the code that checked the dirty bit after all to decide --
> amongst other things -- if we can simply map it writable without going
> via the write fault handler and triggering do_shared_fault() .
> 
> See crazy/ugly FOLL_FORCE code in GUP that similarly checks the dirty bit.

I thought you want to get rid of it at least for anonymous pages. No?

> 
> But yeah, it's all confusing so I might just be wrong regarding
> pagecache pages.

Just to note: I am not very courageous and I did not intend to change the
condition for when non-anonymous pages are set as writable. That’s the
reason I did not change the dirty bit for non-writable non-anonymous entries
(as Peter said). And that’s the reason that setting the dirty bit (at least
as I should have done it) is only performed after we made the decision on
the write-bit.

IOW, after you made your decision about the write-bit, then and only then
you may be able to set the dirty bit for writable entries. Since the entry
is already writeable (i.e., can be written without a fault later directly
from userspace), there should be no concern of correctness when you set it.
David Hildenbrand July 20, 2022, 8:38 p.m. UTC | #15
On 20.07.22 22:22, Nadav Amit wrote:
> On Jul 20, 2022, at 12:55 PM, David Hildenbrand <david@redhat.com> wrote:
>> [...]
>> See crazy/ugly FOLL_FORCE code in GUP that similarly checks the dirty bit.
> 
> I thought you want to get rid of it at least for anonymous pages. No?

Yes. Especially for any MAP_PRIVATE mappings.

If you want to write to something that's not mapped writable in a
MAP_PRIVATE mapping it
a) Has to be an exclusive anonymous page
b) The pte has to be dirty

In any other case, you clearly missed a COW, or the modifications might
get lost if the PTE is not dirty.

MAP_SHARED is a bit more involved. But whether the pte is dirty might be
good enough ... but this needs a lot more care.
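
In check form, the rule above for MAP_PRIVATE would roughly be (a sketch,
assuming the PageAnonExclusive() infrastructure):

static bool can_force_write(struct vm_area_struct *vma, pte_t pte,
			    struct page *page)
{
	if (vma->vm_flags & VM_SHARED)
		return false;	/* MAP_SHARED needs a lot more care */
	return page && PageAnonExclusive(page) &&	/* a) */
	       pte_dirty(pte);				/* b) */
}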

> 
>>
>> But yeah, it's all confusing so I might just be wrong regarding
>> pagecache pages.
> 
> Just to note: I am not very courageous and I did not intend to change the
> condition for when non-anonymous pages are set as writable. That’s the
> reason I did not change the dirty bit for non-writable non-anonymous entries
> (as Peter said). And that’s the reason that setting the dirty bit (at least
> as I should have done it) is only performed after we made the decision on
> the write-bit.

Good. As long as we stick to anonymous pages I roughly know what we
can and cannot do at this point :)


The problem I see is that detecting whether we can write requires the
dirty bit ... and whether to set the dirty bit requires knowing
whether we can write.

Again, for anonymous pages we should be able to relax the "dirty"
requirement when detecting whether we can write.

> 
> IOW, after you made your decision about the write-bit, then and only then
> you may be able to set the dirty bit for writable entries. Since the entry
> is already writeable (i.e., can be written without a fault later directly
> from userspace), there should be no concern of correctness when you set it.

That makes sense to me. What keeps confusing me are architectures with
and without a hw-managed dirty bit ... :)
Nadav Amit July 20, 2022, 8:56 p.m. UTC | #16
On Jul 20, 2022, at 1:38 PM, David Hildenbrand <david@redhat.com> wrote:

> On 20.07.22 22:22, Nadav Amit wrote:
>> [...]
>> I thought you want to get rid of it at least for anonymous pages. No?
> 
> Yes. Especially for any MAP_PRIVATE mappings.
> 
> If you want to write to something that's not mapped writable in a
> MAP_PRIVATE mapping it
> a) Has to be an exclusive anonymous page
> b) The pte has to be dirty

Do you need both conditions to be true? I thought (a) is sufficient (if
the soft-dirty and related checks succeed).

> 
> In any other case, you clearly missed a COW, or the modifications might
> get lost if the PTE is not dirty.
> 
> MAP_SHARED is a bit more involved. But whether the pte is dirty might be
> good enough ... but this needs a lot more care.
> 
>>> But yeah, it's all confusing so I might just be wrong regarding
>>> pagecache pages.
>> 
>> Just to note: I am not very courageous and I did not intend to change the
>> condition for when non-anonymous pages are set as writable. That’s the
>> reason I did not change the dirty bit for non-writable non-anonymous entries
>> (as Peter said). And that’s the reason that setting the dirty bit (at least
>> as I should have done it) is only performed after we made the decision on
>> the write-bit.
> 
> Good. As long as we stick to anonymous pages I roughly know what we
> can and cannot do at this point :)
> 
> 
> The problem I see is that detecting whether we can write requires the
> dirty bit ... and whether to set the dirty bit requires knowing
> whether we can write.
> 
> Again, for anonymous pages we should be able to relax the "dirty"
> requirement when detecting whether we can write.

That’s all I wanted to do there.

> 
>> IOW, after you made your decision about the write-bit, then and only then
>> you may be able to set the dirty bit for writable entries. Since the entry
>> is already writeable (i.e., can be written without a fault later directly
>> from userspace), there should be no concern of correctness when you set it.
> 
> That makes sense to me. What keeps confusing me are architectures with
> and without a hw-managed dirty bit ... :)

I don’t know which arch you have in mind. But the moment a PTE is
writable, marking it logically/architecturally as dirty should be
fine.

But… if the Exclusive check is not good enough for private+anon without
the “logical” dirty bit, then there would be a problem.
David Hildenbrand July 21, 2022, 7:52 a.m. UTC | #17
>> Yes. Especially for any MAP_PRIVATE mappings.
>>
>> If you want to write to something that's not mapped writable in a
>> MAP_PRIVATE mapping it
>> a) Has to be an exclusive anonymous page
>> b) The pte has to be dirty
> 
> Do you need both conditions to be true? I thought (a) is sufficient (if
> the soft-dirty and related checks succeed).

If we force-write to a page, we need it to be dirty to tell the reclaim
code that the content is stale. We can either mark the pte dirty manually, or
just let the write fault handler deal with it to simplify GUP code. This
needs some more thought, but that's my understanding.

> 
>> [...]
>> That makes sense to me. What keeps confusing me are architectures with
>> and without a hw-managed dirty bit ... :)
> 
> I don’t know which arch you have in mind. But the moment a PTE is
> writable, marking it logically/architecturally as dirty should be
> fine.
> 
> But… if the Exclusive check is not good enough for private+anon without
> the “logical” dirty bit, then there would be a problem. 

I think we are good for anonymous pages.
David Hildenbrand July 21, 2022, 2:10 p.m. UTC | #18
On 21.07.22 09:52, David Hildenbrand wrote:
>>> Yes. Especially for any MAP_PRIVATE mappings.
>>>
>>> If you want to write to something that's not mapped writable in a
>>> MAP_PRIVATE mapping it
>>> a) Has to be an exclusive anonymous page
>>> b) The pte has to be dirty
>>
>> Do you need both conditions to be true? I thought (a) is sufficient (if
>> the soft-dirty and related checks succeed).
> 
> If we force-write to a page, we need it to be dirty to tell the reclaim
> code that the content is stale. We can either mark the pte dirty manually, or
> just let the write fault handler deal with it to simplify GUP code. This
> needs some more thought, but that's my understanding.

Extending my previous answer after staring at the code:

a) I have to dig if the FOLL_FORCE special-retry-handling is required
for MAP_SHARED at all.

check_vma_flags() allows FOLL_FORCE only on MAP_PRIVATE VMAs that lack
VM_WRITE.

Consequently, I would have assumed that the first write fault should be
sufficient on a MAP_SHARED VMA to actually map the PTE writable and not
require any of that special retry magic.


b) I wonder if we have to take care of uffd-wp and softdirty (just like
in mprotect code here) as well in case we stumble over an exclusive
anonymous page. Yes, the VMA is currently not writable, but I'd have
expected at least softdirty tracking to apply.

... I'll dig into the details.
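
For reference, the gate in check_vma_flags() looks roughly like this
(paraphrased, not a verbatim copy of mm/gup.c):

	if (write && !(vm_flags & VM_WRITE)) {
		if (!(gup_flags & FOLL_FORCE))
			return -EFAULT;
		/* FOLL_FORCE writes are only honored on COW mappings */
		if (!is_cow_mapping(vm_flags))
			return -EFAULT;
	}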

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9cc02a7e503b..4afd75ce5875 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1988,6 +1988,8 @@  extern unsigned long move_page_tables(struct vm_area_struct *vma,
 /* Whether this change is for write protecting */
 #define  MM_CP_UFFD_WP                     (1UL << 2) /* do wp */
 #define  MM_CP_UFFD_WP_RESOLVE             (1UL << 3) /* Resolve wp */
+/* Whether to try to mark entries as dirty as they are to be written */
+#define  MM_CP_WILL_NEED		   (1UL << 4)
 #define  MM_CP_UFFD_WP_ALL                 (MM_CP_UFFD_WP | \
 					    MM_CP_UFFD_WP_RESOLVE)
 
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 996a97e213ad..34c2dfb68c42 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -82,6 +82,7 @@  static unsigned long change_pte_range(struct mmu_gather *tlb,
 	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
 	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
 	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+	bool will_need = cp_flags & MM_CP_WILL_NEED;
 
 	tlb_change_page_size(tlb, PAGE_SIZE);
 
@@ -172,6 +173,9 @@  static unsigned long change_pte_range(struct mmu_gather *tlb,
 				ptent = pte_clear_uffd_wp(ptent);
 			}
 
+			if (will_need)
+				ptent = pte_mkyoung(ptent);
+
 			/*
 			 * In some writable, shared mappings, we might want
 			 * to catch actual write access -- see
@@ -187,8 +191,11 @@  static unsigned long change_pte_range(struct mmu_gather *tlb,
 			 */
 			if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
 			    !pte_write(ptent) &&
-			    can_change_pte_writable(vma, addr, ptent))
+			    can_change_pte_writable(vma, addr, ptent)) {
 				ptent = pte_mkwrite(ptent);
+				if (will_need)
+					ptent = pte_mkdirty(ptent);
+			}
 
 			ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
 			if (pte_needs_flush(oldpte, ptent))
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 954c6980b29f..e0492f5f06a0 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -749,6 +749,7 @@  int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
 	bool enable_wp = uffd_flags & UFFD_FLAGS_WP;
 	struct vm_area_struct *dst_vma;
 	unsigned long page_mask;
+	unsigned long cp_flags;
 	struct mmu_gather tlb;
 	pgprot_t newprot;
 	int err;
@@ -795,9 +796,12 @@  int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
 	else
 		newprot = vm_get_page_prot(dst_vma->vm_flags);
 
+	cp_flags = enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE;
+	if (uffd_flags & (UFFD_FLAGS_ACCESS_LIKELY|UFFD_FLAGS_WRITE_LIKELY))
+		cp_flags |= MM_CP_WILL_NEED;
+
 	tlb_gather_mmu(&tlb, dst_mm);
-	change_protection(&tlb, dst_vma, start, start + len, newprot,
-			  enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE);
+	change_protection(&tlb, dst_vma, start, start + len, newprot, cp_flags);
 	tlb_finish_mmu(&tlb);
 
 	err = 0;