
[v3,14/28] userfaultfd: wp: handle COW properly for uffd-wp

Message ID 20190320020642.4000-15-peterx@redhat.com (mailing list archive)
State New, archived
Series userfaultfd: write protection support

Commit Message

Peter Xu March 20, 2019, 2:06 a.m. UTC
This allows uffd-wp to support pages that are also write-protected for COW.

For example, an uffd write-protected PTE could also be write-protected
by another mechanism, like COW or the zero page.  When that happens, we
can't simply set the write bit in the PTE, since that would change the
content seen by every other reference to the page.  Instead, we should
do the COW first if necessary, and then handle the uffd-wp fault.

To correctly copy the page, we'll also need to carry over the
_PAGE_UFFD_WP bit if it was set in the original PTE.

For huge PMDs, we simply always split the huge PMD wherever we want to
resolve an uffd-wp page fault.  That matches what we do for general huge
PMD write protection.  In that way, we reduce the huge PMD copy-on-write
problem to the PTE copy-on-write case.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 mm/memory.c   |  5 +++-
 mm/mprotect.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 65 insertions(+), 4 deletions(-)
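
Context for readers: the resolve path that this patch handles is driven
from userspace through the UFFDIO_WRITEPROTECT ioctl introduced earlier
in this series.  A minimal, illustrative sketch of that flow (assuming
the range was already registered with UFFDIO_REGISTER_MODE_WP; error
handling elided, uapi details as proposed by the series):

#include <linux/userfaultfd.h>
#include <stddef.h>
#include <sys/ioctl.h>

static int wp_then_resolve(int uffd, void *addr, size_t len)
{
	struct uffdio_writeprotect wp;

	/* 1) Arm write protection: sets _PAGE_UFFD_WP on each PTE. */
	wp.range.start = (unsigned long)addr;
	wp.range.len = len;
	wp.mode = UFFDIO_WRITEPROTECT_MODE_WP;
	if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp))
		return -1;

	/*
	 * A write into the range now delivers a VM_UFFD_WP message to
	 * the monitor thread.  Once the monitor has done its work, it
	 * resolves the fault:
	 */

	/*
	 * 2) Resolve: clear the protection.  This drives
	 * change_pte_range() with MM_CP_UFFD_WP_RESOLVE; if the page is
	 * also write-protected for COW (shared or zero page), the
	 * kernel must copy it first -- which is what this patch adds.
	 */
	wp.mode = 0;
	return ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
}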

Comments

Jerome Glisse April 18, 2019, 8:51 p.m. UTC | #1
On Wed, Mar 20, 2019 at 10:06:28AM +0800, Peter Xu wrote:
> This allows uffd-wp to support pages that are also write-protected
> for COW.
> 
> [...]

This one has a bug, see below.


> ---
>  mm/memory.c   |  5 +++-
>  mm/mprotect.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++---
>  2 files changed, 65 insertions(+), 4 deletions(-)
> 
> diff --git a/mm/memory.c b/mm/memory.c
> index e7a4b9650225..b8a4c0bab461 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2291,7 +2291,10 @@ vm_fault_t wp_page_copy(struct vm_fault *vmf)
>  		}
>  		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
>  		entry = mk_pte(new_page, vma->vm_page_prot);
> -		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> +		if (pte_uffd_wp(vmf->orig_pte))
> +			entry = pte_mkuffd_wp(entry);
> +		else
> +			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>  		/*
>  		 * Clear the pte entry and flush it first, before updating the
>  		 * pte with the new entry. This will avoid a race condition
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index 9d4433044c21..855dddb07ff2 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -73,18 +73,18 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
>  	flush_tlb_batched_pending(vma->vm_mm);
>  	arch_enter_lazy_mmu_mode();
>  	do {
> +retry_pte:
>  		oldpte = *pte;
>  		if (pte_present(oldpte)) {
>  			pte_t ptent;
>  			bool preserve_write = prot_numa && pte_write(oldpte);
> +			struct page *page;
>  
>  			/*
>  			 * Avoid trapping faults against the zero or KSM
>  			 * pages. See similar comment in change_huge_pmd.
>  			 */
>  			if (prot_numa) {
> -				struct page *page;
> -
>  				page = vm_normal_page(vma, addr, oldpte);
>  				if (!page || PageKsm(page))
>  					continue;
> @@ -114,6 +114,54 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
>  					continue;
>  			}
>  
> +			/*
> +			 * Detect whether we'll need to COW before
> +			 * resolving an uffd-wp fault.  Note that this
> +			 * includes detection of the zero page (where
> +			 * page==NULL)
> +			 */
> +			if (uffd_wp_resolve) {
> +				/* If the fault is resolved already, skip */
> +				if (!pte_uffd_wp(*pte))
> +					continue;
> +				page = vm_normal_page(vma, addr, oldpte);
> +				if (!page || page_mapcount(page) > 1) {
> +					struct vm_fault vmf = {
> +						.vma = vma,
> +						.address = addr & PAGE_MASK,
> +						.page = page,
> +						.orig_pte = oldpte,
> +						.pmd = pmd,
> +						/* pte and ptl not needed */
> +					};
> +					vm_fault_t ret;
> +
> +					if (page)
> +						get_page(page);
> +					arch_leave_lazy_mmu_mode();
> +					pte_unmap_unlock(pte, ptl);
> +					ret = wp_page_copy(&vmf);
> +					/* PTE is changed, or OOM */
> +					if (ret == 0)
> +						/* It's done by others */
> +						continue;

This is wrong: if ret == 0 you still need to remap the pte before
continuing, as otherwise you will move on to the next pte without
holding the page table lock for the directory. So the ret == 0 case
must be handled after arch_enter_lazy_mmu_mode() below.

Sorry, I should have caught that in the previous review.


> +					else if (WARN_ON(ret != VM_FAULT_WRITE))
> +						return pages;
> +					pte = pte_offset_map_lock(vma->vm_mm,
> +								  pmd, addr,
> +								  &ptl);
> +					arch_enter_lazy_mmu_mode();
> +					if (!pte_present(*pte))
> +						/*
> +						 * This PTE could have been
> +						 * modified after COW
> +						 * before we have taken the
> +						 * lock; retry this PTE
> +						 */
> +						goto retry_pte;
> +				}
> +			}
> +
>  			ptent = ptep_modify_prot_start(mm, addr, pte);
>  			ptent = pte_modify(ptent, newprot);
>  			if (preserve_write)

> @@ -183,6 +231,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
>  	unsigned long pages = 0;
>  	unsigned long nr_huge_updates = 0;
>  	struct mmu_notifier_range range;
> +	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
>  
>  	range.start = 0;
>  
> @@ -202,7 +251,16 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
>  		}
>  
>  		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
> -			if (next - addr != HPAGE_PMD_SIZE) {
> +			/*
> +			 * When resolving an userfaultfd write
> +			 * protection fault, it's not easy to identify
> +			 * whether a THP is shared with others and
> +			 * whether we'll need to do copy-on-write, so
> +			 * just split it always for now to simply the
> +			 * procedure.  And that's the policy too for
> +			 * general THP write-protect in af9e4d5f2de2.
> +			 */
> +			if (next - addr != HPAGE_PMD_SIZE || uffd_wp_resolve) {

Just a nitpick: can you please add parentheses around next - addr, i.e.:
if ((next - addr) != HPAGE_PMD_SIZE || uffd_wp_resolve) {

I know it is not needed (the relational != binds more loosely than the
arithmetic -), but each time I bump into this I have to scratch my head
for a second to remember the operator precedence rules :)

>  				__split_huge_pmd(vma, pmd, addr, false, NULL);
>  			} else {
>  				int nr_ptes = change_huge_pmd(vma, pmd, addr,
> -- 
> 2.17.1
>
Peter Xu April 19, 2019, 6:26 a.m. UTC | #2
On Thu, Apr 18, 2019 at 04:51:15PM -0400, Jerome Glisse wrote:
> On Wed, Mar 20, 2019 at 10:06:28AM +0800, Peter Xu wrote:
> > This allows uffd-wp to support pages that are also write-protected
> > for COW.
> > 
> > [...]
> 
> This one has a bug, see below.
> 
> [...]
> 
> > +			/*
> > +			 * Detect whether we'll need to COW before
> > +			 * resolving an uffd-wp fault.  Note that this
> > +			 * includes detection of the zero page (where
> > +			 * page==NULL)
> > +			 */
> > +			if (uffd_wp_resolve) {
> > +				/* If the fault is resolved already, skip */
> > +				if (!pte_uffd_wp(*pte))
> > +					continue;
> > +				page = vm_normal_page(vma, addr, oldpte);
> > +				if (!page || page_mapcount(page) > 1) {
> > +					struct vm_fault vmf = {
> > +						.vma = vma,
> > +						.address = addr & PAGE_MASK,
> > +						.page = page,
> > +						.orig_pte = oldpte,
> > +						.pmd = pmd,
> > +						/* pte and ptl not needed */
> > +					};
> > +					vm_fault_t ret;
> > +
> > +					if (page)
> > +						get_page(page);
> > +					arch_leave_lazy_mmu_mode();
> > +					pte_unmap_unlock(pte, ptl);
> > +					ret = wp_page_copy(&vmf);
> > +					/* PTE is changed, or OOM */
> > +					if (ret == 0)
> > +						/* It's done by others */
> > +						continue;
> 
> This is wrong: if ret == 0 you still need to remap the pte before
> continuing, as otherwise you will move on to the next pte without
> holding the page table lock for the directory. So the ret == 0 case
> must be handled after arch_enter_lazy_mmu_mode() below.
> 
> Sorry, I should have caught that in the previous review.

My fault for not noticing it from the very beginning... thanks for
spotting that.

I'm squashing the below changes into the patch:

diff --git a/mm/mprotect.c b/mm/mprotect.c
index 3cddfd6627b8..13d493b836bb 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -141,22 +141,19 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                                        arch_leave_lazy_mmu_mode();
                                        pte_unmap_unlock(pte, ptl);
                                        ret = wp_page_copy(&vmf);
-                                       /* PTE is changed, or OOM */
-                                       if (ret == 0)
-                                               /* It's done by others */
-                                               continue;
-                                       else if (WARN_ON(ret != VM_FAULT_WRITE))
+                                       if (ret != VM_FAULT_WRITE && ret != 0)
+                                               /* Probably OOM */
                                                return pages;
                                        pte = pte_offset_map_lock(vma->vm_mm,
                                                                  pmd, addr,
                                                                  &ptl);
                                        arch_enter_lazy_mmu_mode();
-                                       if (!pte_present(*pte))
+                                       if (ret == 0 || !pte_present(*pte))
                                                /*
                                                 * This PTE could have been
-                                                * modified after COW
-                                                * before we have taken the
-                                                * lock; retry this PTE
+                                                * modified during or after
+                                                * COW before taking the
+                                                * lock; retry.
                                                 */
                                                goto retry_pte;
                                }

[...]

> Just a nitpick: can you please add parentheses around next - addr, i.e.:
> if ((next - addr) != HPAGE_PMD_SIZE || uffd_wp_resolve) {
> 
> I know it is not needed (the relational != binds more loosely than the
> arithmetic -), but each time I bump into this I have to scratch my head
> for a second to remember the operator precedence rules :)

Sure, as usual. :) And I tend to agree it's a good habit; it's just
me who always forgets about it.
Thanks,
Jerome Glisse April 19, 2019, 3:02 p.m. UTC | #3
On Fri, Apr 19, 2019 at 02:26:50PM +0800, Peter Xu wrote:
> [...]
> 
> My fault for not noticing it from the very beginning... thanks for
> spotting that.
> 
> I'm squashing the below changes into the patch:


Well, thinking about this some more, I think you should use
do_wp_page() and not wp_page_copy(); it would avoid a bunch of the
code above, and you are also not properly handling KSM pages or pages
in the swap cache.  Instead of duplicating the code that is in
do_wp_page(), it would be better to call it here.
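
For reference, do_wp_page() as it stands dispatches roughly as sketched
below (a paraphrased outline for context, not verbatim mm/memory.c);
the anon, KSM, swap-cache and shared-mapping branches are exactly what
a bare wp_page_copy() call from change_pte_range() skips:

/*
 * Paraphrased outline of do_wp_page() circa this series; see
 * mm/memory.c for the real code.  Entered with the pte mapped and
 * locked, returns with it unmapped and unlocked.
 */
static vm_fault_t do_wp_page(struct vm_fault *vmf)
	__releases(vmf->ptl)
{
	struct vm_area_struct *vma = vmf->vma;

	vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
	if (!vmf->page) {
		/* Zero page or special PFN mapping: reuse it for a
		 * shared writable mapping, otherwise copy. */
		if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) ==
		    (VM_WRITE | VM_SHARED))
			return wp_pfn_shared(vmf);
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return wp_page_copy(vmf);
	}
	if (PageAnon(vmf->page)) {
		/* Anonymous page: KSM pages and pages still in the
		 * swap cache must not be blindly reused; only an
		 * exclusive page is handed to wp_page_reuse(). */
		/* ... reuse_swap_page() checks elided ... */
	} else if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) ==
		   (VM_WRITE | VM_SHARED)) {
		return wp_page_shared(vmf);
	}
	/* Everything else falls back to copying the page. */
	get_page(vmf->page);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return wp_page_copy(vmf);
}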


> 
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index 3cddfd6627b8..13d493b836bb 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -141,22 +141,19 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
>                                         arch_leave_lazy_mmu_mode();
>                                         pte_unmap_unlock(pte, ptl);
>                                         ret = wp_page_copy(&vmf);
> -                                       /* PTE is changed, or OOM */
> -                                       if (ret == 0)
> -                                               /* It's done by others */
> -                                               continue;
> -                                       else if (WARN_ON(ret != VM_FAULT_WRITE))
> +                                       if (ret != VM_FAULT_WRITE && ret != 0)
> +                                               /* Probably OOM */
>                                                 return pages;
>                                         pte = pte_offset_map_lock(vma->vm_mm,
>                                                                   pmd, addr,
>                                                                   &ptl);
>                                         arch_enter_lazy_mmu_mode();
> -                                       if (!pte_present(*pte))
> +                                       if (ret == 0 || !pte_present(*pte))
>                                                 /*
>                                                  * This PTE could have been
> -                                                * modified after COW
> -                                                * before we have taken the
> -                                                * lock; retry this PTE
> +                                                * modified during or after
> +                                                * COW before taking the
> +                                                * lock; retry.
>                                                  */
>                                                 goto retry_pte;
>                                 }
> 
> [...]
Peter Xu April 22, 2019, 12:20 p.m. UTC | #4
On Fri, Apr 19, 2019 at 11:02:53AM -0400, Jerome Glisse wrote:

[...]

> Well, thinking about this some more, I think you should use
> do_wp_page() and not wp_page_copy(); it would avoid a bunch of the
> code above, and you are also not properly handling KSM pages or pages
> in the swap cache.  Instead of duplicating the code that is in
> do_wp_page(), it would be better to call it here.

Yeah, it makes sense to me.  Then here's my plan:

- I'll need to drop the previous patch "export wp_page_copy" since it
  will no longer be needed

- I'll introduce another patch to split the current do_wp_page() and
  introduce a function "do_wp_page_cont" (better naming suggestions
  are welcome) which contains most of the wp handling that
  change_pte_range() will need in this patch, isolating the uffd
  handling:

static vm_fault_t do_wp_page(struct vm_fault *vmf)
	__releases(vmf->ptl)
{
	struct vm_area_struct *vma = vmf->vma;

	if (userfaultfd_pte_wp(vma, *vmf->pte)) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return handle_userfault(vmf, VM_UFFD_WP);
	}

	return do_wp_page_cont(vmf);
}

Then I can probably use do_wp_page_cont() in this patch.
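
To make it concrete, the call site in change_pte_range() could then
become something like this (hypothetical sketch only, assuming the
do_wp_page_cont() split above and keeping the relock-and-retry fix
from earlier in this thread):

/* Hypothetical sketch; names and details still to be worked out. */
if (uffd_wp_resolve && pte_uffd_wp(oldpte)) {
	struct vm_fault vmf = {
		.vma = vma,
		.address = addr & PAGE_MASK,
		.orig_pte = oldpte,
		.pmd = pmd,
		.pte = pte,
		.ptl = ptl,
	};
	vm_fault_t ret;

	arch_leave_lazy_mmu_mode();
	/* Releases the PTL itself; handles the zero page, KSM and
	 * swap-cache pages the same way a real write fault would. */
	ret = do_wp_page_cont(&vmf);
	if (ret != VM_FAULT_WRITE && ret != 0)
		return pages;	/* probably OOM */
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	arch_enter_lazy_mmu_mode();
	if (ret == 0 || !pte_present(*pte))
		/* PTE changed under us; retry this PTE */
		goto retry_pte;
}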

Thanks,
Jerome Glisse April 22, 2019, 2:54 p.m. UTC | #5
On Mon, Apr 22, 2019 at 08:20:10PM +0800, Peter Xu wrote:
> 
> [...]
> 
> Yeah, it makes sense to me.  Then here's my plan:
> 
> - I'll need to drop the previous patch "export wp_page_copy" since it
>   will no longer be needed
> 
> - I'll introduce another patch to split the current do_wp_page() and
>   introduce a function "do_wp_page_cont" (better naming suggestions
>   are welcome) which contains most of the wp handling that
>   change_pte_range() will need in this patch, isolating the uffd
>   handling:
> 
> static vm_fault_t do_wp_page(struct vm_fault *vmf)
> 	__releases(vmf->ptl)
> {
> 	struct vm_area_struct *vma = vmf->vma;
> 
> 	if (userfaultfd_pte_wp(vma, *vmf->pte)) {
> 		pte_unmap_unlock(vmf->pte, vmf->ptl);
> 		return handle_userfault(vmf, VM_UFFD_WP);
> 	}
> 
> 	return do_wp_page_cont(vmf);
> }
> 
> Then I can probably use do_wp_page_cont() in this patch.

Instead I would keep the do_wp_page name and do:
    static vm_fault_t do_userfaultfd_wp_page(struct vm_fault *vmf) {
        ... // what you have above
        return do_wp_page(vmf);
    }

Naming-wise I think it would be better to keep do_wp_page() as
is.

Cheers,
Jérôme
Peter Xu April 23, 2019, 3 a.m. UTC | #6
On Mon, Apr 22, 2019 at 10:54:02AM -0400, Jerome Glisse wrote:
> 
> [...]
> 
> Instead I would keep the do_wp_page name and do:
>     static vm_fault_t do_userfaultfd_wp_page(struct vm_fault *vmf) {
>         ... // what you have above
>         return do_wp_page(vmf);
>     }
> 
> Naming-wise I think it would be better to keep do_wp_page() as
> is.

In case I misunderstood... what I've proposed is simply:

diff --git a/mm/memory.c b/mm/memory.c
index 64bd8075f054..ab98a1eb4702 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2497,6 +2497,14 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
                return handle_userfault(vmf, VM_UFFD_WP);
        }

+       return do_wp_page_cont(vmf);
+}
+
+vm_fault_t do_wp_page_cont(struct vm_fault *vmf)
+       __releases(vmf->ptl)
+{
+       struct vm_area_struct *vma = vmf->vma;
+
        vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
        if (!vmf->page) {
                /*

And the other proposal is:

diff --git a/mm/memory.c b/mm/memory.c
index 64bd8075f054..a73792127553 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2469,6 +2469,8 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
        return VM_FAULT_WRITE;
 }

+static vm_fault_t do_wp_page(struct vm_fault *vmf);
+
 /*
  * This routine handles present pages, when users try to write
  * to a shared page. It is done by copying the page to a new address
@@ -2487,7 +2489,7 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
  * but allow concurrent faults), with pte both mapped and locked.
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
-static vm_fault_t do_wp_page(struct vm_fault *vmf)
+static vm_fault_t do_userfaultfd_wp_page(struct vm_fault *vmf)
        __releases(vmf->ptl)
 {
        struct vm_area_struct *vma = vmf->vma;
@@ -2497,6 +2499,14 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
                return handle_userfault(vmf, VM_UFFD_WP);
        }

+       return do_wp_page(vmf);
+}
+
+static vm_fault_t do_wp_page(struct vm_fault *vmf)
+       __releases(vmf->ptl)
+{
+       struct vm_area_struct *vma = vmf->vma;
+
        vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
        if (!vmf->page) {
                /*
@@ -2869,7 +2879,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        }

        if (vmf->flags & FAULT_FLAG_WRITE) {
-               ret |= do_wp_page(vmf);
+               ret |= do_userfaultfd_wp_page(vmf);
                if (ret & VM_FAULT_ERROR)
                        ret &= VM_FAULT_ERROR;
                goto out;
@@ -3831,7 +3841,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
                goto unlock;
        if (vmf->flags & FAULT_FLAG_WRITE) {
                if (!pte_write(entry))
-                       return do_wp_page(vmf);
+                       return do_userfaultfd_wp_page(vmf);
                entry = pte_mkdirty(entry);
        }
        entry = pte_mkyoung(entry);

I would prefer the 1st approach: it contains fewer lines of change
because it does not touch the callers, and the naming in the 2nd
approach can be a bit confusing (calling do_userfaultfd_wp_page() in
handle_pte_fault() may make people think of a userfault-only path,
while it actually covers the general path).  But if you really like
the 2nd one I can use that too.

Thanks,
Jerome Glisse April 23, 2019, 3:34 p.m. UTC | #7
On Tue, Apr 23, 2019 at 11:00:30AM +0800, Peter Xu wrote:
> [...]
> 
> I would prefer the 1st approach: it contains fewer lines of change
> because it does not touch the callers, and the naming in the 2nd
> approach can be a bit confusing (calling do_userfaultfd_wp_page() in
> handle_pte_fault() may make people think of a userfault-only path,
> while it actually covers the general path).  But if you really like
> the 2nd one I can use that too.

Maybe move the userfaultfd code into a small helper, call it first at
the call sites of do_wp_page(), and only call do_wp_page() if it does
not fire, i.e.:

bool do_userfaultfd_wp(struct vm_fault *vmf, vm_fault_t *ret)
{
    /* "handleuserfault" stands in for the userfaultfd_pte_wp() check;
     * ret is an out parameter so the callers below can propagate it. */
    if (handleuserfault) {
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        *ret = handle_userfault(vmf, VM_UFFD_WP);
        return true;
    }
    return false;
}

then
     if (vmf->flags & FAULT_FLAG_WRITE) {
            if (do_userfaultfd_wp(vmf, &tmp)) {
                ret |= tmp;
            } else
                ret |= do_wp_page(vmf);
            if (ret & VM_FAULT_ERROR)
                ret &= VM_FAULT_ERROR;
            goto out;

and:
    if (vmf->flags & FAULT_FLAG_WRITE) {
        if (!pte_write(entry)) {
            if (do_userfaultfd_wp(vmf, &ret))
                return ret;
            else
                return do_wp_page(vmf);
        }

Cheers,
Jérôme
Peter Xu April 24, 2019, 8:38 a.m. UTC | #8
On Tue, Apr 23, 2019 at 11:34:56AM -0400, Jerome Glisse wrote:
> 
> [...]
> 
> Maybe move the userfaultfd code into a small helper, call it first at
> the call sites of do_wp_page(), and only call do_wp_page() if it does
> not fire, i.e.:
> 
> bool do_userfaultfd_wp(struct vm_fault *vmf, vm_fault_t *ret)
> {
>     /* "handleuserfault" stands in for the userfaultfd_pte_wp() check;
>      * ret is an out parameter so the callers below can propagate it. */
>     if (handleuserfault) {
>         pte_unmap_unlock(vmf->pte, vmf->ptl);
>         *ret = handle_userfault(vmf, VM_UFFD_WP);
>         return true;
>     }
>     return false;
> }
> 
> then
>      if (vmf->flags & FAULT_FLAG_WRITE) {
>             if (do_userfaultfd_wp(vmf, &tmp)) {
>                 ret |= tmp;
>             } else
>                 ret |= do_wp_page(vmf);
>             if (ret & VM_FAULT_ERROR)
>                 ret &= VM_FAULT_ERROR;
>             goto out;
> 
> and:
>     if (vmf->flags & FAULT_FLAG_WRITE) {
>         if (!pte_write(entry)) {
>             if (do_userfaultfd_wp(vmf, &ret))
>                 return ret;
>             else
>                 return do_wp_page(vmf);
>         }

But then we would be duplicating the code pattern at both call sites? :-/

I'll think it over...  Thanks for all these suggestions!

Patch

diff --git a/mm/memory.c b/mm/memory.c
index e7a4b9650225..b8a4c0bab461 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2291,7 +2291,10 @@  vm_fault_t wp_page_copy(struct vm_fault *vmf)
 		}
 		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
 		entry = mk_pte(new_page, vma->vm_page_prot);
-		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		if (pte_uffd_wp(vmf->orig_pte))
+			entry = pte_mkuffd_wp(entry);
+		else
+			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		/*
 		 * Clear the pte entry and flush it first, before updating the
 		 * pte with the new entry. This will avoid a race condition
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 9d4433044c21..855dddb07ff2 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -73,18 +73,18 @@  static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 	flush_tlb_batched_pending(vma->vm_mm);
 	arch_enter_lazy_mmu_mode();
 	do {
+retry_pte:
 		oldpte = *pte;
 		if (pte_present(oldpte)) {
 			pte_t ptent;
 			bool preserve_write = prot_numa && pte_write(oldpte);
+			struct page *page;
 
 			/*
 			 * Avoid trapping faults against the zero or KSM
 			 * pages. See similar comment in change_huge_pmd.
 			 */
 			if (prot_numa) {
-				struct page *page;
-
 				page = vm_normal_page(vma, addr, oldpte);
 				if (!page || PageKsm(page))
 					continue;
@@ -114,6 +114,54 @@  static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 					continue;
 			}
 
+			/*
+			 * Detect whether we'll need to COW before
+			 * resolving an uffd-wp fault.  Note that this
+			 * includes detection of the zero page (where
+			 * page==NULL)
+			 */
+			if (uffd_wp_resolve) {
+				/* If the fault is resolved already, skip */
+				if (!pte_uffd_wp(*pte))
+					continue;
+				page = vm_normal_page(vma, addr, oldpte);
+				if (!page || page_mapcount(page) > 1) {
+					struct vm_fault vmf = {
+						.vma = vma,
+						.address = addr & PAGE_MASK,
+						.page = page,
+						.orig_pte = oldpte,
+						.pmd = pmd,
+						/* pte and ptl not needed */
+					};
+					vm_fault_t ret;
+
+					if (page)
+						get_page(page);
+					arch_leave_lazy_mmu_mode();
+					pte_unmap_unlock(pte, ptl);
+					ret = wp_page_copy(&vmf);
+					/* PTE is changed, or OOM */
+					if (ret == 0)
+						/* It's done by others */
+						continue;
+					else if (WARN_ON(ret != VM_FAULT_WRITE))
+						return pages;
+					pte = pte_offset_map_lock(vma->vm_mm,
+								  pmd, addr,
+								  &ptl);
+					arch_enter_lazy_mmu_mode();
+					if (!pte_present(*pte))
+						/*
+						 * This PTE could have been
+						 * modified after COW
+						 * before we have taken the
+						 * lock; retry this PTE
+						 */
+						goto retry_pte;
+				}
+			}
+
 			ptent = ptep_modify_prot_start(mm, addr, pte);
 			ptent = pte_modify(ptent, newprot);
 			if (preserve_write)
@@ -183,6 +231,7 @@  static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 	unsigned long pages = 0;
 	unsigned long nr_huge_updates = 0;
 	struct mmu_notifier_range range;
+	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
 
 	range.start = 0;
 
@@ -202,7 +251,16 @@  static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 		}
 
 		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
-			if (next - addr != HPAGE_PMD_SIZE) {
+			/*
+			 * When resolving a userfaultfd write
+			 * protection fault, it's not easy to identify
+			 * whether a THP is shared with others and
+			 * whether we'll need to do copy-on-write, so
+			 * just split it always for now to simplify
+			 * the procedure.  And that's the policy too for
+			 * general THP write-protect in af9e4d5f2de2.
+			 */
+			if (next - addr != HPAGE_PMD_SIZE || uffd_wp_resolve) {
 				__split_huge_pmd(vma, pmd, addr, false, NULL);
 			} else {
 				int nr_ptes = change_huge_pmd(vma, pmd, addr,