
[v2,14/26] userfaultfd: wp: handle COW properly for uffd-wp

Message ID 20190212025632.28946-15-peterx@redhat.com (mailing list archive)
State New, archived
Series userfaultfd: write protection support

Commit Message

Peter Xu Feb. 12, 2019, 2:56 a.m. UTC
This allows uffd-wp to support write-protected pages for COW.

For example, a uffd write-protected PTE could also be write-protected
for other reasons, such as COW or the zero page.  When that happens, we
can't simply set the write bit in the PTE, since that would change the
content seen by every other reference to the page.  Instead, we should
do the COW first if necessary, and only then handle the uffd-wp fault.

To correctly copy the page, we'll also need to carry over the
_PAGE_UFFD_WP bit if it was set in the original PTE.

For huge PMDs, we always split the huge PMD when we need to resolve
an uffd-wp page fault on it.  That matches what we do for general huge
PMD write protection.  This way, the huge PMD copy-on-write problem is
reduced to PTE copy-on-write.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 mm/memory.c   |  2 ++
 mm/mprotect.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 54 insertions(+), 3 deletions(-)
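
In short, the PTE-level resolve path added by this patch has the
following shape (a simplified sketch only; locking and the actual
write-bit/uffd-wp-bit updates are left out, see the diff below):

	if (uffd_wp_resolve) {
		/* Nothing to do if the fault is already resolved */
		if (!pte_uffd_wp(*pte))
			continue;
		page = vm_normal_page(vma, addr, oldpte);
		/* Zero page or COW-shared page: break COW first */
		if (!page || page_mapcount(page) > 1)
			wp_page_copy(&vmf);
		/* only now is it safe to drop _PAGE_UFFD_WP and let
		   the PTE become writable again */
	}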

Comments

Jerome Glisse Feb. 21, 2019, 6:04 p.m. UTC | #1
On Tue, Feb 12, 2019 at 10:56:20AM +0800, Peter Xu wrote:
> This allows uffd-wp to support write-protected pages for COW.
> 
> For example, the uffd write-protected PTE could also be write-protected
> by other usages like COW or zero pages.  When that happens, we can't
> simply set the write bit in the PTE since otherwise it'll change the
> content of every single reference to the page.  Instead, we should do
> the COW first if necessary, then handle the uffd-wp fault.
> 
> To correctly copy the page, we'll also need to carry over the
> _PAGE_UFFD_WP bit if it was set in the original PTE.
> 
> For huge PMDs, we just simply split the huge PMDs where we want to
> resolve an uffd-wp page fault always.  That matches what we do with
> general huge PMD write protections.  In that way, we resolved the huge
> PMD copy-on-write issue into PTE copy-on-write.
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>

A few comments, see below.

> ---
>  mm/memory.c   |  2 ++
>  mm/mprotect.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++---
>  2 files changed, 54 insertions(+), 3 deletions(-)
> 
> diff --git a/mm/memory.c b/mm/memory.c
> index 32d32b6e6339..b5d67bafae35 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2291,6 +2291,8 @@ vm_fault_t wp_page_copy(struct vm_fault *vmf)
>  		}
>  		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
>  		entry = mk_pte(new_page, vma->vm_page_prot);
> +		if (pte_uffd_wp(vmf->orig_pte))
> +			entry = pte_mkuffd_wp(entry);
>  		entry = maybe_mkwrite(pte_mkdirty(entry), vma);

This looks wrong to me, isn't the uffd_wp flag cleared on a writable
PTE?  If so it would be clearer to have something like:

 +		if (pte_uffd_wp(vmf->orig_pte))
 +			entry = pte_mkuffd_wp(entry);
 +		else
 + 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 -		entry = maybe_mkwrite(pte_mkdirty(entry), vma);

>  		/*
>  		 * Clear the pte entry and flush it first, before updating the
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index 9d4433044c21..ae93721f3795 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -77,14 +77,13 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
>  		if (pte_present(oldpte)) {
>  			pte_t ptent;
>  			bool preserve_write = prot_numa && pte_write(oldpte);
> +			struct page *page;
>  
>  			/*
>  			 * Avoid trapping faults against the zero or KSM
>  			 * pages. See similar comment in change_huge_pmd.
>  			 */
>  			if (prot_numa) {
> -				struct page *page;
> -
>  				page = vm_normal_page(vma, addr, oldpte);
>  				if (!page || PageKsm(page))
>  					continue;
> @@ -114,6 +113,46 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
>  					continue;
>  			}
>  
> +			/*
> +			 * Detect whether we'll need to COW before
> +			 * resolving an uffd-wp fault.  Note that this
> +			 * includes detection of the zero page (where
> +			 * page==NULL)
> +			 */
> +			if (uffd_wp_resolve) {
> +				/* If the fault is resolved already, skip */
> +				if (!pte_uffd_wp(*pte))
> +					continue;
> +				page = vm_normal_page(vma, addr, oldpte);
> +				if (!page || page_mapcount(page) > 1) {

This is wrong: if you allow page to be NULL then you are going to
segfault in wp_page_copy() below.  Are you sure you want to test for
a special page?  For anonymous memory this should never happen, i.e.
anon pages are always regular pages.  So if you only allow userfaultfd
to write-protect anonymous VMAs then there is no point in testing
here, besides maybe a BUG_ON() just in case ...

> +					struct vm_fault vmf = {
> +						.vma = vma,
> +						.address = addr & PAGE_MASK,
> +						.page = page,
> +						.orig_pte = oldpte,
> +						.pmd = pmd,
> +						/* pte and ptl not needed */
> +					};
> +					vm_fault_t ret;
> +
> +					if (page)
> +						get_page(page);
> +					arch_leave_lazy_mmu_mode();
> +					pte_unmap_unlock(pte, ptl);
> +					ret = wp_page_copy(&vmf);
> +					/* PTE is changed, or OOM */
> +					if (ret == 0)
> +						/* It's done by others */
> +						continue;
> +					else if (WARN_ON(ret != VM_FAULT_WRITE))
> +						return pages;
> +					pte = pte_offset_map_lock(vma->vm_mm,
> +								  pmd, addr,
> +								  &ptl);

Here you remap the PTE locked, but you are not checking whether the
PTE is the one you expect, i.e. is it pointing to the copied page and
does it have the expected uffd_wp flag?  Another thread might have
raced between the time you called wp_page_copy() and the time you
called pte_offset_map_lock().  I have not checked the mmap_sem, so
maybe you are protected by it since mprotect takes it in write mode
IIRC; if so you should add a comment at the very least so people do
not see this as a bug.


> +					arch_enter_lazy_mmu_mode();
> +				}
> +			}
> +
>  			ptent = ptep_modify_prot_start(mm, addr, pte);
>  			ptent = pte_modify(ptent, newprot);
>  			if (preserve_write)
> @@ -183,6 +222,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
>  	unsigned long pages = 0;
>  	unsigned long nr_huge_updates = 0;
>  	struct mmu_notifier_range range;
> +	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
>  
>  	range.start = 0;
>  
> @@ -202,7 +242,16 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
>  		}
>  
>  		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
> -			if (next - addr != HPAGE_PMD_SIZE) {
> +			/*
> +			 * When resolving an userfaultfd write
> +			 * protection fault, it's not easy to identify
> +			 * whether a THP is shared with others and
> +			 * whether we'll need to do copy-on-write, so
> +			 * just split it always for now to simply the
> +			 * procedure.  And that's the policy too for
> +			 * general THP write-protect in af9e4d5f2de2.
> +			 */
> +			if (next - addr != HPAGE_PMD_SIZE || uffd_wp_resolve) {

Using parentheses maybe? :)
            if ((next - addr != HPAGE_PMD_SIZE) || uffd_wp_resolve) {
Peter Xu Feb. 22, 2019, 8:46 a.m. UTC | #2
On Thu, Feb 21, 2019 at 01:04:24PM -0500, Jerome Glisse wrote:
> On Tue, Feb 12, 2019 at 10:56:20AM +0800, Peter Xu wrote:
> > This allows uffd-wp to support write-protected pages for COW.
> > 
> > For example, the uffd write-protected PTE could also be write-protected
> > by other usages like COW or zero pages.  When that happens, we can't
> > simply set the write bit in the PTE since otherwise it'll change the
> > content of every single reference to the page.  Instead, we should do
> > the COW first if necessary, then handle the uffd-wp fault.
> > 
> > To correctly copy the page, we'll also need to carry over the
> > _PAGE_UFFD_WP bit if it was set in the original PTE.
> > 
> > For huge PMDs, we just simply split the huge PMDs where we want to
> > resolve an uffd-wp page fault always.  That matches what we do with
> > general huge PMD write protections.  In that way, we resolved the huge
> > PMD copy-on-write issue into PTE copy-on-write.
> > 
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> 
> Few comments see below.
> 
> > ---
> >  mm/memory.c   |  2 ++
> >  mm/mprotect.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++---
> >  2 files changed, 54 insertions(+), 3 deletions(-)
> > 
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 32d32b6e6339..b5d67bafae35 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -2291,6 +2291,8 @@ vm_fault_t wp_page_copy(struct vm_fault *vmf)
> >  		}
> >  		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
> >  		entry = mk_pte(new_page, vma->vm_page_prot);
> > +		if (pte_uffd_wp(vmf->orig_pte))
> > +			entry = pte_mkuffd_wp(entry);
> >  		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> 
> This looks wrong to me, isn't the uffd_wp flag clear on writeable pte ?
> If so it would be clearer to have something like:
> 
>  +		if (pte_uffd_wp(vmf->orig_pte))
>  +			entry = pte_mkuffd_wp(entry);
>  +		else
>  + 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>  -		entry = maybe_mkwrite(pte_mkdirty(entry), vma);

Yeah, this seems clearer indeed.  The thing is that no matter whether
we set the write bit here or not, we'll always set it again later on,
simply because COW of uffd-wp pages only happens when resolving the wp
page fault (and then we do want to set the write bit in all cases).
Anyway, I do like your suggestion and I'll fix it.

> 
> >  		/*
> >  		 * Clear the pte entry and flush it first, before updating the
> > diff --git a/mm/mprotect.c b/mm/mprotect.c
> > index 9d4433044c21..ae93721f3795 100644
> > --- a/mm/mprotect.c
> > +++ b/mm/mprotect.c
> > @@ -77,14 +77,13 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
> >  		if (pte_present(oldpte)) {
> >  			pte_t ptent;
> >  			bool preserve_write = prot_numa && pte_write(oldpte);
> > +			struct page *page;
> >  
> >  			/*
> >  			 * Avoid trapping faults against the zero or KSM
> >  			 * pages. See similar comment in change_huge_pmd.
> >  			 */
> >  			if (prot_numa) {
> > -				struct page *page;
> > -
> >  				page = vm_normal_page(vma, addr, oldpte);
> >  				if (!page || PageKsm(page))
> >  					continue;
> > @@ -114,6 +113,46 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
> >  					continue;
> >  			}
> >  
> > +			/*
> > +			 * Detect whether we'll need to COW before
> > +			 * resolving an uffd-wp fault.  Note that this
> > +			 * includes detection of the zero page (where
> > +			 * page==NULL)
> > +			 */
> > +			if (uffd_wp_resolve) {
> > +				/* If the fault is resolved already, skip */
> > +				if (!pte_uffd_wp(*pte))
> > +					continue;
> > +				page = vm_normal_page(vma, addr, oldpte);
> > +				if (!page || page_mapcount(page) > 1) {
> 
> This is wrong, if you allow page to be NULL then you gonna segfault
> in wp_page_copy() down below. Are you sure you want to test for
> special page ? For anonymous memory this should never happens ie
> anon page always are regular page. So if you allow userfaulfd to
> write protect only anonymous vma then there is no point in testing
> here beside maybe a BUG_ON() just in case ...

It's mainly for zero pages, where page can be NULL.  Would this be
clearer:

  if (is_zero_pfn(pte_pfn(oldpte)) || (page && page_mapcount(page) > 1))

?

Now we treat zero pages as normal COW pages, so we'll do the COW here
even for zero pages.  Maybe we could special-case zero pages all over
the place instead (e.g., not write-protect a PTE at all when we detect
the zero PFN), but I'm not sure that's what we want, so I chose to
start with the current solution, at least to get the functionality
working first.
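
Spelled out in context (illustration only, matching the mapcount
test of the original patch), that would read something like:

	page = vm_normal_page(vma, addr, oldpte);
	/* COW first if this is the zero page or a shared page */
	if (is_zero_pfn(pte_pfn(oldpte)) ||
	    (page && page_mapcount(page) > 1)) {
		/* ... trigger wp_page_copy() as below ... */
	}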

> 
> > +					struct vm_fault vmf = {
> > +						.vma = vma,
> > +						.address = addr & PAGE_MASK,
> > +						.page = page,
> > +						.orig_pte = oldpte,
> > +						.pmd = pmd,
> > +						/* pte and ptl not needed */
> > +					};
> > +					vm_fault_t ret;
> > +
> > +					if (page)
> > +						get_page(page);
> > +					arch_leave_lazy_mmu_mode();
> > +					pte_unmap_unlock(pte, ptl);
> > +					ret = wp_page_copy(&vmf);
> > +					/* PTE is changed, or OOM */
> > +					if (ret == 0)
> > +						/* It's done by others */
> > +						continue;
> > +					else if (WARN_ON(ret != VM_FAULT_WRITE))
> > +						return pages;
> > +					pte = pte_offset_map_lock(vma->vm_mm,
> > +								  pmd, addr,
> > +								  &ptl);
> 
> Here you remap the pte locked but you are not checking if the pte is
> the one you expect ie is it pointing to the copied page and does it
> have expect uffd_wp flag. Another thread might have raced between the
> time you called wp_page_copy() and the time you pte_offset_map_lock()
> I have not check the mmap_sem so maybe you are protected by it as
> mprotect is taking it in write mode IIRC, if so you should add a
> comments at very least so people do not see this as a bug.

Thanks for spotting this.  On the normal uffd-wp page fault handling
path we only hold the read lock (and I would suspect it's racy even
with the write lock...).  I agree that there can be a race right
after the COW is done.

Here IMHO we'll be fine as long as the PTE is still present; in other
words, we should be able to tolerate PTE changes as long as it stays
present, otherwise we'll need to retry this single PTE (e.g., the page
can quickly be turned into a migration swap entry, or the page could
even be freed beneath us).  Does the change below look good to you to
be squashed into this patch?

diff --git a/mm/mprotect.c b/mm/mprotect.c
index 73a65f07fe41..3423f9692838 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -73,6 +73,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,                                                              
        flush_tlb_batched_pending(vma->vm_mm);
        arch_enter_lazy_mmu_mode();
        do {
+retry_pte:
                oldpte = *pte;
                if (pte_present(oldpte)) {
                        pte_t ptent;
@@ -149,6 +150,13 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,                                                           
                                        pte = pte_offset_map_lock(vma->vm_mm,
                                                                  pmd, addr,
                                                                  &ptl);
+                                       if (!pte_present(*pte))
+                                               /*
+                                                * This PTE could have
+                                                * been modified when COW;
+                                                * retry it
+                                                */
+                                               goto retry_pte;
                                        arch_enter_lazy_mmu_mode();
                                }
                        }

[...]

> > @@ -202,7 +242,16 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
> >  		}
> >  
> >  		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
> > -			if (next - addr != HPAGE_PMD_SIZE) {
> > +			/*
> > +			 * When resolving an userfaultfd write
> > +			 * protection fault, it's not easy to identify
> > +			 * whether a THP is shared with others and
> > +			 * whether we'll need to do copy-on-write, so
> > +			 * just split it always for now to simply the
> > +			 * procedure.  And that's the policy too for
> > +			 * general THP write-protect in af9e4d5f2de2.
> > +			 */
> > +			if (next - addr != HPAGE_PMD_SIZE || uffd_wp_resolve) {
> 
> Using parenthesis maybe ? :)
>             if ((next - addr != HPAGE_PMD_SIZE) || uffd_wp_resolve) {

Sure, will fix it.

Thanks,
Jerome Glisse Feb. 22, 2019, 3:35 p.m. UTC | #3
On Fri, Feb 22, 2019 at 04:46:03PM +0800, Peter Xu wrote:
> On Thu, Feb 21, 2019 at 01:04:24PM -0500, Jerome Glisse wrote:
> > On Tue, Feb 12, 2019 at 10:56:20AM +0800, Peter Xu wrote:
> > > This allows uffd-wp to support write-protected pages for COW.

[...]

> > > diff --git a/mm/mprotect.c b/mm/mprotect.c
> > > index 9d4433044c21..ae93721f3795 100644
> > > --- a/mm/mprotect.c
> > > +++ b/mm/mprotect.c
> > > @@ -77,14 +77,13 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
> > >  		if (pte_present(oldpte)) {
> > >  			pte_t ptent;
> > >  			bool preserve_write = prot_numa && pte_write(oldpte);
> > > +			struct page *page;
> > >  
> > >  			/*
> > >  			 * Avoid trapping faults against the zero or KSM
> > >  			 * pages. See similar comment in change_huge_pmd.
> > >  			 */
> > >  			if (prot_numa) {
> > > -				struct page *page;
> > > -
> > >  				page = vm_normal_page(vma, addr, oldpte);
> > >  				if (!page || PageKsm(page))
> > >  					continue;
> > > @@ -114,6 +113,46 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
> > >  					continue;
> > >  			}
> > >  
> > > +			/*
> > > +			 * Detect whether we'll need to COW before
> > > +			 * resolving an uffd-wp fault.  Note that this
> > > +			 * includes detection of the zero page (where
> > > +			 * page==NULL)
> > > +			 */
> > > +			if (uffd_wp_resolve) {
> > > +				/* If the fault is resolved already, skip */
> > > +				if (!pte_uffd_wp(*pte))
> > > +					continue;
> > > +				page = vm_normal_page(vma, addr, oldpte);
> > > +				if (!page || page_mapcount(page) > 1) {
> > 
> > This is wrong, if you allow page to be NULL then you gonna segfault
> > in wp_page_copy() down below. Are you sure you want to test for
> > special page ? For anonymous memory this should never happens ie
> > anon page always are regular page. So if you allow userfaulfd to
> > write protect only anonymous vma then there is no point in testing
> > here beside maybe a BUG_ON() just in case ...
> 
> It's majorly for zero pages where page can be NULL.  Would this be
> clearer:
> 
>   if (is_zero_pfn(pte_pfn(old_pte)) || (page && page_mapcount(page)))
> 
> ?
> 
> Now we treat zero pages as normal COW pages so we'll do COW here even
> for zero pages.  I think maybe we can do special handling on all over
> the places for zero pages (e.g., we don't write protect a PTE if we
> detected that this is the zero PFN) but I'm uncertain on whether
> that's what we want, so I chose to start with current solution at
> least to achieve functionality first.

You can keep the vm_normal_page() in that case, but split the if
between page == NULL and page != NULL with mapcount > 1.  Otherwise
you will segfault below.


> 
> > 
> > > +					struct vm_fault vmf = {
> > > +						.vma = vma,
> > > +						.address = addr & PAGE_MASK,
> > > +						.page = page,
> > > +						.orig_pte = oldpte,
> > > +						.pmd = pmd,
> > > +						/* pte and ptl not needed */
> > > +					};
> > > +					vm_fault_t ret;
> > > +
> > > +					if (page)
> > > +						get_page(page);
> > > +					arch_leave_lazy_mmu_mode();
> > > +					pte_unmap_unlock(pte, ptl);
> > > +					ret = wp_page_copy(&vmf);
> > > +					/* PTE is changed, or OOM */
> > > +					if (ret == 0)
> > > +						/* It's done by others */
> > > +						continue;
> > > +					else if (WARN_ON(ret != VM_FAULT_WRITE))
> > > +						return pages;
> > > +					pte = pte_offset_map_lock(vma->vm_mm,
> > > +								  pmd, addr,
> > > +								  &ptl);
> > 
> > Here you remap the pte locked but you are not checking if the pte is
> > the one you expect ie is it pointing to the copied page and does it
> > have expect uffd_wp flag. Another thread might have raced between the
> > time you called wp_page_copy() and the time you pte_offset_map_lock()
> > I have not check the mmap_sem so maybe you are protected by it as
> > mprotect is taking it in write mode IIRC, if so you should add a
> > comments at very least so people do not see this as a bug.
> 
> Thanks for spotting this.  With nornal uffd-wp page fault handling
> path we're only with read lock held (and I would suspect it's racy
> even with write lock...).  I agree that there can be a race right
> after the COW has done.
> 
> Here IMHO we'll be fine as long as it's still a present PTE, in other
> words, we should be able to tolerate PTE changes as long as it's still
> present otherwise we'll need to retry this single PTE (e.g., the page
> can be quickly marked as migrating swap entry, or even the page could
> be freed beneath us).  Do you think below change look good to you to
> be squashed into this patch?

Ok, but the below if must come after arch_enter_lazy_mmu_mode(), not before.

> 
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index 73a65f07fe41..3423f9692838 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -73,6 +73,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,                                                              
>         flush_tlb_batched_pending(vma->vm_mm);
>         arch_enter_lazy_mmu_mode();
>         do {
> +retry_pte:
>                 oldpte = *pte;
>                 if (pte_present(oldpte)) {
>                         pte_t ptent;
> @@ -149,6 +150,13 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,                                                           
>                                         pte = pte_offset_map_lock(vma->vm_mm,
>                                                                   pmd, addr,
>                                                                   &ptl);
> +                                       if (!pte_present(*pte))
> +                                               /*
> +                                                * This PTE could have
> +                                                * been modified when COW;
> +                                                * retry it
> +                                                */
> +                                               goto retry_pte;
>                                         arch_enter_lazy_mmu_mode();
>                                 }
>                         }
Peter Xu Feb. 25, 2019, 7:13 a.m. UTC | #4
On Fri, Feb 22, 2019 at 10:35:09AM -0500, Jerome Glisse wrote:
> On Fri, Feb 22, 2019 at 04:46:03PM +0800, Peter Xu wrote:
> > On Thu, Feb 21, 2019 at 01:04:24PM -0500, Jerome Glisse wrote:
> > > On Tue, Feb 12, 2019 at 10:56:20AM +0800, Peter Xu wrote:
> > > > This allows uffd-wp to support write-protected pages for COW.
> 
> [...]
> 
> > > > diff --git a/mm/mprotect.c b/mm/mprotect.c
> > > > index 9d4433044c21..ae93721f3795 100644
> > > > --- a/mm/mprotect.c
> > > > +++ b/mm/mprotect.c
> > > > @@ -77,14 +77,13 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
> > > >  		if (pte_present(oldpte)) {
> > > >  			pte_t ptent;
> > > >  			bool preserve_write = prot_numa && pte_write(oldpte);
> > > > +			struct page *page;
> > > >  
> > > >  			/*
> > > >  			 * Avoid trapping faults against the zero or KSM
> > > >  			 * pages. See similar comment in change_huge_pmd.
> > > >  			 */
> > > >  			if (prot_numa) {
> > > > -				struct page *page;
> > > > -
> > > >  				page = vm_normal_page(vma, addr, oldpte);
> > > >  				if (!page || PageKsm(page))
> > > >  					continue;
> > > > @@ -114,6 +113,46 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
> > > >  					continue;
> > > >  			}
> > > >  
> > > > +			/*
> > > > +			 * Detect whether we'll need to COW before
> > > > +			 * resolving an uffd-wp fault.  Note that this
> > > > +			 * includes detection of the zero page (where
> > > > +			 * page==NULL)
> > > > +			 */
> > > > +			if (uffd_wp_resolve) {
> > > > +				/* If the fault is resolved already, skip */
> > > > +				if (!pte_uffd_wp(*pte))
> > > > +					continue;
> > > > +				page = vm_normal_page(vma, addr, oldpte);
> > > > +				if (!page || page_mapcount(page) > 1) {
> > > 
> > > This is wrong, if you allow page to be NULL then you gonna segfault
> > > in wp_page_copy() down below. Are you sure you want to test for
> > > special page ? For anonymous memory this should never happens ie
> > > anon page always are regular page. So if you allow userfaulfd to
> > > write protect only anonymous vma then there is no point in testing
> > > here beside maybe a BUG_ON() just in case ...
> > 
> > It's majorly for zero pages where page can be NULL.  Would this be
> > clearer:
> > 
> >   if (is_zero_pfn(pte_pfn(old_pte)) || (page && page_mapcount(page)))
> > 
> > ?
> > 
> > Now we treat zero pages as normal COW pages so we'll do COW here even
> > for zero pages.  I think maybe we can do special handling on all over
> > the places for zero pages (e.g., we don't write protect a PTE if we
> > detected that this is the zero PFN) but I'm uncertain on whether
> > that's what we want, so I chose to start with current solution at
> > least to achieve functionality first.
> 
> You can keep the vm_normal_page() in that case but split the if
> between page == NULL and page != NULL with mapcount > 1. As other-
> wise you will segfault below.

Could I ask what segfault you mean?  My understanding is that the
code below already takes page==NULL into consideration, e.g., we only
do get_page() if page!=NULL, and wp_page_copy() has similar checks
inside.

> 
> 
> > 
> > > 
> > > > +					struct vm_fault vmf = {
> > > > +						.vma = vma,
> > > > +						.address = addr & PAGE_MASK,
> > > > +						.page = page,
> > > > +						.orig_pte = oldpte,
> > > > +						.pmd = pmd,
> > > > +						/* pte and ptl not needed */
> > > > +					};
> > > > +					vm_fault_t ret;
> > > > +
> > > > +					if (page)
> > > > +						get_page(page);
> > > > +					arch_leave_lazy_mmu_mode();
> > > > +					pte_unmap_unlock(pte, ptl);
> > > > +					ret = wp_page_copy(&vmf);
> > > > +					/* PTE is changed, or OOM */
> > > > +					if (ret == 0)
> > > > +						/* It's done by others */
> > > > +						continue;
> > > > +					else if (WARN_ON(ret != VM_FAULT_WRITE))
> > > > +						return pages;
> > > > +					pte = pte_offset_map_lock(vma->vm_mm,
> > > > +								  pmd, addr,
> > > > +								  &ptl);
> > > 
> > > Here you remap the pte locked but you are not checking if the pte is
> > > the one you expect ie is it pointing to the copied page and does it
> > > have expect uffd_wp flag. Another thread might have raced between the
> > > time you called wp_page_copy() and the time you pte_offset_map_lock()
> > > I have not check the mmap_sem so maybe you are protected by it as
> > > mprotect is taking it in write mode IIRC, if so you should add a
> > > comments at very least so people do not see this as a bug.
> > 
> > Thanks for spotting this.  With nornal uffd-wp page fault handling
> > path we're only with read lock held (and I would suspect it's racy
> > even with write lock...).  I agree that there can be a race right
> > after the COW has done.
> > 
> > Here IMHO we'll be fine as long as it's still a present PTE, in other
> > words, we should be able to tolerate PTE changes as long as it's still
> > present otherwise we'll need to retry this single PTE (e.g., the page
> > can be quickly marked as migrating swap entry, or even the page could
> > be freed beneath us).  Do you think below change look good to you to
> > be squashed into this patch?
> 
> Ok, but below if must be after arch_enter_lazy_mmu_mode(); not before.

Oops... you are right. :)

Thanks,

> 
> > 
> > diff --git a/mm/mprotect.c b/mm/mprotect.c
> > index 73a65f07fe41..3423f9692838 100644
> > --- a/mm/mprotect.c
> > +++ b/mm/mprotect.c
> > @@ -73,6 +73,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,                                                              
> >         flush_tlb_batched_pending(vma->vm_mm);
> >         arch_enter_lazy_mmu_mode();
> >         do {
> > +retry_pte:
> >                 oldpte = *pte;
> >                 if (pte_present(oldpte)) {
> >                         pte_t ptent;
> > @@ -149,6 +150,13 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,                                                           
> >                                         pte = pte_offset_map_lock(vma->vm_mm,
> >                                                                   pmd, addr,
> >                                                                   &ptl);
> > +                                       if (!pte_present(*pte))
> > +                                               /*
> > +                                                * This PTE could have
> > +                                                * been modified when COW;
> > +                                                * retry it
> > +                                                */
> > +                                               goto retry_pte;
> >                                         arch_enter_lazy_mmu_mode();
> >                                 }
> >                         }
Jerome Glisse Feb. 25, 2019, 3:32 p.m. UTC | #5
On Mon, Feb 25, 2019 at 03:13:36PM +0800, Peter Xu wrote:
> On Fri, Feb 22, 2019 at 10:35:09AM -0500, Jerome Glisse wrote:
> > On Fri, Feb 22, 2019 at 04:46:03PM +0800, Peter Xu wrote:
> > > On Thu, Feb 21, 2019 at 01:04:24PM -0500, Jerome Glisse wrote:
> > > > On Tue, Feb 12, 2019 at 10:56:20AM +0800, Peter Xu wrote:
> > > > > This allows uffd-wp to support write-protected pages for COW.
> > 
> > [...]
> > 
> > > > > diff --git a/mm/mprotect.c b/mm/mprotect.c
> > > > > index 9d4433044c21..ae93721f3795 100644
> > > > > --- a/mm/mprotect.c
> > > > > +++ b/mm/mprotect.c
> > > > > @@ -77,14 +77,13 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
> > > > >  		if (pte_present(oldpte)) {
> > > > >  			pte_t ptent;
> > > > >  			bool preserve_write = prot_numa && pte_write(oldpte);
> > > > > +			struct page *page;
> > > > >  
> > > > >  			/*
> > > > >  			 * Avoid trapping faults against the zero or KSM
> > > > >  			 * pages. See similar comment in change_huge_pmd.
> > > > >  			 */
> > > > >  			if (prot_numa) {
> > > > > -				struct page *page;
> > > > > -
> > > > >  				page = vm_normal_page(vma, addr, oldpte);
> > > > >  				if (!page || PageKsm(page))
> > > > >  					continue;
> > > > > @@ -114,6 +113,46 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
> > > > >  					continue;
> > > > >  			}
> > > > >  
> > > > > +			/*
> > > > > +			 * Detect whether we'll need to COW before
> > > > > +			 * resolving an uffd-wp fault.  Note that this
> > > > > +			 * includes detection of the zero page (where
> > > > > +			 * page==NULL)
> > > > > +			 */
> > > > > +			if (uffd_wp_resolve) {
> > > > > +				/* If the fault is resolved already, skip */
> > > > > +				if (!pte_uffd_wp(*pte))
> > > > > +					continue;
> > > > > +				page = vm_normal_page(vma, addr, oldpte);
> > > > > +				if (!page || page_mapcount(page) > 1) {
> > > > 
> > > > This is wrong, if you allow page to be NULL then you gonna segfault
> > > > in wp_page_copy() down below. Are you sure you want to test for
> > > > special page ? For anonymous memory this should never happens ie
> > > > anon page always are regular page. So if you allow userfaulfd to
> > > > write protect only anonymous vma then there is no point in testing
> > > > here beside maybe a BUG_ON() just in case ...
> > > 
> > > It's majorly for zero pages where page can be NULL.  Would this be
> > > clearer:
> > > 
> > >   if (is_zero_pfn(pte_pfn(old_pte)) || (page && page_mapcount(page)))
> > > 
> > > ?
> > > 
> > > Now we treat zero pages as normal COW pages so we'll do COW here even
> > > for zero pages.  I think maybe we can do special handling on all over
> > > the places for zero pages (e.g., we don't write protect a PTE if we
> > > detected that this is the zero PFN) but I'm uncertain on whether
> > > that's what we want, so I chose to start with current solution at
> > > least to achieve functionality first.
> > 
> > You can keep the vm_normal_page() in that case but split the if
> > between page == NULL and page != NULL with mapcount > 1. As other-
> > wise you will segfault below.
> 
> Could I ask what's the segfault you mentioned?  My understanding is
> that below code has taken page==NULL into consideration already, e.g.,
> we only do get_page() if page!=NULL, and inside wp_page_copy() it has
> similar considerations.

In my memory wp_page_copy() would have freaked out on a NULL page,
but I checked that code again and it is fine.  So yes, you can take
that branch for a NULL page too.  Sorry, I trusted my memory too much.
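
(For reference, the zero-page/NULL case is handled at the top of
wp_page_copy() already, roughly:

	if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
		/* no old page to copy from: hand out a zeroed page */
		new_page = alloc_zeroed_user_highpage_movable(vma,
							      vmf->address);
		if (!new_page)
			goto oom;
	} else {
		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
					  vmf->address);
		if (!new_page)
			goto oom;
		cow_user_page(new_page, old_page, vmf->address, vma);
	}

and in this path a NULL vmf->page means the zero page, so it takes
the first branch.)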


> > > > > +					struct vm_fault vmf = {
> > > > > +						.vma = vma,
> > > > > +						.address = addr & PAGE_MASK,
> > > > > +						.page = page,
> > > > > +						.orig_pte = oldpte,
> > > > > +						.pmd = pmd,
> > > > > +						/* pte and ptl not needed */
> > > > > +					};
> > > > > +					vm_fault_t ret;
> > > > > +
> > > > > +					if (page)
> > > > > +						get_page(page);
> > > > > +					arch_leave_lazy_mmu_mode();
> > > > > +					pte_unmap_unlock(pte, ptl);
> > > > > +					ret = wp_page_copy(&vmf);
> > > > > +					/* PTE is changed, or OOM */
> > > > > +					if (ret == 0)
> > > > > +						/* It's done by others */
> > > > > +						continue;
> > > > > +					else if (WARN_ON(ret != VM_FAULT_WRITE))
> > > > > +						return pages;
> > > > > +					pte = pte_offset_map_lock(vma->vm_mm,
> > > > > +								  pmd, addr,
> > > > > +								  &ptl);
> > > > 
> > > > Here you remap the pte locked but you are not checking if the pte is
> > > > the one you expect ie is it pointing to the copied page and does it
> > > > have expect uffd_wp flag. Another thread might have raced between the
> > > > time you called wp_page_copy() and the time you pte_offset_map_lock()
> > > > I have not check the mmap_sem so maybe you are protected by it as
> > > > mprotect is taking it in write mode IIRC, if so you should add a
> > > > comments at very least so people do not see this as a bug.
> > > 
> > > Thanks for spotting this.  With nornal uffd-wp page fault handling
> > > path we're only with read lock held (and I would suspect it's racy
> > > even with write lock...).  I agree that there can be a race right
> > > after the COW has done.
> > > 
> > > Here IMHO we'll be fine as long as it's still a present PTE, in other
> > > words, we should be able to tolerate PTE changes as long as it's still
> > > present otherwise we'll need to retry this single PTE (e.g., the page
> > > can be quickly marked as migrating swap entry, or even the page could
> > > be freed beneath us).  Do you think below change look good to you to
> > > be squashed into this patch?
> > 
> > Ok, but below if must be after arch_enter_lazy_mmu_mode(); not before.
> 
> Oops... you are right. :)
> 
> Thanks,
> 
> > 
> > > 
> > > diff --git a/mm/mprotect.c b/mm/mprotect.c
> > > index 73a65f07fe41..3423f9692838 100644
> > > --- a/mm/mprotect.c
> > > +++ b/mm/mprotect.c
> > > @@ -73,6 +73,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,                                                              
> > >         flush_tlb_batched_pending(vma->vm_mm);
> > >         arch_enter_lazy_mmu_mode();
> > >         do {
> > > +retry_pte:
> > >                 oldpte = *pte;
> > >                 if (pte_present(oldpte)) {
> > >                         pte_t ptent;
> > > @@ -149,6 +150,13 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,                                                           
> > >                                         pte = pte_offset_map_lock(vma->vm_mm,
> > >                                                                   pmd, addr,
> > >                                                                   &ptl);
> > > +                                       if (!pte_present(*pte))
> > > +                                               /*
> > > +                                                * This PTE could have
> > > +                                                * been modified when COW;
> > > +                                                * retry it
> > > +                                                */
> > > +                                               goto retry_pte;
> > >                                         arch_enter_lazy_mmu_mode();
> > >                                 }
> > >                         }
> 
> -- 
> Peter Xu

Patch

diff --git a/mm/memory.c b/mm/memory.c
index 32d32b6e6339..b5d67bafae35 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2291,6 +2291,8 @@  vm_fault_t wp_page_copy(struct vm_fault *vmf)
 		}
 		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
 		entry = mk_pte(new_page, vma->vm_page_prot);
+		if (pte_uffd_wp(vmf->orig_pte))
+			entry = pte_mkuffd_wp(entry);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		/*
 		 * Clear the pte entry and flush it first, before updating the
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 9d4433044c21..ae93721f3795 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -77,14 +77,13 @@  static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		if (pte_present(oldpte)) {
 			pte_t ptent;
 			bool preserve_write = prot_numa && pte_write(oldpte);
+			struct page *page;
 
 			/*
 			 * Avoid trapping faults against the zero or KSM
 			 * pages. See similar comment in change_huge_pmd.
 			 */
 			if (prot_numa) {
-				struct page *page;
-
 				page = vm_normal_page(vma, addr, oldpte);
 				if (!page || PageKsm(page))
 					continue;
@@ -114,6 +113,46 @@  static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 					continue;
 			}
 
+			/*
+			 * Detect whether we'll need to COW before
+			 * resolving an uffd-wp fault.  Note that this
+			 * includes detection of the zero page (where
+			 * page==NULL)
+			 */
+			if (uffd_wp_resolve) {
+				/* If the fault is resolved already, skip */
+				if (!pte_uffd_wp(*pte))
+					continue;
+				page = vm_normal_page(vma, addr, oldpte);
+				if (!page || page_mapcount(page) > 1) {
+					struct vm_fault vmf = {
+						.vma = vma,
+						.address = addr & PAGE_MASK,
+						.page = page,
+						.orig_pte = oldpte,
+						.pmd = pmd,
+						/* pte and ptl not needed */
+					};
+					vm_fault_t ret;
+
+					if (page)
+						get_page(page);
+					arch_leave_lazy_mmu_mode();
+					pte_unmap_unlock(pte, ptl);
+					ret = wp_page_copy(&vmf);
+					/* PTE is changed, or OOM */
+					if (ret == 0)
+						/* It's done by others */
+						continue;
+					else if (WARN_ON(ret != VM_FAULT_WRITE))
+						return pages;
+					pte = pte_offset_map_lock(vma->vm_mm,
+								  pmd, addr,
+								  &ptl);
+					arch_enter_lazy_mmu_mode();
+				}
+			}
+
 			ptent = ptep_modify_prot_start(mm, addr, pte);
 			ptent = pte_modify(ptent, newprot);
 			if (preserve_write)
@@ -183,6 +222,7 @@  static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 	unsigned long pages = 0;
 	unsigned long nr_huge_updates = 0;
 	struct mmu_notifier_range range;
+	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
 
 	range.start = 0;
 
@@ -202,7 +242,16 @@  static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 		}
 
 		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
-			if (next - addr != HPAGE_PMD_SIZE) {
+			/*
+			 * When resolving a userfaultfd write
+			 * protection fault, it's not easy to identify
+			 * whether a THP is shared with others and
+			 * whether we'll need to do copy-on-write, so
+			 * just split it always for now to simplify the
+			 * procedure.  And that's the policy too for
+			 * general THP write-protect in af9e4d5f2de2.
+			 */
+			if (next - addr != HPAGE_PMD_SIZE || uffd_wp_resolve) {
 				__split_huge_pmd(vma, pmd, addr, false, NULL);
 			} else {
 				int nr_ptes = change_huge_pmd(vma, pmd, addr,