diff mbox series

[RFC,06/24] userfaultfd: wp: support write protection for userfault vma range

Message ID 20190121075722.7945-7-peterx@redhat.com (mailing list archive)
State New, archived
Headers show
Series userfaultfd: write protection support | expand

Commit Message

Peter Xu Jan. 21, 2019, 7:57 a.m. UTC
From: Shaohua Li <shli@fb.com>

Add API to enable/disable writeprotect a vma range. Unlike mprotect,
this doesn't split/merge vmas.

Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
---
 include/linux/userfaultfd_k.h |  2 ++
 mm/userfaultfd.c              | 52 +++++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+)

Comments

Mike Rapoport Jan. 21, 2019, 10:20 a.m. UTC | #1
On Mon, Jan 21, 2019 at 03:57:04PM +0800, Peter Xu wrote:
> From: Shaohua Li <shli@fb.com>
> 
> Add API to enable/disable writeprotect a vma range. Unlike mprotect,
> this doesn't split/merge vmas.
> 
> Cc: Andrea Arcangeli <aarcange@redhat.com>
> Cc: Pavel Emelyanov <xemul@parallels.com>
> Cc: Rik van Riel <riel@redhat.com>
> Cc: Kirill A. Shutemov <kirill@shutemov.name>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Hugh Dickins <hughd@google.com>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Signed-off-by: Shaohua Li <shli@fb.com>
> Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  include/linux/userfaultfd_k.h |  2 ++
>  mm/userfaultfd.c              | 52 +++++++++++++++++++++++++++++++++++
>  2 files changed, 54 insertions(+)
> 
> diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> index 38f748e7186e..e82f3156f4e9 100644
> --- a/include/linux/userfaultfd_k.h
> +++ b/include/linux/userfaultfd_k.h
> @@ -37,6 +37,8 @@ extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
>  			      unsigned long dst_start,
>  			      unsigned long len,
>  			      bool *mmap_changing);
> +extern int mwriteprotect_range(struct mm_struct *dst_mm,
> +		unsigned long start, unsigned long len, bool enable_wp);
> 
>  /* mm helpers */
>  static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index 458acda96f20..c38903f501c7 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -615,3 +615,55 @@ ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
>  {
>  	return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing);
>  }
> +
> +int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
> +	unsigned long len, bool enable_wp)
> +{
> +	struct vm_area_struct *dst_vma;
> +	pgprot_t newprot;
> +	int err;
> +
> +	/*
> +	 * Sanitize the command parameters:
> +	 */
> +	BUG_ON(start & ~PAGE_MASK);
> +	BUG_ON(len & ~PAGE_MASK);
> +
> +	/* Does the address range wrap, or is the span zero-sized? */
> +	BUG_ON(start + len <= start);
> +
> +	down_read(&dst_mm->mmap_sem);
> +
> +	/*
> +	 * Make sure the vma is not shared, that the dst range is
> +	 * both valid and fully within a single existing vma.
> +	 */
> +	err = -EINVAL;

In non-cooperative mode, there can be a race between VM layout changes and
mcopy_atomic [1]. I believe the same races are possible here, so can we
please make err = -ENOENT for consistency with mcopy?

> +	dst_vma = find_vma(dst_mm, start);
> +	if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
> +		goto out_unlock;
> +	if (start < dst_vma->vm_start ||
> +	    start + len > dst_vma->vm_end)
> +		goto out_unlock;
> +
> +	if (!dst_vma->vm_userfaultfd_ctx.ctx)
> +		goto out_unlock;
> +	if (!userfaultfd_wp(dst_vma))
> +		goto out_unlock;
> +
> +	if (!vma_is_anonymous(dst_vma))
> +		goto out_unlock;

The sanity checks here seem to repeat those in mcopy_atomic(). I'd suggest
splitting them out to a helper function.

> +	if (enable_wp)
> +		newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE));
> +	else
> +		newprot = vm_get_page_prot(dst_vma->vm_flags);
> +
> +	change_protection(dst_vma, start, start + len, newprot,
> +				!enable_wp, 0);
> +
> +	err = 0;
> +out_unlock:
> +	up_read(&dst_mm->mmap_sem);
> +	return err;
> +}
> -- 
> 2.17.1

[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=27d02568f529e908399514dfbee8ee43bdfd5299
Jerome Glisse Jan. 21, 2019, 2:05 p.m. UTC | #2
On Mon, Jan 21, 2019 at 03:57:04PM +0800, Peter Xu wrote:
> From: Shaohua Li <shli@fb.com>
> 
> Add API to enable/disable writeprotect a vma range. Unlike mprotect,
> this doesn't split/merge vmas.

AFAICT it does not do that.

> 
> Cc: Andrea Arcangeli <aarcange@redhat.com>
> Cc: Pavel Emelyanov <xemul@parallels.com>
> Cc: Rik van Riel <riel@redhat.com>
> Cc: Kirill A. Shutemov <kirill@shutemov.name>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Hugh Dickins <hughd@google.com>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Signed-off-by: Shaohua Li <shli@fb.com>
> Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  include/linux/userfaultfd_k.h |  2 ++
>  mm/userfaultfd.c              | 52 +++++++++++++++++++++++++++++++++++
>  2 files changed, 54 insertions(+)
> 
> diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> index 38f748e7186e..e82f3156f4e9 100644
> --- a/include/linux/userfaultfd_k.h
> +++ b/include/linux/userfaultfd_k.h
> @@ -37,6 +37,8 @@ extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
>  			      unsigned long dst_start,
>  			      unsigned long len,
>  			      bool *mmap_changing);
> +extern int mwriteprotect_range(struct mm_struct *dst_mm,
> +		unsigned long start, unsigned long len, bool enable_wp);
>  
>  /* mm helpers */
>  static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index 458acda96f20..c38903f501c7 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -615,3 +615,55 @@ ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
>  {
>  	return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing);
>  }
> +
> +int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
> +	unsigned long len, bool enable_wp)
> +{
> +	struct vm_area_struct *dst_vma;
> +	pgprot_t newprot;
> +	int err;
> +
> +	/*
> +	 * Sanitize the command parameters:
> +	 */
> +	BUG_ON(start & ~PAGE_MASK);
> +	BUG_ON(len & ~PAGE_MASK);
> +
> +	/* Does the address range wrap, or is the span zero-sized? */
> +	BUG_ON(start + len <= start);
> +
> +	down_read(&dst_mm->mmap_sem);
> +
> +	/*
> +	 * Make sure the vma is not shared, that the dst range is
> +	 * both valid and fully within a single existing vma.
> +	 */
> +	err = -EINVAL;
> +	dst_vma = find_vma(dst_mm, start);
> +	if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
> +		goto out_unlock;
> +	if (start < dst_vma->vm_start ||
> +	    start + len > dst_vma->vm_end)
> +		goto out_unlock;
> +
> +	if (!dst_vma->vm_userfaultfd_ctx.ctx)
> +		goto out_unlock;
> +	if (!userfaultfd_wp(dst_vma))
> +		goto out_unlock;
> +
> +	if (!vma_is_anonymous(dst_vma))
> +		goto out_unlock;
> +
> +	if (enable_wp)
> +		newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE));
> +	else
> +		newprot = vm_get_page_prot(dst_vma->vm_flags);
> +
> +	change_protection(dst_vma, start, start + len, newprot,
> +				!enable_wp, 0);

So setting dirty_accountable brings us to that code in mprotect.c:

    if (dirty_accountable && pte_dirty(ptent) &&
            (pte_soft_dirty(ptent) ||
             !(vma->vm_flags & VM_SOFTDIRTY))) {
        ptent = pte_mkwrite(ptent);
    }

My understanding is that you want to set the write flag when enable_wp
is false, and you want to set the write flag unconditionally, right?

If so then you should really move the change_protection() flags
patch before this patch and add a flag for setting pte write flags.

Otherwise the above is broken, as it will only set the write flag
for ptes that were dirty, and I am guessing that so far you were always
lucky because the ptes were all dirty (change_protection will preserve
dirtiness) when you write-protected them.

So I believe the above is broken, or at the very least unclear, if what
you really want is to only set the write flag on ptes that have the
dirty flag set.


Cheers,
Jérôme


> +
> +	err = 0;
> +out_unlock:
> +	up_read(&dst_mm->mmap_sem);
> +	return err;
> +}
> -- 
> 2.17.1
>
Peter Xu Jan. 22, 2019, 8:55 a.m. UTC | #3
On Mon, Jan 21, 2019 at 12:20:35PM +0200, Mike Rapoport wrote:
> On Mon, Jan 21, 2019 at 03:57:04PM +0800, Peter Xu wrote:
> > From: Shaohua Li <shli@fb.com>
> > 
> > Add API to enable/disable writeprotect a vma range. Unlike mprotect,
> > this doesn't split/merge vmas.
> > 
> > Cc: Andrea Arcangeli <aarcange@redhat.com>
> > Cc: Pavel Emelyanov <xemul@parallels.com>
> > Cc: Rik van Riel <riel@redhat.com>
> > Cc: Kirill A. Shutemov <kirill@shutemov.name>
> > Cc: Mel Gorman <mgorman@suse.de>
> > Cc: Hugh Dickins <hughd@google.com>
> > Cc: Johannes Weiner <hannes@cmpxchg.org>
> > Signed-off-by: Shaohua Li <shli@fb.com>
> > Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> >  include/linux/userfaultfd_k.h |  2 ++
> >  mm/userfaultfd.c              | 52 +++++++++++++++++++++++++++++++++++
> >  2 files changed, 54 insertions(+)
> > 
> > diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> > index 38f748e7186e..e82f3156f4e9 100644
> > --- a/include/linux/userfaultfd_k.h
> > +++ b/include/linux/userfaultfd_k.h
> > @@ -37,6 +37,8 @@ extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
> >  			      unsigned long dst_start,
> >  			      unsigned long len,
> >  			      bool *mmap_changing);
> > +extern int mwriteprotect_range(struct mm_struct *dst_mm,
> > +		unsigned long start, unsigned long len, bool enable_wp);
> > 
> >  /* mm helpers */
> >  static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
> > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> > index 458acda96f20..c38903f501c7 100644
> > --- a/mm/userfaultfd.c
> > +++ b/mm/userfaultfd.c
> > @@ -615,3 +615,55 @@ ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
> >  {
> >  	return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing);
> >  }
> > +
> > +int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
> > +	unsigned long len, bool enable_wp)
> > +{
> > +	struct vm_area_struct *dst_vma;
> > +	pgprot_t newprot;
> > +	int err;
> > +
> > +	/*
> > +	 * Sanitize the command parameters:
> > +	 */
> > +	BUG_ON(start & ~PAGE_MASK);
> > +	BUG_ON(len & ~PAGE_MASK);
> > +
> > +	/* Does the address range wrap, or is the span zero-sized? */
> > +	BUG_ON(start + len <= start);
> > +
> > +	down_read(&dst_mm->mmap_sem);
> > +
> > +	/*
> > +	 * Make sure the vma is not shared, that the dst range is
> > +	 * both valid and fully within a single existing vma.
> > +	 */
> > +	err = -EINVAL;
> 
> In non-cooperative mode, there can be a race between VM layout changes and
> mcopy_atomic [1]. I believe the same races are possible here, so can we
> please make err = -ENOENT for consistency with mcopy?

Sure.

> 
> > +	dst_vma = find_vma(dst_mm, start);
> > +	if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
> > +		goto out_unlock;
> > +	if (start < dst_vma->vm_start ||
> > +	    start + len > dst_vma->vm_end)
> > +		goto out_unlock;
> > +
> > +	if (!dst_vma->vm_userfaultfd_ctx.ctx)
> > +		goto out_unlock;
> > +	if (!userfaultfd_wp(dst_vma))
> > +		goto out_unlock;
> > +
> > +	if (!vma_is_anonymous(dst_vma))
> > +		goto out_unlock;
> 
> The sanity checks here seem to repeat those in mcopy_atomic(). I'd suggest
> splitting them out to a helper function.

It's a good suggestion.  Thanks!

> 
> > +	if (enable_wp)
> > +		newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE));
> > +	else
> > +		newprot = vm_get_page_prot(dst_vma->vm_flags);
> > +
> > +	change_protection(dst_vma, start, start + len, newprot,
> > +				!enable_wp, 0);
> > +
> > +	err = 0;
> > +out_unlock:
> > +	up_read(&dst_mm->mmap_sem);
> > +	return err;
> > +}
> > -- 
> > 2.17.1
> 
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=27d02568f529e908399514dfbee8ee43bdfd5299
> 
> -- 
> Sincerely yours,
> Mike.
>
Peter Xu Jan. 22, 2019, 9:39 a.m. UTC | #4
On Mon, Jan 21, 2019 at 09:05:35AM -0500, Jerome Glisse wrote:

[...]

> > +	change_protection(dst_vma, start, start + len, newprot,
> > +				!enable_wp, 0);
> 
> So setting dirty_accountable bring us to that code in mprotect.c:
> 
>     if (dirty_accountable && pte_dirty(ptent) &&
>             (pte_soft_dirty(ptent) ||
>              !(vma->vm_flags & VM_SOFTDIRTY))) {
>         ptent = pte_mkwrite(ptent);
>     }
> 
> My understanding is that you want to set write flag when enable_wp
> is false and you want to set the write flag unconditionaly, right ?

Right.

> 
> If so then you should really move the change_protection() flags
> patch before this patch and add a flag for setting pte write flags.
> 
> Otherwise the above is broken at it will only set the write flag
> for pte that were dirty and i am guessing so far you always were
> lucky because pte were all dirty (change_protection will preserve
> dirtyness) when you write protected them.
> 
> So i believe the above is broken or at very least unclear if what
> you really want is to only set write flag to pte that have the
> dirty flag set.

You are right: if we build the tree up to this patch, it won't work for
all the cases.  It'll only work if the page was at least writable
before and is also dirty (as you explained).  Sorry to be unclear
about this; I should at least have mentioned that in the commit message,
but I totally forgot.

All these problems are solved in later on patches, please feel free to
have a look at:

  mm: merge parameters for change_protection()
  userfaultfd: wp: apply _PAGE_UFFD_WP bit
  userfaultfd: wp: handle COW properly for uffd-wp

Note that even in the follow up patches IMHO we can't directly change
the write permission since the page can be shared by other processes
(e.g., the zero page or COW pages).  But the general idea is the same
as you explained.

I tried to avoid squashing this stuff altogether, as explained
previously.  Also, this patch can be seen as a standalone patch to
introduce the new interface, which seems to make sense too, and it
indeed still works in many cases, so I see the later patches as
enhancements of this one.  Please let me know if you still want me to
have all this stuff squashed, or if you'd like me to squash some of
it.

Thanks!
Jerome Glisse Jan. 22, 2019, 5:02 p.m. UTC | #5
On Tue, Jan 22, 2019 at 05:39:35PM +0800, Peter Xu wrote:
> On Mon, Jan 21, 2019 at 09:05:35AM -0500, Jerome Glisse wrote:
> 
> [...]
> 
> > > +	change_protection(dst_vma, start, start + len, newprot,
> > > +				!enable_wp, 0);
> > 
> > So setting dirty_accountable bring us to that code in mprotect.c:
> > 
> >     if (dirty_accountable && pte_dirty(ptent) &&
> >             (pte_soft_dirty(ptent) ||
> >              !(vma->vm_flags & VM_SOFTDIRTY))) {
> >         ptent = pte_mkwrite(ptent);
> >     }
> > 
> > My understanding is that you want to set write flag when enable_wp
> > is false and you want to set the write flag unconditionaly, right ?
> 
> Right.
> 
> > 
> > If so then you should really move the change_protection() flags
> > patch before this patch and add a flag for setting pte write flags.
> > 
> > Otherwise the above is broken at it will only set the write flag
> > for pte that were dirty and i am guessing so far you always were
> > lucky because pte were all dirty (change_protection will preserve
> > dirtyness) when you write protected them.
> > 
> > So i believe the above is broken or at very least unclear if what
> > you really want is to only set write flag to pte that have the
> > dirty flag set.
> 
> You are right, if we build the tree until this patch it won't work for
> all the cases.  It'll only work if the page was at least writable
> before and also it's dirty (as you explained).  Sorry to be unclear
> about this, maybe I should at least mention that in the commit message
> but I totally forgot it.
> 
> All these problems are solved in later on patches, please feel free to
> have a look at:
> 
>   mm: merge parameters for change_protection()
>   userfaultfd: wp: apply _PAGE_UFFD_WP bit
>   userfaultfd: wp: handle COW properly for uffd-wp
> 
> Note that even in the follow up patches IMHO we can't directly change
> the write permission since the page can be shared by other processes
> (e.g., the zero page or COW pages).  But the general idea is the same
> as you explained.
> 
> I tried to avoid squashing these stuff altogether as explained
> previously.  Also, this patch can be seen as a standalone patch to
> introduce the new interface which seems to make sense too, and it is
> indeed still working in many cases so I see the latter patches as
> enhancement of this one.  Please let me know if you still want me to
> have all these stuff squashed, or if you'd like me to squash some of
> them.

Yeah, I had a look at those after looking at this one. You should just
re-order the patches: this one first, then the one that adds the new
flag, then the ones that add the new userfaultfd feature. Otherwise you
are adding a userfaultfd feature that is broken midway, i.e., it is
added broken and then you fix it. Someone bisecting might get hurt
by that. It is better to add and change everything you need first, and
then add the new feature, so that the new feature will work as intended.

So no squashing; just change the order, i.e., add the userfaultfd code
last.

Cheers,
Jérôme
Peter Xu Jan. 23, 2019, 2:17 a.m. UTC | #6
On Tue, Jan 22, 2019 at 12:02:24PM -0500, Jerome Glisse wrote:
> On Tue, Jan 22, 2019 at 05:39:35PM +0800, Peter Xu wrote:
> > On Mon, Jan 21, 2019 at 09:05:35AM -0500, Jerome Glisse wrote:
> > 
> > [...]
> > 
> > > > +	change_protection(dst_vma, start, start + len, newprot,
> > > > +				!enable_wp, 0);
> > > 
> > > So setting dirty_accountable bring us to that code in mprotect.c:
> > > 
> > >     if (dirty_accountable && pte_dirty(ptent) &&
> > >             (pte_soft_dirty(ptent) ||
> > >              !(vma->vm_flags & VM_SOFTDIRTY))) {
> > >         ptent = pte_mkwrite(ptent);
> > >     }
> > > 
> > > My understanding is that you want to set write flag when enable_wp
> > > is false and you want to set the write flag unconditionaly, right ?
> > 
> > Right.
> > 
> > > 
> > > If so then you should really move the change_protection() flags
> > > patch before this patch and add a flag for setting pte write flags.
> > > 
> > > Otherwise the above is broken at it will only set the write flag
> > > for pte that were dirty and i am guessing so far you always were
> > > lucky because pte were all dirty (change_protection will preserve
> > > dirtyness) when you write protected them.
> > > 
> > > So i believe the above is broken or at very least unclear if what
> > > you really want is to only set write flag to pte that have the
> > > dirty flag set.
> > 
> > You are right, if we build the tree until this patch it won't work for
> > all the cases.  It'll only work if the page was at least writable
> > before and also it's dirty (as you explained).  Sorry to be unclear
> > about this, maybe I should at least mention that in the commit message
> > but I totally forgot it.
> > 
> > All these problems are solved in later on patches, please feel free to
> > have a look at:
> > 
> >   mm: merge parameters for change_protection()
> >   userfaultfd: wp: apply _PAGE_UFFD_WP bit
> >   userfaultfd: wp: handle COW properly for uffd-wp
> > 
> > Note that even in the follow up patches IMHO we can't directly change
> > the write permission since the page can be shared by other processes
> > (e.g., the zero page or COW pages).  But the general idea is the same
> > as you explained.
> > 
> > I tried to avoid squashing these stuff altogether as explained
> > previously.  Also, this patch can be seen as a standalone patch to
> > introduce the new interface which seems to make sense too, and it is
> > indeed still working in many cases so I see the latter patches as
> > enhancement of this one.  Please let me know if you still want me to
> > have all these stuff squashed, or if you'd like me to squash some of
> > them.
> 
> Yeah i have look at those after looking at this one. You should just
> re-order the patch this one first and then one that add new flag,
> then ones that add the new userfaultfd feature. Otherwise you are
> adding a userfaultfd feature that is broken midway ie it is added
> broken and then you fix it. Some one bisecting thing might get hurt
> by that. It is better to add and change everything you need and then
> add the new feature so that the new feature will work as intended.
> 
> So no squashing just change the order ie add the userfaultfd code
> last.

Yes this makes sense, I'll do that in v2.  Thanks for the suggestion!
Jerome Glisse Jan. 23, 2019, 2:43 a.m. UTC | #7
On Wed, Jan 23, 2019 at 10:17:45AM +0800, Peter Xu wrote:
> On Tue, Jan 22, 2019 at 12:02:24PM -0500, Jerome Glisse wrote:
> > On Tue, Jan 22, 2019 at 05:39:35PM +0800, Peter Xu wrote:
> > > On Mon, Jan 21, 2019 at 09:05:35AM -0500, Jerome Glisse wrote:
> > > 
> > > [...]
> > > 
> > > > > +	change_protection(dst_vma, start, start + len, newprot,
> > > > > +				!enable_wp, 0);
> > > > 
> > > > So setting dirty_accountable bring us to that code in mprotect.c:
> > > > 
> > > >     if (dirty_accountable && pte_dirty(ptent) &&
> > > >             (pte_soft_dirty(ptent) ||
> > > >              !(vma->vm_flags & VM_SOFTDIRTY))) {
> > > >         ptent = pte_mkwrite(ptent);
> > > >     }
> > > > 
> > > > My understanding is that you want to set write flag when enable_wp
> > > > is false and you want to set the write flag unconditionaly, right ?
> > > 
> > > Right.
> > > 
> > > > 
> > > > If so then you should really move the change_protection() flags
> > > > patch before this patch and add a flag for setting pte write flags.
> > > > 
> > > > Otherwise the above is broken at it will only set the write flag
> > > > for pte that were dirty and i am guessing so far you always were
> > > > lucky because pte were all dirty (change_protection will preserve
> > > > dirtyness) when you write protected them.
> > > > 
> > > > So i believe the above is broken or at very least unclear if what
> > > > you really want is to only set write flag to pte that have the
> > > > dirty flag set.
> > > 
> > > You are right, if we build the tree until this patch it won't work for
> > > all the cases.  It'll only work if the page was at least writable
> > > before and also it's dirty (as you explained).  Sorry to be unclear
> > > about this, maybe I should at least mention that in the commit message
> > > but I totally forgot it.
> > > 
> > > All these problems are solved in later on patches, please feel free to
> > > have a look at:
> > > 
> > >   mm: merge parameters for change_protection()
> > >   userfaultfd: wp: apply _PAGE_UFFD_WP bit
> > >   userfaultfd: wp: handle COW properly for uffd-wp
> > > 
> > > Note that even in the follow up patches IMHO we can't directly change
> > > the write permission since the page can be shared by other processes
> > > (e.g., the zero page or COW pages).  But the general idea is the same
> > > as you explained.
> > > 
> > > I tried to avoid squashing these stuff altogether as explained
> > > previously.  Also, this patch can be seen as a standalone patch to
> > > introduce the new interface which seems to make sense too, and it is
> > > indeed still working in many cases so I see the latter patches as
> > > enhancement of this one.  Please let me know if you still want me to
> > > have all these stuff squashed, or if you'd like me to squash some of
> > > them.
> > 
> > Yeah i have look at those after looking at this one. You should just
> > re-order the patch this one first and then one that add new flag,
> > then ones that add the new userfaultfd feature. Otherwise you are
> > adding a userfaultfd feature that is broken midway ie it is added
> > broken and then you fix it. Some one bisecting thing might get hurt
> > by that. It is better to add and change everything you need and then
> > add the new feature so that the new feature will work as intended.
> > 
> > So no squashing just change the order ie add the userfaultfd code
> > last.
> 
> Yes this makes sense, I'll do that in v2.  Thanks for the suggestion!

Note: before doing a v2, I would really like to see some proof of why
you need a new page table flag; see my reply to:
    userfaultfd: wp: add WP pagetable tracking to x86

As I believe you can identify COW or KSM from UFD write protect with-
out a pte flag.

Cheers,
Jérôme
Peter Xu Jan. 24, 2019, 5:47 a.m. UTC | #8
On Tue, Jan 22, 2019 at 09:43:38PM -0500, Jerome Glisse wrote:
> On Wed, Jan 23, 2019 at 10:17:45AM +0800, Peter Xu wrote:
> > On Tue, Jan 22, 2019 at 12:02:24PM -0500, Jerome Glisse wrote:
> > > On Tue, Jan 22, 2019 at 05:39:35PM +0800, Peter Xu wrote:
> > > > On Mon, Jan 21, 2019 at 09:05:35AM -0500, Jerome Glisse wrote:
> > > > 
> > > > [...]
> > > > 
> > > > > > +	change_protection(dst_vma, start, start + len, newprot,
> > > > > > +				!enable_wp, 0);
> > > > > 
> > > > > So setting dirty_accountable bring us to that code in mprotect.c:
> > > > > 
> > > > >     if (dirty_accountable && pte_dirty(ptent) &&
> > > > >             (pte_soft_dirty(ptent) ||
> > > > >              !(vma->vm_flags & VM_SOFTDIRTY))) {
> > > > >         ptent = pte_mkwrite(ptent);
> > > > >     }
> > > > > 
> > > > > My understanding is that you want to set write flag when enable_wp
> > > > > is false and you want to set the write flag unconditionaly, right ?
> > > > 
> > > > Right.
> > > > 
> > > > > 
> > > > > If so then you should really move the change_protection() flags
> > > > > patch before this patch and add a flag for setting pte write flags.
> > > > > 
> > > > > Otherwise the above is broken at it will only set the write flag
> > > > > for pte that were dirty and i am guessing so far you always were
> > > > > lucky because pte were all dirty (change_protection will preserve
> > > > > dirtyness) when you write protected them.
> > > > > 
> > > > > So i believe the above is broken or at very least unclear if what
> > > > > you really want is to only set write flag to pte that have the
> > > > > dirty flag set.
> > > > 
> > > > You are right, if we build the tree until this patch it won't work for
> > > > all the cases.  It'll only work if the page was at least writable
> > > > before and also it's dirty (as you explained).  Sorry to be unclear
> > > > about this, maybe I should at least mention that in the commit message
> > > > but I totally forgot it.
> > > > 
> > > > All these problems are solved in later on patches, please feel free to
> > > > have a look at:
> > > > 
> > > >   mm: merge parameters for change_protection()
> > > >   userfaultfd: wp: apply _PAGE_UFFD_WP bit
> > > >   userfaultfd: wp: handle COW properly for uffd-wp
> > > > 
> > > > Note that even in the follow up patches IMHO we can't directly change
> > > > the write permission since the page can be shared by other processes
> > > > (e.g., the zero page or COW pages).  But the general idea is the same
> > > > as you explained.
> > > > 
> > > > I tried to avoid squashing these stuff altogether as explained
> > > > previously.  Also, this patch can be seen as a standalone patch to
> > > > introduce the new interface which seems to make sense too, and it is
> > > > indeed still working in many cases so I see the latter patches as
> > > > enhancement of this one.  Please let me know if you still want me to
> > > > have all these stuff squashed, or if you'd like me to squash some of
> > > > them.
> > > 
> > > Yeah i have look at those after looking at this one. You should just
> > > re-order the patch this one first and then one that add new flag,
> > > then ones that add the new userfaultfd feature. Otherwise you are
> > > adding a userfaultfd feature that is broken midway ie it is added
> > > broken and then you fix it. Some one bisecting thing might get hurt
> > > by that. It is better to add and change everything you need and then
> > > add the new feature so that the new feature will work as intended.
> > > 
> > > So no squashing just change the order ie add the userfaultfd code
> > > last.
> > 
> > Yes this makes sense, I'll do that in v2.  Thanks for the suggestion!
> 
> Note before doing a v2 i would really like to see some proof of why
> you need new page table flag see my reply to:
>     userfaultfd: wp: add WP pagetable tracking to x86
> 
> As i believe you can identify COW or KSM from UFD write protect with-
> out a pte flag.

Yes.  I replied in that thread with my understanding on why the new
bit is required in the PTE (and also another new bit in the swap
entry).  We can discuss there.

Thanks,
diff mbox series

Patch

diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 38f748e7186e..e82f3156f4e9 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -37,6 +37,8 @@  extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
 			      unsigned long dst_start,
 			      unsigned long len,
 			      bool *mmap_changing);
+extern int mwriteprotect_range(struct mm_struct *dst_mm,
+		unsigned long start, unsigned long len, bool enable_wp);
 
 /* mm helpers */
 static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 458acda96f20..c38903f501c7 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -615,3 +615,55 @@  ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
 {
 	return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing);
 }
+
+int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
+	unsigned long len, bool enable_wp)
+{
+	struct vm_area_struct *dst_vma;
+	pgprot_t newprot;
+	int err;
+
+	/*
+	 * Sanitize the command parameters:
+	 */
+	BUG_ON(start & ~PAGE_MASK);
+	BUG_ON(len & ~PAGE_MASK);
+
+	/* Does the address range wrap, or is the span zero-sized? */
+	BUG_ON(start + len <= start);
+
+	down_read(&dst_mm->mmap_sem);
+
+	/*
+	 * Make sure the vma is not shared, that the dst range is
+	 * both valid and fully within a single existing vma.
+	 */
+	err = -EINVAL;
+	dst_vma = find_vma(dst_mm, start);
+	if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
+		goto out_unlock;
+	if (start < dst_vma->vm_start ||
+	    start + len > dst_vma->vm_end)
+		goto out_unlock;
+
+	if (!dst_vma->vm_userfaultfd_ctx.ctx)
+		goto out_unlock;
+	if (!userfaultfd_wp(dst_vma))
+		goto out_unlock;
+
+	if (!vma_is_anonymous(dst_vma))
+		goto out_unlock;
+
+	if (enable_wp)
+		newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE));
+	else
+		newprot = vm_get_page_prot(dst_vma->vm_flags);
+
+	change_protection(dst_vma, start, start + len, newprot,
+				!enable_wp, 0);
+
+	err = 0;
+out_unlock:
+	up_read(&dst_mm->mmap_sem);
+	return err;
+}