diff mbox series

[v2,21/26] userfaultfd: wp: add the writeprotect API to userfaultfd ioctl

Message ID 20190212025632.28946-22-peterx@redhat.com (mailing list archive)
State New, archived
Headers show
Series userfaultfd: write protection support | expand

Commit Message

Peter Xu Feb. 12, 2019, 2:56 a.m. UTC
From: Andrea Arcangeli <aarcange@redhat.com>

v1: From: Shaohua Li <shli@fb.com>

v2: cleanups, remove a branch.

[peterx writes up the commit message, as below...]

This patch introduces the new uffd-wp APIs for userspace.

Firstly, we'll allow UFFDIO_REGISTER to be done with write protection
tracking using the new UFFDIO_REGISTER_MODE_WP flag.  Note that this
flag can co-exist with the existing UFFDIO_REGISTER_MODE_MISSING, in
which case the userspace program can not only resolve missing page
faults, but also track page data changes along the way.

Secondly, we introduce the new UFFDIO_WRITEPROTECT API to do page
level write protection tracking.  Note that the memory region needs
to be registered with UFFDIO_REGISTER_MODE_WP before that.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
[peterx: remove useless block, write commit message, check against
 VM_MAYWRITE rather than VM_WRITE when register]
Signed-off-by: Peter Xu <peterx@redhat.com>
---
 fs/userfaultfd.c                 | 82 +++++++++++++++++++++++++-------
 include/uapi/linux/userfaultfd.h | 11 +++++
 2 files changed, 77 insertions(+), 16 deletions(-)

Comments

Jerome Glisse Feb. 21, 2019, 6:28 p.m. UTC | #1
On Tue, Feb 12, 2019 at 10:56:27AM +0800, Peter Xu wrote:
> From: Andrea Arcangeli <aarcange@redhat.com>
> 
> v1: From: Shaohua Li <shli@fb.com>
> 
> v2: cleanups, remove a branch.
> 
> [peterx writes up the commit message, as below...]
> 
> This patch introduces the new uffd-wp APIs for userspace.
> 
> Firstly, we'll allow to do UFFDIO_REGISTER with write protection
> tracking using the new UFFDIO_REGISTER_MODE_WP flag.  Note that this
> flag can co-exist with the existing UFFDIO_REGISTER_MODE_MISSING, in
> which case the userspace program can not only resolve missing page
> faults, and at the same time tracking page data changes along the way.
> 
> Secondly, we introduced the new UFFDIO_WRITEPROTECT API to do page
> level write protection tracking.  Note that we will need to register
> the memory region with UFFDIO_REGISTER_MODE_WP before that.
> 
> Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
> [peterx: remove useless block, write commit message, check against
>  VM_MAYWRITE rather than VM_WRITE when register]
> Signed-off-by: Peter Xu <peterx@redhat.com>

I am not an expert with userfaultfd code but it looks good to me so:

Also see my question down below, just a minor one.

Reviewed-by: Jérôme Glisse <jglisse@redhat.com>

> ---
>  fs/userfaultfd.c                 | 82 +++++++++++++++++++++++++-------
>  include/uapi/linux/userfaultfd.h | 11 +++++
>  2 files changed, 77 insertions(+), 16 deletions(-)
> 

[...]

> diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
> index 297cb044c03f..1b977a7a4435 100644
> --- a/include/uapi/linux/userfaultfd.h
> +++ b/include/uapi/linux/userfaultfd.h
> @@ -52,6 +52,7 @@
>  #define _UFFDIO_WAKE			(0x02)
>  #define _UFFDIO_COPY			(0x03)
>  #define _UFFDIO_ZEROPAGE		(0x04)
> +#define _UFFDIO_WRITEPROTECT		(0x06)
>  #define _UFFDIO_API			(0x3F)

What did happen to ioctl 0x05 ? :)
Peter Xu Feb. 25, 2019, 8:31 a.m. UTC | #2
On Thu, Feb 21, 2019 at 01:28:25PM -0500, Jerome Glisse wrote:
> On Tue, Feb 12, 2019 at 10:56:27AM +0800, Peter Xu wrote:
> > From: Andrea Arcangeli <aarcange@redhat.com>
> > 
> > v1: From: Shaohua Li <shli@fb.com>
> > 
> > v2: cleanups, remove a branch.
> > 
> > [peterx writes up the commit message, as below...]
> > 
> > This patch introduces the new uffd-wp APIs for userspace.
> > 
> > Firstly, we'll allow to do UFFDIO_REGISTER with write protection
> > tracking using the new UFFDIO_REGISTER_MODE_WP flag.  Note that this
> > flag can co-exist with the existing UFFDIO_REGISTER_MODE_MISSING, in
> > which case the userspace program can not only resolve missing page
> > faults, and at the same time tracking page data changes along the way.
> > 
> > Secondly, we introduced the new UFFDIO_WRITEPROTECT API to do page
> > level write protection tracking.  Note that we will need to register
> > the memory region with UFFDIO_REGISTER_MODE_WP before that.
> > 
> > Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
> > [peterx: remove useless block, write commit message, check against
> >  VM_MAYWRITE rather than VM_WRITE when register]
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> 
> I am not an expert with userfaultfd code but it looks good to me so:
> 
> Also see my question down below, just a minor one.
> 
> Reviewed-by: Jérôme Glisse <jglisse@redhat.com>
> 
> > ---
> >  fs/userfaultfd.c                 | 82 +++++++++++++++++++++++++-------
> >  include/uapi/linux/userfaultfd.h | 11 +++++
> >  2 files changed, 77 insertions(+), 16 deletions(-)
> > 
> 
> [...]
> 
> > diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
> > index 297cb044c03f..1b977a7a4435 100644
> > --- a/include/uapi/linux/userfaultfd.h
> > +++ b/include/uapi/linux/userfaultfd.h
> > @@ -52,6 +52,7 @@
> >  #define _UFFDIO_WAKE			(0x02)
> >  #define _UFFDIO_COPY			(0x03)
> >  #define _UFFDIO_ZEROPAGE		(0x04)
> > +#define _UFFDIO_WRITEPROTECT		(0x06)
> >  #define _UFFDIO_API			(0x3F)
> 
> What did happen to ioctl 0x05 ? :)

It's simply because it was 0x06 in Andrea's tree. :-)

https://git.kernel.org/pub/scm/linux/kernel/git/andrea/aa.git/commit/?h=userfault&id=ad0c3bec9897d8c8617ecaeb3110d3bdf884b15c

Andrea introduced _UFFDIO_REMAP first in his original work, which took
0x05 (hmm... not really the "very" original, but the one after
Shaohua's work), and then _UFFDIO_WRITEPROTECT, which took 0x06.  I'm
afraid there are already userspace programs that have been built
against that tree and its numbers (I believe LLNL and umap is one of
them; people may not have started to use it very seriously, but those
programs can still be distributed and start doing some real work...).
I'm using the same number here, considering that it might be good to
simply not break any of the experimental programs if that's easy to
achieve (for existing uffd-wp users, but also for users of the new
remap interface if there are any); after all, these numbers are really
ad hoc for us.  If anyone doesn't like this I can for sure switch to
0x05 again if that looks cuter.

Thanks,
Mike Rapoport Feb. 25, 2019, 9:03 p.m. UTC | #3
On Tue, Feb 12, 2019 at 10:56:27AM +0800, Peter Xu wrote:
> From: Andrea Arcangeli <aarcange@redhat.com>
> 
> v1: From: Shaohua Li <shli@fb.com>
> 
> v2: cleanups, remove a branch.
> 
> [peterx writes up the commit message, as below...]
> 
> This patch introduces the new uffd-wp APIs for userspace.
> 
> Firstly, we'll allow to do UFFDIO_REGISTER with write protection
> tracking using the new UFFDIO_REGISTER_MODE_WP flag.  Note that this
> flag can co-exist with the existing UFFDIO_REGISTER_MODE_MISSING, in
> which case the userspace program can not only resolve missing page
> faults, and at the same time tracking page data changes along the way.
> 
> Secondly, we introduced the new UFFDIO_WRITEPROTECT API to do page
> level write protection tracking.  Note that we will need to register
> the memory region with UFFDIO_REGISTER_MODE_WP before that.
> 
> Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
> [peterx: remove useless block, write commit message, check against
>  VM_MAYWRITE rather than VM_WRITE when register]
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  fs/userfaultfd.c                 | 82 +++++++++++++++++++++++++-------
>  include/uapi/linux/userfaultfd.h | 11 +++++
>  2 files changed, 77 insertions(+), 16 deletions(-)
> 
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 3092885c9d2c..81962d62520c 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -304,8 +304,11 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
>  	if (!pmd_present(_pmd))
>  		goto out;
> 
> -	if (pmd_trans_huge(_pmd))
> +	if (pmd_trans_huge(_pmd)) {
> +		if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
> +			ret = true;
>  		goto out;
> +	}
> 
>  	/*
>  	 * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
> @@ -318,6 +321,8 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
>  	 */
>  	if (pte_none(*pte))
>  		ret = true;
> +	if (!pte_write(*pte) && (reason & VM_UFFD_WP))
> +		ret = true;
>  	pte_unmap(pte);
> 
>  out:
> @@ -1251,10 +1256,13 @@ static __always_inline int validate_range(struct mm_struct *mm,
>  	return 0;
>  }
> 
> -static inline bool vma_can_userfault(struct vm_area_struct *vma)
> +static inline bool vma_can_userfault(struct vm_area_struct *vma,
> +				     unsigned long vm_flags)
>  {
> -	return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
> -		vma_is_shmem(vma);
> +	/* FIXME: add WP support to hugetlbfs and shmem */
> +	return vma_is_anonymous(vma) ||
> +		((is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) &&
> +		 !(vm_flags & VM_UFFD_WP));
>  }
> 
>  static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> @@ -1286,15 +1294,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
>  	vm_flags = 0;
>  	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
>  		vm_flags |= VM_UFFD_MISSING;
> -	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
> +	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)
>  		vm_flags |= VM_UFFD_WP;
> -		/*
> -		 * FIXME: remove the below error constraint by
> -		 * implementing the wprotect tracking mode.
> -		 */
> -		ret = -EINVAL;
> -		goto out;
> -	}
> 
>  	ret = validate_range(mm, uffdio_register.range.start,
>  			     uffdio_register.range.len);
> @@ -1342,7 +1343,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> 
>  		/* check not compatible vmas */
>  		ret = -EINVAL;
> -		if (!vma_can_userfault(cur))
> +		if (!vma_can_userfault(cur, vm_flags))
>  			goto out_unlock;
> 
>  		/*
> @@ -1370,6 +1371,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
>  			if (end & (vma_hpagesize - 1))
>  				goto out_unlock;
>  		}
> +		if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
> +			goto out_unlock;
> 
>  		/*
>  		 * Check that this vma isn't already owned by a
> @@ -1399,7 +1402,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
>  	do {
>  		cond_resched();
> 
> -		BUG_ON(!vma_can_userfault(vma));
> +		BUG_ON(!vma_can_userfault(vma, vm_flags));
>  		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
>  		       vma->vm_userfaultfd_ctx.ctx != ctx);
>  		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
> @@ -1534,7 +1537,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
>  		 * provides for more strict behavior to notice
>  		 * unregistration errors.
>  		 */
> -		if (!vma_can_userfault(cur))
> +		if (!vma_can_userfault(cur, cur->vm_flags))
>  			goto out_unlock;
> 
>  		found = true;
> @@ -1548,7 +1551,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
>  	do {
>  		cond_resched();
> 
> -		BUG_ON(!vma_can_userfault(vma));
> +		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
> 
>  		/*
>  		 * Nothing to do: this vma is already registered into this
> @@ -1761,6 +1764,50 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
>  	return ret;
>  }
> 
> +static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
> +				    unsigned long arg)
> +{
> +	int ret;
> +	struct uffdio_writeprotect uffdio_wp;
> +	struct uffdio_writeprotect __user *user_uffdio_wp;
> +	struct userfaultfd_wake_range range;
> +
> +	if (READ_ONCE(ctx->mmap_changing))
> +		return -EAGAIN;
> +
> +	user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
> +
> +	if (copy_from_user(&uffdio_wp, user_uffdio_wp,
> +			   sizeof(struct uffdio_writeprotect)))
> +		return -EFAULT;
> +
> +	ret = validate_range(ctx->mm, uffdio_wp.range.start,
> +			     uffdio_wp.range.len);
> +	if (ret)
> +		return ret;
> +
> +	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
> +			       UFFDIO_WRITEPROTECT_MODE_WP))
> +		return -EINVAL;
> +	if ((uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP) &&
> +	     (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE))
> +		return -EINVAL;

Why can't _DONTWAKE be used when setting write-protection?
I can imagine a use-case where you'd want to freeze an application,
write-protect several regions and then let the application continue.

> +
> +	ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
> +				  uffdio_wp.range.len, uffdio_wp.mode &
> +				  UFFDIO_WRITEPROTECT_MODE_WP,
> +				  &ctx->mmap_changing);
> +	if (ret)
> +		return ret;
> +
> +	if (!(uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE)) {
> +		range.start = uffdio_wp.range.start;
> +		range.len = uffdio_wp.range.len;
> +		wake_userfault(ctx, &range);
> +	}
> +	return ret;
> +}
> +
>  static inline unsigned int uffd_ctx_features(__u64 user_features)
>  {
>  	/*
> @@ -1838,6 +1885,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
>  	case UFFDIO_ZEROPAGE:
>  		ret = userfaultfd_zeropage(ctx, arg);
>  		break;
> +	case UFFDIO_WRITEPROTECT:
> +		ret = userfaultfd_writeprotect(ctx, arg);
> +		break;
>  	}
>  	return ret;
>  }
> diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
> index 297cb044c03f..1b977a7a4435 100644
> --- a/include/uapi/linux/userfaultfd.h
> +++ b/include/uapi/linux/userfaultfd.h
> @@ -52,6 +52,7 @@
>  #define _UFFDIO_WAKE			(0x02)
>  #define _UFFDIO_COPY			(0x03)
>  #define _UFFDIO_ZEROPAGE		(0x04)
> +#define _UFFDIO_WRITEPROTECT		(0x06)
>  #define _UFFDIO_API			(0x3F)
> 
>  /* userfaultfd ioctl ids */
> @@ -68,6 +69,8 @@
>  				      struct uffdio_copy)
>  #define UFFDIO_ZEROPAGE		_IOWR(UFFDIO, _UFFDIO_ZEROPAGE,	\
>  				      struct uffdio_zeropage)
> +#define UFFDIO_WRITEPROTECT	_IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \
> +				      struct uffdio_writeprotect)
> 
>  /* read() structure */
>  struct uffd_msg {
> @@ -232,4 +235,12 @@ struct uffdio_zeropage {
>  	__s64 zeropage;
>  };
> 
> +struct uffdio_writeprotect {
> +	struct uffdio_range range;
> +	/* !WP means undo writeprotect. DONTWAKE is valid only with !WP */
> +#define UFFDIO_WRITEPROTECT_MODE_WP		((__u64)1<<0)
> +#define UFFDIO_WRITEPROTECT_MODE_DONTWAKE	((__u64)1<<1)
> +	__u64 mode;
> +};
> +
>  #endif /* _LINUX_USERFAULTFD_H */
> -- 
> 2.17.1
>
Peter Xu Feb. 26, 2019, 6:30 a.m. UTC | #4
On Mon, Feb 25, 2019 at 11:03:51PM +0200, Mike Rapoport wrote:
> On Tue, Feb 12, 2019 at 10:56:27AM +0800, Peter Xu wrote:
> > From: Andrea Arcangeli <aarcange@redhat.com>
> > 
> > v1: From: Shaohua Li <shli@fb.com>
> > 
> > v2: cleanups, remove a branch.
> > 
> > [peterx writes up the commit message, as below...]
> > 
> > This patch introduces the new uffd-wp APIs for userspace.
> > 
> > Firstly, we'll allow to do UFFDIO_REGISTER with write protection
> > tracking using the new UFFDIO_REGISTER_MODE_WP flag.  Note that this
> > flag can co-exist with the existing UFFDIO_REGISTER_MODE_MISSING, in
> > which case the userspace program can not only resolve missing page
> > faults, and at the same time tracking page data changes along the way.
> > 
> > Secondly, we introduced the new UFFDIO_WRITEPROTECT API to do page
> > level write protection tracking.  Note that we will need to register
> > the memory region with UFFDIO_REGISTER_MODE_WP before that.
> > 
> > Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
> > [peterx: remove useless block, write commit message, check against
> >  VM_MAYWRITE rather than VM_WRITE when register]
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> >  fs/userfaultfd.c                 | 82 +++++++++++++++++++++++++-------
> >  include/uapi/linux/userfaultfd.h | 11 +++++
> >  2 files changed, 77 insertions(+), 16 deletions(-)
> > 
> > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > index 3092885c9d2c..81962d62520c 100644
> > --- a/fs/userfaultfd.c
> > +++ b/fs/userfaultfd.c
> > @@ -304,8 +304,11 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
> >  	if (!pmd_present(_pmd))
> >  		goto out;
> > 
> > -	if (pmd_trans_huge(_pmd))
> > +	if (pmd_trans_huge(_pmd)) {
> > +		if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
> > +			ret = true;
> >  		goto out;
> > +	}
> > 
> >  	/*
> >  	 * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
> > @@ -318,6 +321,8 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
> >  	 */
> >  	if (pte_none(*pte))
> >  		ret = true;
> > +	if (!pte_write(*pte) && (reason & VM_UFFD_WP))
> > +		ret = true;
> >  	pte_unmap(pte);
> > 
> >  out:
> > @@ -1251,10 +1256,13 @@ static __always_inline int validate_range(struct mm_struct *mm,
> >  	return 0;
> >  }
> > 
> > -static inline bool vma_can_userfault(struct vm_area_struct *vma)
> > +static inline bool vma_can_userfault(struct vm_area_struct *vma,
> > +				     unsigned long vm_flags)
> >  {
> > -	return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
> > -		vma_is_shmem(vma);
> > +	/* FIXME: add WP support to hugetlbfs and shmem */
> > +	return vma_is_anonymous(vma) ||
> > +		((is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) &&
> > +		 !(vm_flags & VM_UFFD_WP));
> >  }
> > 
> >  static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > @@ -1286,15 +1294,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> >  	vm_flags = 0;
> >  	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
> >  		vm_flags |= VM_UFFD_MISSING;
> > -	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
> > +	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)
> >  		vm_flags |= VM_UFFD_WP;
> > -		/*
> > -		 * FIXME: remove the below error constraint by
> > -		 * implementing the wprotect tracking mode.
> > -		 */
> > -		ret = -EINVAL;
> > -		goto out;
> > -	}
> > 
> >  	ret = validate_range(mm, uffdio_register.range.start,
> >  			     uffdio_register.range.len);
> > @@ -1342,7 +1343,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > 
> >  		/* check not compatible vmas */
> >  		ret = -EINVAL;
> > -		if (!vma_can_userfault(cur))
> > +		if (!vma_can_userfault(cur, vm_flags))
> >  			goto out_unlock;
> > 
> >  		/*
> > @@ -1370,6 +1371,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> >  			if (end & (vma_hpagesize - 1))
> >  				goto out_unlock;
> >  		}
> > +		if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
> > +			goto out_unlock;
> > 
> >  		/*
> >  		 * Check that this vma isn't already owned by a
> > @@ -1399,7 +1402,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> >  	do {
> >  		cond_resched();
> > 
> > -		BUG_ON(!vma_can_userfault(vma));
> > +		BUG_ON(!vma_can_userfault(vma, vm_flags));
> >  		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
> >  		       vma->vm_userfaultfd_ctx.ctx != ctx);
> >  		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
> > @@ -1534,7 +1537,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> >  		 * provides for more strict behavior to notice
> >  		 * unregistration errors.
> >  		 */
> > -		if (!vma_can_userfault(cur))
> > +		if (!vma_can_userfault(cur, cur->vm_flags))
> >  			goto out_unlock;
> > 
> >  		found = true;
> > @@ -1548,7 +1551,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> >  	do {
> >  		cond_resched();
> > 
> > -		BUG_ON(!vma_can_userfault(vma));
> > +		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
> > 
> >  		/*
> >  		 * Nothing to do: this vma is already registered into this
> > @@ -1761,6 +1764,50 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
> >  	return ret;
> >  }
> > 
> > +static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
> > +				    unsigned long arg)
> > +{
> > +	int ret;
> > +	struct uffdio_writeprotect uffdio_wp;
> > +	struct uffdio_writeprotect __user *user_uffdio_wp;
> > +	struct userfaultfd_wake_range range;
> > +
> > +	if (READ_ONCE(ctx->mmap_changing))
> > +		return -EAGAIN;
> > +
> > +	user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
> > +
> > +	if (copy_from_user(&uffdio_wp, user_uffdio_wp,
> > +			   sizeof(struct uffdio_writeprotect)))
> > +		return -EFAULT;
> > +
> > +	ret = validate_range(ctx->mm, uffdio_wp.range.start,
> > +			     uffdio_wp.range.len);
> > +	if (ret)
> > +		return ret;
> > +
> > +	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
> > +			       UFFDIO_WRITEPROTECT_MODE_WP))
> > +		return -EINVAL;
> > +	if ((uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP) &&
> > +	     (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE))
> > +		return -EINVAL;
> 
> Why _DONTWAKE cannot be used when setting write-protection?
> I can imagine a use-case when you'd want to freeze an application,
> write-protect several regions and then let the application continue.

This is the same question as the one in the other thread, where I've
written a longer reply; I hope that makes it a bit clearer (sorry for
the confusion in any case!).  I would be more than glad to know if
there is any smarter way to define/rename/... the flags.

Thanks!
diff mbox series

Patch

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 3092885c9d2c..81962d62520c 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -304,8 +304,11 @@  static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
 	if (!pmd_present(_pmd))
 		goto out;
 
-	if (pmd_trans_huge(_pmd))
+	if (pmd_trans_huge(_pmd)) {
+		if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
+			ret = true;
 		goto out;
+	}
 
 	/*
 	 * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
@@ -318,6 +321,8 @@  static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
 	 */
 	if (pte_none(*pte))
 		ret = true;
+	if (!pte_write(*pte) && (reason & VM_UFFD_WP))
+		ret = true;
 	pte_unmap(pte);
 
 out:
@@ -1251,10 +1256,13 @@  static __always_inline int validate_range(struct mm_struct *mm,
 	return 0;
 }
 
-static inline bool vma_can_userfault(struct vm_area_struct *vma)
+static inline bool vma_can_userfault(struct vm_area_struct *vma,
+				     unsigned long vm_flags)
 {
-	return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
-		vma_is_shmem(vma);
+	/* FIXME: add WP support to hugetlbfs and shmem */
+	return vma_is_anonymous(vma) ||
+		((is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) &&
+		 !(vm_flags & VM_UFFD_WP));
 }
 
 static int userfaultfd_register(struct userfaultfd_ctx *ctx,
@@ -1286,15 +1294,8 @@  static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 	vm_flags = 0;
 	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
 		vm_flags |= VM_UFFD_MISSING;
-	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
+	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)
 		vm_flags |= VM_UFFD_WP;
-		/*
-		 * FIXME: remove the below error constraint by
-		 * implementing the wprotect tracking mode.
-		 */
-		ret = -EINVAL;
-		goto out;
-	}
 
 	ret = validate_range(mm, uffdio_register.range.start,
 			     uffdio_register.range.len);
@@ -1342,7 +1343,7 @@  static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 
 		/* check not compatible vmas */
 		ret = -EINVAL;
-		if (!vma_can_userfault(cur))
+		if (!vma_can_userfault(cur, vm_flags))
 			goto out_unlock;
 
 		/*
@@ -1370,6 +1371,8 @@  static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 			if (end & (vma_hpagesize - 1))
 				goto out_unlock;
 		}
+		if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
+			goto out_unlock;
 
 		/*
 		 * Check that this vma isn't already owned by a
@@ -1399,7 +1402,7 @@  static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 	do {
 		cond_resched();
 
-		BUG_ON(!vma_can_userfault(vma));
+		BUG_ON(!vma_can_userfault(vma, vm_flags));
 		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
 		       vma->vm_userfaultfd_ctx.ctx != ctx);
 		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
@@ -1534,7 +1537,7 @@  static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 		 * provides for more strict behavior to notice
 		 * unregistration errors.
 		 */
-		if (!vma_can_userfault(cur))
+		if (!vma_can_userfault(cur, cur->vm_flags))
 			goto out_unlock;
 
 		found = true;
@@ -1548,7 +1551,7 @@  static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 	do {
 		cond_resched();
 
-		BUG_ON(!vma_can_userfault(vma));
+		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
 
 		/*
 		 * Nothing to do: this vma is already registered into this
@@ -1761,6 +1764,50 @@  static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
 	return ret;
 }
 
+static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
+				    unsigned long arg)
+{
+	int ret;
+	struct uffdio_writeprotect uffdio_wp;
+	struct uffdio_writeprotect __user *user_uffdio_wp;
+	struct userfaultfd_wake_range range;
+
+	if (READ_ONCE(ctx->mmap_changing))
+		return -EAGAIN;
+
+	user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
+
+	if (copy_from_user(&uffdio_wp, user_uffdio_wp,
+			   sizeof(struct uffdio_writeprotect)))
+		return -EFAULT;
+
+	ret = validate_range(ctx->mm, uffdio_wp.range.start,
+			     uffdio_wp.range.len);
+	if (ret)
+		return ret;
+
+	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
+			       UFFDIO_WRITEPROTECT_MODE_WP))
+		return -EINVAL;
+	if ((uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP) &&
+	     (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE))
+		return -EINVAL;
+
+	ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
+				  uffdio_wp.range.len, uffdio_wp.mode &
+				  UFFDIO_WRITEPROTECT_MODE_WP,
+				  &ctx->mmap_changing);
+	if (ret)
+		return ret;
+
+	if (!(uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE)) {
+		range.start = uffdio_wp.range.start;
+		range.len = uffdio_wp.range.len;
+		wake_userfault(ctx, &range);
+	}
+	return ret;
+}
+
 static inline unsigned int uffd_ctx_features(__u64 user_features)
 {
 	/*
@@ -1838,6 +1885,9 @@  static long userfaultfd_ioctl(struct file *file, unsigned cmd,
 	case UFFDIO_ZEROPAGE:
 		ret = userfaultfd_zeropage(ctx, arg);
 		break;
+	case UFFDIO_WRITEPROTECT:
+		ret = userfaultfd_writeprotect(ctx, arg);
+		break;
 	}
 	return ret;
 }
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 297cb044c03f..1b977a7a4435 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -52,6 +52,7 @@ 
 #define _UFFDIO_WAKE			(0x02)
 #define _UFFDIO_COPY			(0x03)
 #define _UFFDIO_ZEROPAGE		(0x04)
+#define _UFFDIO_WRITEPROTECT		(0x06)
 #define _UFFDIO_API			(0x3F)
 
 /* userfaultfd ioctl ids */
@@ -68,6 +69,8 @@ 
 				      struct uffdio_copy)
 #define UFFDIO_ZEROPAGE		_IOWR(UFFDIO, _UFFDIO_ZEROPAGE,	\
 				      struct uffdio_zeropage)
+#define UFFDIO_WRITEPROTECT	_IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \
+				      struct uffdio_writeprotect)
 
 /* read() structure */
 struct uffd_msg {
@@ -232,4 +235,12 @@  struct uffdio_zeropage {
 	__s64 zeropage;
 };
 
+struct uffdio_writeprotect {
+	struct uffdio_range range;
+	/* !WP means undo writeprotect. DONTWAKE is valid only with !WP */
+#define UFFDIO_WRITEPROTECT_MODE_WP		((__u64)1<<0)
+#define UFFDIO_WRITEPROTECT_MODE_DONTWAKE	((__u64)1<<1)
+	__u64 mode;
+};
+
 #endif /* _LINUX_USERFAULTFD_H */