diff mbox series

[RFC,07/24] userfaultfd: wp: add the writeprotect API to userfaultfd ioctl

Message ID 20190121075722.7945-8-peterx@redhat.com (mailing list archive)
State New, archived
Headers show
Series userfaultfd: write protection support | expand

Commit Message

Peter Xu Jan. 21, 2019, 7:57 a.m. UTC
From: Andrea Arcangeli <aarcange@redhat.com>

v1: From: Shaohua Li <shli@fb.com>

v2: cleanups, remove a branch.

[peterx writes up the commit message, as below...]

This patch introduces the new uffd-wp APIs for userspace.

Firstly, we'll allow to do UFFDIO_REGISTER with write protection
tracking using the new UFFDIO_REGISTER_MODE_WP flag.  Note that this
flag can co-exist with the existing UFFDIO_REGISTER_MODE_MISSING, in
which case the userspace program can not only resolve missing page
faults, and at the same time tracking page data changes along the way.

Secondly, we introduced the new UFFDIO_WRITEPROTECT API to do page
level write protection tracking.  Note that we will need to register
the memory region with UFFDIO_REGISTER_MODE_WP before that.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
[peterx: remove useless block, write commit message]
Signed-off-by: Peter Xu <peterx@redhat.com>
---
 fs/userfaultfd.c                 | 78 +++++++++++++++++++++++++-------
 include/uapi/linux/userfaultfd.h | 11 +++++
 2 files changed, 73 insertions(+), 16 deletions(-)

Comments

Mike Rapoport Jan. 21, 2019, 10:42 a.m. UTC | #1
On Mon, Jan 21, 2019 at 03:57:05PM +0800, Peter Xu wrote:
> From: Andrea Arcangeli <aarcange@redhat.com>
> 
> v1: From: Shaohua Li <shli@fb.com>
> 
> v2: cleanups, remove a branch.
> 
> [peterx writes up the commit message, as below...]
> 
> This patch introduces the new uffd-wp APIs for userspace.
> 
> Firstly, we'll allow to do UFFDIO_REGISTER with write protection
> tracking using the new UFFDIO_REGISTER_MODE_WP flag.  Note that this
> flag can co-exist with the existing UFFDIO_REGISTER_MODE_MISSING, in
> which case the userspace program can not only resolve missing page
> faults, and at the same time tracking page data changes along the way.
> 
> Secondly, we introduced the new UFFDIO_WRITEPROTECT API to do page
> level write protection tracking.  Note that we will need to register
> the memory region with UFFDIO_REGISTER_MODE_WP before that.
> 
> Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
> [peterx: remove useless block, write commit message]
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  fs/userfaultfd.c                 | 78 +++++++++++++++++++++++++-------
>  include/uapi/linux/userfaultfd.h | 11 +++++
>  2 files changed, 73 insertions(+), 16 deletions(-)
> 
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index bc9f6230a3f0..6ff8773d6797 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -305,8 +305,11 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
>  	if (!pmd_present(_pmd))
>  		goto out;
> 
> -	if (pmd_trans_huge(_pmd))
> +	if (pmd_trans_huge(_pmd)) {
> +		if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
> +			ret = true;
>  		goto out;
> +	}
> 
>  	/*
>  	 * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
> @@ -319,6 +322,8 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
>  	 */
>  	if (pte_none(*pte))
>  		ret = true;
> +	if (!pte_write(*pte) && (reason & VM_UFFD_WP))
> +		ret = true;
>  	pte_unmap(pte);
> 
>  out:
> @@ -1252,10 +1257,13 @@ static __always_inline int validate_range(struct mm_struct *mm,
>  	return 0;
>  }
> 
> -static inline bool vma_can_userfault(struct vm_area_struct *vma)
> +static inline bool vma_can_userfault(struct vm_area_struct *vma,
> +				     unsigned long vm_flags)
>  {
> -	return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
> -		vma_is_shmem(vma);
> +	/* FIXME: add WP support to hugetlbfs and shmem */
> +	return vma_is_anonymous(vma) ||
> +		((is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) &&
> +		 !(vm_flags & VM_UFFD_WP));
>  }
> 
>  static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> @@ -1287,15 +1295,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
>  	vm_flags = 0;
>  	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
>  		vm_flags |= VM_UFFD_MISSING;
> -	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
> +	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)
>  		vm_flags |= VM_UFFD_WP;
> -		/*
> -		 * FIXME: remove the below error constraint by
> -		 * implementing the wprotect tracking mode.
> -		 */
> -		ret = -EINVAL;
> -		goto out;
> -	}
> 
>  	ret = validate_range(mm, uffdio_register.range.start,
>  			     uffdio_register.range.len);
> @@ -1343,7 +1344,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> 
>  		/* check not compatible vmas */
>  		ret = -EINVAL;
> -		if (!vma_can_userfault(cur))
> +		if (!vma_can_userfault(cur, vm_flags))
>  			goto out_unlock;
> 
>  		/*
> @@ -1371,6 +1372,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
>  			if (end & (vma_hpagesize - 1))
>  				goto out_unlock;
>  		}
> +		if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_WRITE))
> +			goto out_unlock;

This is problematic for the non-cooperative use-case. Way may still want to
monitor a read-only area because it may eventually become writable, e.g. if
the monitored process runs mprotect().
Particularity, for using uffd-wp as a replacement for soft-dirty would
require it.

> 
>  		/*
>  		 * Check that this vma isn't already owned by a
> @@ -1400,7 +1403,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
>  	do {
>  		cond_resched();
> 
> -		BUG_ON(!vma_can_userfault(vma));
> +		BUG_ON(!vma_can_userfault(vma, vm_flags));
>  		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
>  		       vma->vm_userfaultfd_ctx.ctx != ctx);
>  		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
> @@ -1535,7 +1538,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
>  		 * provides for more strict behavior to notice
>  		 * unregistration errors.
>  		 */
> -		if (!vma_can_userfault(cur))
> +		if (!vma_can_userfault(cur, cur->vm_flags))
>  			goto out_unlock;
> 
>  		found = true;
> @@ -1549,7 +1552,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
>  	do {
>  		cond_resched();
> 
> -		BUG_ON(!vma_can_userfault(vma));
> +		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
>  		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
> 
>  		/*
> @@ -1760,6 +1763,46 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
>  	return ret;
>  }
> 
> +static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
> +				    unsigned long arg)
> +{
> +	int ret;
> +	struct uffdio_writeprotect uffdio_wp;
> +	struct uffdio_writeprotect __user *user_uffdio_wp;
> +	struct userfaultfd_wake_range range;
> +

In the non-cooperative mode the userfaultfd_writeprotect() may race with VM
layout changes, pretty much as uffdio_copy() [1]. My solution for uffdio_copy()
was to return -EAGAIN if such race is encountered. I think the same would
apply here.

> +	user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
> +
> +	if (copy_from_user(&uffdio_wp, user_uffdio_wp,
> +			   sizeof(struct uffdio_writeprotect)))
> +		return -EFAULT;
> +
> +	ret = validate_range(ctx->mm, uffdio_wp.range.start,
> +			     uffdio_wp.range.len);
> +	if (ret)
> +		return ret;
> +
> +	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
> +			       UFFDIO_WRITEPROTECT_MODE_WP))
> +		return -EINVAL;
> +	if ((uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP) &&
> +	     (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE))
> +		return -EINVAL;
> +
> +	ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
> +				  uffdio_wp.range.len, uffdio_wp.mode &
> +				  UFFDIO_WRITEPROTECT_MODE_WP);
> +	if (ret)
> +		return ret;
> +
> +	if (!(uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE)) {
> +		range.start = uffdio_wp.range.start;
> +		range.len = uffdio_wp.range.len;
> +		wake_userfault(ctx, &range);
> +	}
> +	return ret;
> +}
> +
>  static inline unsigned int uffd_ctx_features(__u64 user_features)
>  {
>  	/*
> @@ -1837,6 +1880,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
>  	case UFFDIO_ZEROPAGE:
>  		ret = userfaultfd_zeropage(ctx, arg);
>  		break;
> +	case UFFDIO_WRITEPROTECT:
> +		ret = userfaultfd_writeprotect(ctx, arg);
> +		break;
>  	}
>  	return ret;
>  }
> diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
> index 48f1a7c2f1f0..11517f796275 100644
> --- a/include/uapi/linux/userfaultfd.h
> +++ b/include/uapi/linux/userfaultfd.h
> @@ -52,6 +52,7 @@
>  #define _UFFDIO_WAKE			(0x02)
>  #define _UFFDIO_COPY			(0x03)
>  #define _UFFDIO_ZEROPAGE		(0x04)
> +#define _UFFDIO_WRITEPROTECT		(0x06)
>  #define _UFFDIO_API			(0x3F)
> 
>  /* userfaultfd ioctl ids */
> @@ -68,6 +69,8 @@
>  				      struct uffdio_copy)
>  #define UFFDIO_ZEROPAGE		_IOWR(UFFDIO, _UFFDIO_ZEROPAGE,	\
>  				      struct uffdio_zeropage)
> +#define UFFDIO_WRITEPROTECT	_IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \
> +				      struct uffdio_writeprotect)
> 
>  /* read() structure */
>  struct uffd_msg {
> @@ -231,4 +234,12 @@ struct uffdio_zeropage {
>  	__s64 zeropage;
>  };
> 
> +struct uffdio_writeprotect {
> +	struct uffdio_range range;
> +	/* !WP means undo writeprotect. DONTWAKE is valid only with !WP */
> +#define UFFDIO_WRITEPROTECT_MODE_WP		((__u64)1<<0)
> +#define UFFDIO_WRITEPROTECT_MODE_DONTWAKE	((__u64)1<<1)
> +	__u64 mode;
> +};
> +
>  #endif /* _LINUX_USERFAULTFD_H */
> -- 
> 2.17.1
 
[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=df2cc96e77011cf7989208b206da9817e0321028
Peter Xu Jan. 24, 2019, 4:56 a.m. UTC | #2
On Mon, Jan 21, 2019 at 12:42:33PM +0200, Mike Rapoport wrote:

[...]

> > @@ -1343,7 +1344,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > 
> >  		/* check not compatible vmas */
> >  		ret = -EINVAL;
> > -		if (!vma_can_userfault(cur))
> > +		if (!vma_can_userfault(cur, vm_flags))
> >  			goto out_unlock;
> > 
> >  		/*
> > @@ -1371,6 +1372,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> >  			if (end & (vma_hpagesize - 1))
> >  				goto out_unlock;
> >  		}
> > +		if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_WRITE))
> > +			goto out_unlock;
> 
> This is problematic for the non-cooperative use-case. Way may still want to
> monitor a read-only area because it may eventually become writable, e.g. if
> the monitored process runs mprotect().

Firstly I think I should be able to change it to VM_MAYWRITE which
seems to suite more.

Meanwhile, frankly speaking I didn't think a lot about how to nest the
usages of uffd-wp and mprotect(), so far I was only considering it as
a replacement of mprotect().  But indeed it can happen that the
monitored process calls mprotect().  Is there an existing scenario of
such usage?

The problem is I'm uncertain about whether this scenario can work
after all.  Say, the monitor process A write protected process B's
page P, so logically A will definitely receive a message before B
writes to page P.  However here if we allow process B to do
mprotect(PROT_WRITE) upon page P and grant write permission to it on
its own, then A will not be able to capture the write operation at
all?  Then I don't know how it can work here... or whether we should
fail the mprotect() at least upon uffd-wp ranges?

> Particularity, for using uffd-wp as a replacement for soft-dirty would
> require it.
> 
> > 
> >  		/*
> >  		 * Check that this vma isn't already owned by a
> > @@ -1400,7 +1403,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> >  	do {
> >  		cond_resched();
> > 
> > -		BUG_ON(!vma_can_userfault(vma));
> > +		BUG_ON(!vma_can_userfault(vma, vm_flags));
> >  		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
> >  		       vma->vm_userfaultfd_ctx.ctx != ctx);
> >  		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
> > @@ -1535,7 +1538,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> >  		 * provides for more strict behavior to notice
> >  		 * unregistration errors.
> >  		 */
> > -		if (!vma_can_userfault(cur))
> > +		if (!vma_can_userfault(cur, cur->vm_flags))
> >  			goto out_unlock;
> > 
> >  		found = true;
> > @@ -1549,7 +1552,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> >  	do {
> >  		cond_resched();
> > 
> > -		BUG_ON(!vma_can_userfault(vma));
> > +		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
> >  		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
> > 
> >  		/*
> > @@ -1760,6 +1763,46 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
> >  	return ret;
> >  }
> > 
> > +static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
> > +				    unsigned long arg)
> > +{
> > +	int ret;
> > +	struct uffdio_writeprotect uffdio_wp;
> > +	struct uffdio_writeprotect __user *user_uffdio_wp;
> > +	struct userfaultfd_wake_range range;
> > +
> 
> In the non-cooperative mode the userfaultfd_writeprotect() may race with VM
> layout changes, pretty much as uffdio_copy() [1]. My solution for uffdio_copy()
> was to return -EAGAIN if such race is encountered. I think the same would
> apply here.

I tried to understand the problem at [1] but failed... could you help
to clarify it a bit more?

I'm quoting some of the discussions from [1] here directly between you
and Pavel:

  > Since the monitor cannot assume that the process will access all its memory
  > it has to copy some pages "in the background". A simple monitor may look
  > like:
  > 
  > 	for (;;) {
  > 		wait_for_uffd_events(timeout);
  > 		handle_uffd_events();
  > 		uffd_copy(some not faulted pages);
  > 	}
  > 
  > Then, if the "background" uffd_copy() races with fork, the pages we've
  > copied may be already present in parent's mappings before the call to
  > copy_page_range() and may be not.
  > 
  > If the pages were not present, uffd_copy'ing them again to the child's
  > memory would be ok.
  >
  > But if uffd_copy() was first to catch mmap_sem, and we would uffd_copy them
  > again, child process will get memory corruption.

Here I don't understand why the child process will get memory
corruption if uffd_copy() caught the mmap_sem first.

If it did it, then IMHO when uffd_copy() copies the page again it'll
simply get a -EEXIST showing that the page has already been copied.
Could you explain on why there will be a data corruption?

Thanks in advance,

>  
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=df2cc96e77011cf7989208b206da9817e0321028
>
Mike Rapoport Jan. 24, 2019, 7:27 a.m. UTC | #3
On Thu, Jan 24, 2019 at 12:56:15PM +0800, Peter Xu wrote:
> On Mon, Jan 21, 2019 at 12:42:33PM +0200, Mike Rapoport wrote:
> 
> [...]
> 
> > > @@ -1343,7 +1344,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > > 
> > >  		/* check not compatible vmas */
> > >  		ret = -EINVAL;
> > > -		if (!vma_can_userfault(cur))
> > > +		if (!vma_can_userfault(cur, vm_flags))
> > >  			goto out_unlock;
> > > 
> > >  		/*
> > > @@ -1371,6 +1372,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > >  			if (end & (vma_hpagesize - 1))
> > >  				goto out_unlock;
> > >  		}
> > > +		if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_WRITE))
> > > +			goto out_unlock;
> > 
> > This is problematic for the non-cooperative use-case. Way may still want to
> > monitor a read-only area because it may eventually become writable, e.g. if
> > the monitored process runs mprotect().
> 
> Firstly I think I should be able to change it to VM_MAYWRITE which
> seems to suite more.
> 
> Meanwhile, frankly speaking I didn't think a lot about how to nest the
> usages of uffd-wp and mprotect(), so far I was only considering it as
> a replacement of mprotect().  But indeed it can happen that the
> monitored process calls mprotect().  Is there an existing scenario of
> such usage?
> 
> The problem is I'm uncertain about whether this scenario can work
> after all.  Say, the monitor process A write protected process B's
> page P, so logically A will definitely receive a message before B
> writes to page P.  However here if we allow process B to do
> mprotect(PROT_WRITE) upon page P and grant write permission to it on
> its own, then A will not be able to capture the write operation at
> all?  Then I don't know how it can work here... or whether we should
> fail the mprotect() at least upon uffd-wp ranges?

The use-case we've discussed a while ago was to use uffd-wp instead of
soft-dirty for tracking memory changes in CRIU for pre-copy migration.
Currently, we enable soft-dirty for the migrated process and monitor
/proc/pid/pagemap between memory dump iterations to see what memory pages
have been changed.
With uffd-wp we thought to register all the process memory with uffd-wp and
then track changes with uffd-wp notifications. Back then it was considered
only at the very general level without paying much attention to details.

So my initial thought was that we do register the entire memory with
uffd-wp. If an area changes from RO to RW at some point, uffd-wp will
generate notifications to the monitor, it would be able to notice the
change and the write will continue normally.

If we are to limit uffd-wp register only to VMAs with VM_WRITE and even
VM_MAYWRITE, we'd need a way to handle the possible changes of VMA
protection and an ability to add monitoring for areas that changed from RO
to RW.

Can't say I have a clear picture in mind at the moment, will continue to
think about it.

> > Particularity, for using uffd-wp as a replacement for soft-dirty would
> > require it.
> > 
> > > 
> > >  		/*
> > >  		 * Check that this vma isn't already owned by a
> > > @@ -1400,7 +1403,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > >  	do {
> > >  		cond_resched();
> > > 
> > > -		BUG_ON(!vma_can_userfault(vma));
> > > +		BUG_ON(!vma_can_userfault(vma, vm_flags));
> > >  		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
> > >  		       vma->vm_userfaultfd_ctx.ctx != ctx);
> > >  		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
> > > @@ -1535,7 +1538,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> > >  		 * provides for more strict behavior to notice
> > >  		 * unregistration errors.
> > >  		 */
> > > -		if (!vma_can_userfault(cur))
> > > +		if (!vma_can_userfault(cur, cur->vm_flags))
> > >  			goto out_unlock;
> > > 
> > >  		found = true;
> > > @@ -1549,7 +1552,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> > >  	do {
> > >  		cond_resched();
> > > 
> > > -		BUG_ON(!vma_can_userfault(vma));
> > > +		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
> > >  		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
> > > 
> > >  		/*
> > > @@ -1760,6 +1763,46 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
> > >  	return ret;
> > >  }
> > > 
> > > +static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
> > > +				    unsigned long arg)
> > > +{
> > > +	int ret;
> > > +	struct uffdio_writeprotect uffdio_wp;
> > > +	struct uffdio_writeprotect __user *user_uffdio_wp;
> > > +	struct userfaultfd_wake_range range;
> > > +
> > 
> > In the non-cooperative mode the userfaultfd_writeprotect() may race with VM
> > layout changes, pretty much as uffdio_copy() [1]. My solution for uffdio_copy()
> > was to return -EAGAIN if such race is encountered. I think the same would
> > apply here.
> 
> I tried to understand the problem at [1] but failed... could you help
> to clarify it a bit more?
> 
> I'm quoting some of the discussions from [1] here directly between you
> and Pavel:
> 
>   > Since the monitor cannot assume that the process will access all its memory
>   > it has to copy some pages "in the background". A simple monitor may look
>   > like:
>   > 
>   > 	for (;;) {
>   > 		wait_for_uffd_events(timeout);
>   > 		handle_uffd_events();
>   > 		uffd_copy(some not faulted pages);
>   > 	}
>   > 
>   > Then, if the "background" uffd_copy() races with fork, the pages we've
>   > copied may be already present in parent's mappings before the call to
>   > copy_page_range() and may be not.
>   > 
>   > If the pages were not present, uffd_copy'ing them again to the child's
>   > memory would be ok.
>   >
>   > But if uffd_copy() was first to catch mmap_sem, and we would uffd_copy them
>   > again, child process will get memory corruption.
> 
> Here I don't understand why the child process will get memory
> corruption if uffd_copy() caught the mmap_sem first.
> 
> If it did it, then IMHO when uffd_copy() copies the page again it'll
> simply get a -EEXIST showing that the page has already been copied.
> Could you explain on why there will be a data corruption?

Let's say we do post-copy migration of a process A with CRIU and its page at
address 0x1000 is already copied. Now it modifies the contents of this
page. At this point the contents of the page at 0x1000 is different on the
source and the destination.
Next, process A forks process B. The CRIU's uffd monitor gets
UFFD_EVENT_FORK, and starts filling process B memory with UFFDIO_COPY.
It may happen, that UFFDIO_COPY to 0x1000 of the process B will occur
*before* fork() completes and it may race with copy_page_range().
If UFFDIO_COPY wins the race, it will fill the page with the contents from
the source, although the correct data is what process A set in that page.

Hope it helps.

> Thanks in advance,
> 
> >  
> > [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=df2cc96e77011cf7989208b206da9817e0321028
> >
> 
> -- 
> Peter Xu
>
Peter Xu Jan. 24, 2019, 9:28 a.m. UTC | #4
On Thu, Jan 24, 2019 at 09:27:07AM +0200, Mike Rapoport wrote:
> On Thu, Jan 24, 2019 at 12:56:15PM +0800, Peter Xu wrote:
> > On Mon, Jan 21, 2019 at 12:42:33PM +0200, Mike Rapoport wrote:
> > 
> > [...]
> > 
> > > > @@ -1343,7 +1344,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > > > 
> > > >  		/* check not compatible vmas */
> > > >  		ret = -EINVAL;
> > > > -		if (!vma_can_userfault(cur))
> > > > +		if (!vma_can_userfault(cur, vm_flags))
> > > >  			goto out_unlock;
> > > > 
> > > >  		/*
> > > > @@ -1371,6 +1372,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > > >  			if (end & (vma_hpagesize - 1))
> > > >  				goto out_unlock;
> > > >  		}
> > > > +		if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_WRITE))
> > > > +			goto out_unlock;
> > > 
> > > This is problematic for the non-cooperative use-case. Way may still want to
> > > monitor a read-only area because it may eventually become writable, e.g. if
> > > the monitored process runs mprotect().
> > 
> > Firstly I think I should be able to change it to VM_MAYWRITE which
> > seems to suite more.
> > 
> > Meanwhile, frankly speaking I didn't think a lot about how to nest the
> > usages of uffd-wp and mprotect(), so far I was only considering it as
> > a replacement of mprotect().  But indeed it can happen that the
> > monitored process calls mprotect().  Is there an existing scenario of
> > such usage?
> > 
> > The problem is I'm uncertain about whether this scenario can work
> > after all.  Say, the monitor process A write protected process B's
> > page P, so logically A will definitely receive a message before B
> > writes to page P.  However here if we allow process B to do
> > mprotect(PROT_WRITE) upon page P and grant write permission to it on
> > its own, then A will not be able to capture the write operation at
> > all?  Then I don't know how it can work here... or whether we should
> > fail the mprotect() at least upon uffd-wp ranges?
> 
> The use-case we've discussed a while ago was to use uffd-wp instead of
> soft-dirty for tracking memory changes in CRIU for pre-copy migration.
> Currently, we enable soft-dirty for the migrated process and monitor
> /proc/pid/pagemap between memory dump iterations to see what memory pages
> have been changed.
> With uffd-wp we thought to register all the process memory with uffd-wp and
> then track changes with uffd-wp notifications. Back then it was considered
> only at the very general level without paying much attention to details.
> 
> So my initial thought was that we do register the entire memory with
> uffd-wp. If an area changes from RO to RW at some point, uffd-wp will
> generate notifications to the monitor, it would be able to notice the
> change and the write will continue normally.
> 
> If we are to limit uffd-wp register only to VMAs with VM_WRITE and even
> VM_MAYWRITE, we'd need a way to handle the possible changes of VMA
> protection and an ability to add monitoring for areas that changed from RO
> to RW.
> 
> Can't say I have a clear picture in mind at the moment, will continue to
> think about it.

Thanks for these details.  Though I have a question about how it's
used.

Since we're talking about replacing soft dirty with uffd-wp here, I
noticed that there's a major interface difference between soft-dirty
and uffd-wp: the soft-dirty was all about /proc operations so a
monitor process can easily monitor mostly any process on the system as
long as knowing its PID.  However I'm unsure about uffd-wp since
userfaultfd was always bound to a mm_struct.  For example, the syscall
userfaultfd() will always attach the current process mm_struct to the
newly created userfaultfd but it cannot be attached to another random
mm_struct of other processes.  Or is there any way that the CRIU
monitor process can gain an userfaultfd of any process of the system
somehow?

> 
> > > Particularity, for using uffd-wp as a replacement for soft-dirty would
> > > require it.
> > > 
> > > > 
> > > >  		/*
> > > >  		 * Check that this vma isn't already owned by a
> > > > @@ -1400,7 +1403,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > > >  	do {
> > > >  		cond_resched();
> > > > 
> > > > -		BUG_ON(!vma_can_userfault(vma));
> > > > +		BUG_ON(!vma_can_userfault(vma, vm_flags));
> > > >  		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
> > > >  		       vma->vm_userfaultfd_ctx.ctx != ctx);
> > > >  		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
> > > > @@ -1535,7 +1538,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> > > >  		 * provides for more strict behavior to notice
> > > >  		 * unregistration errors.
> > > >  		 */
> > > > -		if (!vma_can_userfault(cur))
> > > > +		if (!vma_can_userfault(cur, cur->vm_flags))
> > > >  			goto out_unlock;
> > > > 
> > > >  		found = true;
> > > > @@ -1549,7 +1552,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> > > >  	do {
> > > >  		cond_resched();
> > > > 
> > > > -		BUG_ON(!vma_can_userfault(vma));
> > > > +		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
> > > >  		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
> > > > 
> > > >  		/*
> > > > @@ -1760,6 +1763,46 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
> > > >  	return ret;
> > > >  }
> > > > 
> > > > +static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
> > > > +				    unsigned long arg)
> > > > +{
> > > > +	int ret;
> > > > +	struct uffdio_writeprotect uffdio_wp;
> > > > +	struct uffdio_writeprotect __user *user_uffdio_wp;
> > > > +	struct userfaultfd_wake_range range;
> > > > +
> > > 
> > > In the non-cooperative mode the userfaultfd_writeprotect() may race with VM
> > > layout changes, pretty much as uffdio_copy() [1]. My solution for uffdio_copy()
> > > was to return -EAGAIN if such race is encountered. I think the same would
> > > apply here.
> > 
> > I tried to understand the problem at [1] but failed... could you help
> > to clarify it a bit more?
> > 
> > I'm quoting some of the discussions from [1] here directly between you
> > and Pavel:
> > 
> >   > Since the monitor cannot assume that the process will access all its memory
> >   > it has to copy some pages "in the background". A simple monitor may look
> >   > like:
> >   > 
> >   > 	for (;;) {
> >   > 		wait_for_uffd_events(timeout);
> >   > 		handle_uffd_events();
> >   > 		uffd_copy(some not faulted pages);
> >   > 	}
> >   > 
> >   > Then, if the "background" uffd_copy() races with fork, the pages we've
> >   > copied may be already present in parent's mappings before the call to
> >   > copy_page_range() and may be not.
> >   > 
> >   > If the pages were not present, uffd_copy'ing them again to the child's
> >   > memory would be ok.
> >   >
> >   > But if uffd_copy() was first to catch mmap_sem, and we would uffd_copy them
> >   > again, child process will get memory corruption.
> > 
> > Here I don't understand why the child process will get memory
> > corruption if uffd_copy() caught the mmap_sem first.
> > 
> > If it did it, then IMHO when uffd_copy() copies the page again it'll
> > simply get a -EEXIST showing that the page has already been copied.
> > Could you explain on why there will be a data corruption?
> 
> Let's say we do post-copy migration of a process A with CRIU and its page at
> address 0x1000 is already copied. Now it modifies the contents of this
> page. At this point the contents of the page at 0x1000 is different on the
> source and the destination.
> Next, process A forks process B. The CRIU's uffd monitor gets
> UFFD_EVENT_FORK, and starts filling process B memory with UFFDIO_COPY.
> It may happen, that UFFDIO_COPY to 0x1000 of the process B will occur

I think this is the place I started to get confused...

The mmap copy phase and the FORK event path is in dup_mmap() as
mentioned in the patch too:

     dup_mmap()
        down_write(old_mm)
        down_write(new_mm)
        foreach(vma)
            copy_page_range()            (a)
        up_write(new_mm)
        up_write(old_mm)
        dup_userfaultfd_complete()       (b)

Here if we already received UFFD_EVENT_FORK and started to copy pages
to process B in the background, then we should have at least passed
(b) above since otherwise we won't even know the existance of process
B.  However if so, we should have already passed the point to copy
data at (a) too, then how could copy_page_range() race?  It seems that
I might have missed something important out there but it's not easy
for me to figure out myself...

Thanks,

> *before* fork() completes and it may race with copy_page_range().
> If UFFDIO_COPY wins the race, it will fill the page with the contents from
> the source, although the correct data is what process A set in that page.
> 
> Hope it helps.

> > >  
> > > [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=df2cc96e77011cf7989208b206da9817e0321028
> > >
Mike Rapoport Jan. 25, 2019, 7:54 a.m. UTC | #5
On Thu, Jan 24, 2019 at 05:28:48PM +0800, Peter Xu wrote:
> On Thu, Jan 24, 2019 at 09:27:07AM +0200, Mike Rapoport wrote:
> > On Thu, Jan 24, 2019 at 12:56:15PM +0800, Peter Xu wrote:
> > > On Mon, Jan 21, 2019 at 12:42:33PM +0200, Mike Rapoport wrote:
> > > 
> > > [...]
> > > 
> > > > > @@ -1343,7 +1344,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > > > > 
> > > > >  		/* check not compatible vmas */
> > > > >  		ret = -EINVAL;
> > > > > -		if (!vma_can_userfault(cur))
> > > > > +		if (!vma_can_userfault(cur, vm_flags))
> > > > >  			goto out_unlock;
> > > > > 
> > > > >  		/*
> > > > > @@ -1371,6 +1372,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > > > >  			if (end & (vma_hpagesize - 1))
> > > > >  				goto out_unlock;
> > > > >  		}
> > > > > +		if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_WRITE))
> > > > > +			goto out_unlock;
> > > > 
> > > > This is problematic for the non-cooperative use-case. Way may still want to
> > > > monitor a read-only area because it may eventually become writable, e.g. if
> > > > the monitored process runs mprotect().
> > > 
> > > Firstly I think I should be able to change it to VM_MAYWRITE which
> > > seems to suite more.
> > > 
> > > Meanwhile, frankly speaking I didn't think a lot about how to nest the
> > > usages of uffd-wp and mprotect(), so far I was only considering it as
> > > a replacement of mprotect().  But indeed it can happen that the
> > > monitored process calls mprotect().  Is there an existing scenario of
> > > such usage?
> > > 
> > > The problem is I'm uncertain about whether this scenario can work
> > > after all.  Say, the monitor process A write protected process B's
> > > page P, so logically A will definitely receive a message before B
> > > writes to page P.  However here if we allow process B to do
> > > mprotect(PROT_WRITE) upon page P and grant write permission to it on
> > > its own, then A will not be able to capture the write operation at
> > > all?  Then I don't know how it can work here... or whether we should
> > > fail the mprotect() at least upon uffd-wp ranges?
> > 
> > The use-case we've discussed a while ago was to use uffd-wp instead of
> > soft-dirty for tracking memory changes in CRIU for pre-copy migration.
> > Currently, we enable soft-dirty for the migrated process and monitor
> > /proc/pid/pagemap between memory dump iterations to see what memory pages
> > have been changed.
> > With uffd-wp we thought to register all the process memory with uffd-wp and
> > then track changes with uffd-wp notifications. Back then it was considered
> > only at the very general level without paying much attention to details.
> > 
> > So my initial thought was that we do register the entire memory with
> > uffd-wp. If an area changes from RO to RW at some point, uffd-wp will
> > generate notifications to the monitor, it would be able to notice the
> > change and the write will continue normally.
> > 
> > If we are to limit uffd-wp register only to VMAs with VM_WRITE and even
> > VM_MAYWRITE, we'd need a way to handle the possible changes of VMA
> > protection and an ability to add monitoring for areas that changed from RO
> > to RW.
> > 
> > Can't say I have a clear picture in mind at the moment, will continue to
> > think about it.
> 
> Thanks for these details.  Though I have a question about how it's
> used.
> 
> Since we're talking about replacing soft dirty with uffd-wp here, I
> noticed that there's a major interface difference between soft-dirty
> and uffd-wp: the soft-dirty was all about /proc operations so a
> monitor process can easily monitor mostly any process on the system as
> long as knowing its PID.  However I'm unsure about uffd-wp since
> userfaultfd was always bound to a mm_struct.  For example, the syscall
> userfaultfd() will always attach the current process mm_struct to the
> newly created userfaultfd but it cannot be attached to another random
> mm_struct of other processes.  Or is there any way that the CRIU
> monitor process can gain an userfaultfd of any process of the system
> somehow?
 
Yes, there is. For CRIU to read the process state during snapshot (or one
the source in case of the migration) we inject a parasite code into the
victim process. The parasite code communicates with the "main" CRIU monitor
via UNIX socket to pass information that cannot be obtained from outside.
For uffd-wp usage we thought about creating the uffd context in the
parasite code, registering the memory and passing the userfault file
descriptor to the CRIU core via that UNIX socket.

> > 
> > > > Particularity, for using uffd-wp as a replacement for soft-dirty would
> > > > require it.
> > > > 
> > > > > 
> > > > >  		/*
> > > > >  		 * Check that this vma isn't already owned by a
> > > > > @@ -1400,7 +1403,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > > > >  	do {
> > > > >  		cond_resched();
> > > > > 
> > > > > -		BUG_ON(!vma_can_userfault(vma));
> > > > > +		BUG_ON(!vma_can_userfault(vma, vm_flags));
> > > > >  		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
> > > > >  		       vma->vm_userfaultfd_ctx.ctx != ctx);
> > > > >  		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
> > > > > @@ -1760,6 +1763,46 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
> > > > >  	return ret;
> > > > >  }
> > > > > 
> > > > > +static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
> > > > > +				    unsigned long arg)
> > > > > +{
> > > > > +	int ret;
> > > > > +	struct uffdio_writeprotect uffdio_wp;
> > > > > +	struct uffdio_writeprotect __user *user_uffdio_wp;
> > > > > +	struct userfaultfd_wake_range range;
> > > > > +
> > > > 
> > > > In the non-cooperative mode the userfaultfd_writeprotect() may race with VM
> > > > layout changes, pretty much as uffdio_copy() [1]. My solution for uffdio_copy()
> > > > was to return -EAGAIN if such race is encountered. I think the same would
> > > > apply here.
> > > 
> > > I tried to understand the problem at [1] but failed... could you help
> > > to clarify it a bit more?
> > > 
> > > I'm quoting some of the discussions from [1] here directly between you
> > > and Pavel:
> > > 
> > >   > Since the monitor cannot assume that the process will access all its memory
> > >   > it has to copy some pages "in the background". A simple monitor may look
> > >   > like:
> > >   > 
> > >   > 	for (;;) {
> > >   > 		wait_for_uffd_events(timeout);
> > >   > 		handle_uffd_events();
> > >   > 		uffd_copy(some not faulted pages);
> > >   > 	}
> > >   > 
> > >   > Then, if the "background" uffd_copy() races with fork, the pages we've
> > >   > copied may be already present in parent's mappings before the call to
> > >   > copy_page_range() and may be not.
> > >   > 
> > >   > If the pages were not present, uffd_copy'ing them again to the child's
> > >   > memory would be ok.
> > >   >
> > >   > But if uffd_copy() was first to catch mmap_sem, and we would uffd_copy them
> > >   > again, child process will get memory corruption.
> > > 
> > > Here I don't understand why the child process will get memory
> > > corruption if uffd_copy() caught the mmap_sem first.
> > > 
> > > If it did it, then IMHO when uffd_copy() copies the page again it'll
> > > simply get a -EEXIST showing that the page has already been copied.
> > > Could you explain on why there will be a data corruption?
> > 
> > Let's say we do post-copy migration of a process A with CRIU and its page at
> > address 0x1000 is already copied. Now it modifies the contents of this
> > page. At this point the contents of the page at 0x1000 is different on the
> > source and the destination.
> > Next, process A forks process B. The CRIU's uffd monitor gets
> > UFFD_EVENT_FORK, and starts filling process B memory with UFFDIO_COPY.
> > It may happen, that UFFDIO_COPY to 0x1000 of the process B will occur
> 
> I think this is the place I started to get confused...
> 
> The mmap copy phase and the FORK event path is in dup_mmap() as
> mentioned in the patch too:
> 
>      dup_mmap()
>         down_write(old_mm)
>         down_write(new_mm)
>         foreach(vma)
>             copy_page_range()            (a)
>         up_write(new_mm)
>         up_write(old_mm)
>         dup_userfaultfd_complete()       (b)
> 
> Here if we already received UFFD_EVENT_FORK and started to copy pages
> to process B in the background, then we should have at least passed
> (b) above since otherwise we won't even know the existance of process
> B.  However if so, we should have already passed the point to copy
> data at (a) too, then how could copy_page_range() race?  It seems that
> I might have missed something important out there but it's not easy
> for me to figure out myself...

Apparently, I confused myself as well...
I clearly remember that there was a problem with fork() but the sequence
the causes it keeps evading me :(

Anyway, some mean of synchronization between uffd_copy and the
non-cooperative events is required. Take, for example, MADV_DONTNEED. When
it races with uffdio_copy() a process may end reading non zero values right
after MADV_DONTNEED call.

uffd monitor           | process
-----------------------+-------------------------------------------
uffdio_copy(0x1000)    | madvise(MADV_DONTNEED, 0x1000)
                       |    down_read(mmap_sem)
                       |    zap_pte_range(0x1000)
                       |    up_read(mmap_sem)
   down_read(mmap_sem) |
   copy()              |
   up_read(mmap_sem)   |
                       |  read(0x1000) != 0

Similar issues happen with mpremap() and munmap().

> Thanks,
> 
> > *before* fork() completes and it may race with copy_page_range().
> > If UFFDIO_COPY wins the race, it will fill the page with the contents from
> > the source, although the correct data is what process A set in that page.
> > 
> > Hope it helps.
> 
> > > >  
> > > > [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=df2cc96e77011cf7989208b206da9817e0321028
> > > >
> 
> -- 
> Peter Xu
>
Peter Xu Jan. 25, 2019, 10:12 a.m. UTC | #6
On Fri, Jan 25, 2019 at 09:54:53AM +0200, Mike Rapoport wrote:
> On Thu, Jan 24, 2019 at 05:28:48PM +0800, Peter Xu wrote:
> > On Thu, Jan 24, 2019 at 09:27:07AM +0200, Mike Rapoport wrote:
> > > On Thu, Jan 24, 2019 at 12:56:15PM +0800, Peter Xu wrote:
> > > > On Mon, Jan 21, 2019 at 12:42:33PM +0200, Mike Rapoport wrote:
> > > > 
> > > > [...]
> > > > 
> > > > > > @@ -1343,7 +1344,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > > > > > 
> > > > > >  		/* check not compatible vmas */
> > > > > >  		ret = -EINVAL;
> > > > > > -		if (!vma_can_userfault(cur))
> > > > > > +		if (!vma_can_userfault(cur, vm_flags))
> > > > > >  			goto out_unlock;
> > > > > > 
> > > > > >  		/*
> > > > > > @@ -1371,6 +1372,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > > > > >  			if (end & (vma_hpagesize - 1))
> > > > > >  				goto out_unlock;
> > > > > >  		}
> > > > > > +		if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_WRITE))
> > > > > > +			goto out_unlock;
> > > > > 
> > > > > This is problematic for the non-cooperative use-case. Way may still want to
> > > > > monitor a read-only area because it may eventually become writable, e.g. if
> > > > > the monitored process runs mprotect().
> > > > 
> > > > Firstly I think I should be able to change it to VM_MAYWRITE which
> > > > seems to suite more.
> > > > 
> > > > Meanwhile, frankly speaking I didn't think a lot about how to nest the
> > > > usages of uffd-wp and mprotect(), so far I was only considering it as
> > > > a replacement of mprotect().  But indeed it can happen that the
> > > > monitored process calls mprotect().  Is there an existing scenario of
> > > > such usage?
> > > > 
> > > > The problem is I'm uncertain about whether this scenario can work
> > > > after all.  Say, the monitor process A write protected process B's
> > > > page P, so logically A will definitely receive a message before B
> > > > writes to page P.  However here if we allow process B to do
> > > > mprotect(PROT_WRITE) upon page P and grant write permission to it on
> > > > its own, then A will not be able to capture the write operation at
> > > > all?  Then I don't know how it can work here... or whether we should
> > > > fail the mprotect() at least upon uffd-wp ranges?
> > > 
> > > The use-case we've discussed a while ago was to use uffd-wp instead of
> > > soft-dirty for tracking memory changes in CRIU for pre-copy migration.
> > > Currently, we enable soft-dirty for the migrated process and monitor
> > > /proc/pid/pagemap between memory dump iterations to see what memory pages
> > > have been changed.
> > > With uffd-wp we thought to register all the process memory with uffd-wp and
> > > then track changes with uffd-wp notifications. Back then it was considered
> > > only at the very general level without paying much attention to details.
> > > 
> > > So my initial thought was that we do register the entire memory with
> > > uffd-wp. If an area changes from RO to RW at some point, uffd-wp will
> > > generate notifications to the monitor, it would be able to notice the
> > > change and the write will continue normally.
> > > 
> > > If we are to limit uffd-wp register only to VMAs with VM_WRITE and even
> > > VM_MAYWRITE, we'd need a way to handle the possible changes of VMA
> > > protection and an ability to add monitoring for areas that changed from RO
> > > to RW.
> > > 
> > > Can't say I have a clear picture in mind at the moment, will continue to
> > > think about it.
> > 
> > Thanks for these details.  Though I have a question about how it's
> > used.
> > 
> > Since we're talking about replacing soft dirty with uffd-wp here, I
> > noticed that there's a major interface difference between soft-dirty
> > and uffd-wp: the soft-dirty was all about /proc operations so a
> > monitor process can easily monitor mostly any process on the system as
> > long as knowing its PID.  However I'm unsure about uffd-wp since
> > userfaultfd was always bound to a mm_struct.  For example, the syscall
> > userfaultfd() will always attach the current process mm_struct to the
> > newly created userfaultfd but it cannot be attached to another random
> > mm_struct of other processes.  Or is there any way that the CRIU
> > monitor process can gain an userfaultfd of any process of the system
> > somehow?
>  
> Yes, there is. For CRIU to read the process state during snapshot (or one
> the source in case of the migration) we inject a parasite code into the
> victim process. The parasite code communicates with the "main" CRIU monitor
> via UNIX socket to pass information that cannot be obtained from outside.
> For uffd-wp usage we thought about creating the uffd context in the
> parasite code, registering the memory and passing the userfault file
> descriptor to the CRIU core via that UNIX socket.

Glad to know the black magic behind it...

> 
> > > 
> > > > > Particularity, for using uffd-wp as a replacement for soft-dirty would
> > > > > require it.
> > > > > 
> > > > > > 
> > > > > >  		/*
> > > > > >  		 * Check that this vma isn't already owned by a
> > > > > > @@ -1400,7 +1403,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > > > > >  	do {
> > > > > >  		cond_resched();
> > > > > > 
> > > > > > -		BUG_ON(!vma_can_userfault(vma));
> > > > > > +		BUG_ON(!vma_can_userfault(vma, vm_flags));
> > > > > >  		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
> > > > > >  		       vma->vm_userfaultfd_ctx.ctx != ctx);
> > > > > >  		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
> > > > > > @@ -1760,6 +1763,46 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
> > > > > >  	return ret;
> > > > > >  }
> > > > > > 
> > > > > > +static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
> > > > > > +				    unsigned long arg)
> > > > > > +{
> > > > > > +	int ret;
> > > > > > +	struct uffdio_writeprotect uffdio_wp;
> > > > > > +	struct uffdio_writeprotect __user *user_uffdio_wp;
> > > > > > +	struct userfaultfd_wake_range range;
> > > > > > +
> > > > > 
> > > > > In the non-cooperative mode the userfaultfd_writeprotect() may race with VM
> > > > > layout changes, pretty much as uffdio_copy() [1]. My solution for uffdio_copy()
> > > > > was to return -EAGAIN if such race is encountered. I think the same would
> > > > > apply here.
> > > > 
> > > > I tried to understand the problem at [1] but failed... could you help
> > > > to clarify it a bit more?
> > > > 
> > > > I'm quoting some of the discussions from [1] here directly between you
> > > > and Pavel:
> > > > 
> > > >   > Since the monitor cannot assume that the process will access all its memory
> > > >   > it has to copy some pages "in the background". A simple monitor may look
> > > >   > like:
> > > >   > 
> > > >   > 	for (;;) {
> > > >   > 		wait_for_uffd_events(timeout);
> > > >   > 		handle_uffd_events();
> > > >   > 		uffd_copy(some not faulted pages);
> > > >   > 	}
> > > >   > 
> > > >   > Then, if the "background" uffd_copy() races with fork, the pages we've
> > > >   > copied may be already present in parent's mappings before the call to
> > > >   > copy_page_range() and may be not.
> > > >   > 
> > > >   > If the pages were not present, uffd_copy'ing them again to the child's
> > > >   > memory would be ok.
> > > >   >
> > > >   > But if uffd_copy() was first to catch mmap_sem, and we would uffd_copy them
> > > >   > again, child process will get memory corruption.
> > > > 
> > > > Here I don't understand why the child process will get memory
> > > > corruption if uffd_copy() caught the mmap_sem first.
> > > > 
> > > > If it did it, then IMHO when uffd_copy() copies the page again it'll
> > > > simply get a -EEXIST showing that the page has already been copied.
> > > > Could you explain on why there will be a data corruption?
> > > 
> > > Let's say we do post-copy migration of a process A with CRIU and its page at
> > > address 0x1000 is already copied. Now it modifies the contents of this
> > > page. At this point the contents of the page at 0x1000 is different on the
> > > source and the destination.
> > > Next, process A forks process B. The CRIU's uffd monitor gets
> > > UFFD_EVENT_FORK, and starts filling process B memory with UFFDIO_COPY.
> > > It may happen, that UFFDIO_COPY to 0x1000 of the process B will occur
> > 
> > I think this is the place I started to get confused...
> > 
> > The mmap copy phase and the FORK event path is in dup_mmap() as
> > mentioned in the patch too:
> > 
> >      dup_mmap()
> >         down_write(old_mm)
> >         down_write(new_mm)
> >         foreach(vma)
> >             copy_page_range()            (a)
> >         up_write(new_mm)
> >         up_write(old_mm)
> >         dup_userfaultfd_complete()       (b)
> > 
> > Here if we already received UFFD_EVENT_FORK and started to copy pages
> > to process B in the background, then we should have at least passed
> > (b) above since otherwise we won't even know the existance of process
> > B.  However if so, we should have already passed the point to copy
> > data at (a) too, then how could copy_page_range() race?  It seems that
> > I might have missed something important out there but it's not easy
> > for me to figure out myself...
> 
> Apparently, I confused myself as well...
> I clearly remember that there was a problem with fork() but the sequence
> the causes it keeps evading me :(
> 
> Anyway, some mean of synchronization between uffd_copy and the
> non-cooperative events is required. Take, for example, MADV_DONTNEED. When
> it races with uffdio_copy() a process may end reading non zero values right
> after MADV_DONTNEED call.
> 
> uffd monitor           | process
> -----------------------+-------------------------------------------
> uffdio_copy(0x1000)    | madvise(MADV_DONTNEED, 0x1000)
>                        |    down_read(mmap_sem)
>                        |    zap_pte_range(0x1000)
>                        |    up_read(mmap_sem)
>    down_read(mmap_sem) |
>    copy()              |
>    up_read(mmap_sem)   |
>                        |  read(0x1000) != 0
> 
> Similar issues happen with mpremap() and munmap().

I think I get the point this time especially with the context of CRIU
process postcopy migration in mind.

If my understanding is correct, here if UFFDIO_COPY returned -EAGAIN
due to this, continuous UFFDIO_COPY upon the same page will fail too
(which could be slightly confusing at the first glance since normally
that's what -EAGAIN means: it should let the caller to simply
retry...), and IMHO the only correct way to solve this to break out of
the copy_page_background() loop and receive uffd messages instead.
And then we'll notice that there's a UNMAP event of the page, then we
know that we don't need to UFFDIO_COPY this page, so instead of a real
"retry" we just don't do it at all.

Tricky... but it does make sense if considering this under the
postcopy scenario for either CRIU or even QEMU when migrating VMs.

Then I'll just simply follow df2cc96e77011cf79892 in
UFFDIO_WRITEPROTECT() too then.  Thanks for all these explanations!
diff mbox series

Patch

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index bc9f6230a3f0..6ff8773d6797 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -305,8 +305,11 @@  static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
 	if (!pmd_present(_pmd))
 		goto out;
 
-	if (pmd_trans_huge(_pmd))
+	if (pmd_trans_huge(_pmd)) {
+		if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
+			ret = true;
 		goto out;
+	}
 
 	/*
 	 * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
@@ -319,6 +322,8 @@  static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
 	 */
 	if (pte_none(*pte))
 		ret = true;
+	if (!pte_write(*pte) && (reason & VM_UFFD_WP))
+		ret = true;
 	pte_unmap(pte);
 
 out:
@@ -1252,10 +1257,13 @@  static __always_inline int validate_range(struct mm_struct *mm,
 	return 0;
 }
 
-static inline bool vma_can_userfault(struct vm_area_struct *vma)
+static inline bool vma_can_userfault(struct vm_area_struct *vma,
+				     unsigned long vm_flags)
 {
-	return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
-		vma_is_shmem(vma);
+	/* FIXME: add WP support to hugetlbfs and shmem */
+	return vma_is_anonymous(vma) ||
+		((is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) &&
+		 !(vm_flags & VM_UFFD_WP));
 }
 
 static int userfaultfd_register(struct userfaultfd_ctx *ctx,
@@ -1287,15 +1295,8 @@  static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 	vm_flags = 0;
 	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
 		vm_flags |= VM_UFFD_MISSING;
-	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
+	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)
 		vm_flags |= VM_UFFD_WP;
-		/*
-		 * FIXME: remove the below error constraint by
-		 * implementing the wprotect tracking mode.
-		 */
-		ret = -EINVAL;
-		goto out;
-	}
 
 	ret = validate_range(mm, uffdio_register.range.start,
 			     uffdio_register.range.len);
@@ -1343,7 +1344,7 @@  static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 
 		/* check not compatible vmas */
 		ret = -EINVAL;
-		if (!vma_can_userfault(cur))
+		if (!vma_can_userfault(cur, vm_flags))
 			goto out_unlock;
 
 		/*
@@ -1371,6 +1372,8 @@  static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 			if (end & (vma_hpagesize - 1))
 				goto out_unlock;
 		}
+		if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_WRITE))
+			goto out_unlock;
 
 		/*
 		 * Check that this vma isn't already owned by a
@@ -1400,7 +1403,7 @@  static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 	do {
 		cond_resched();
 
-		BUG_ON(!vma_can_userfault(vma));
+		BUG_ON(!vma_can_userfault(vma, vm_flags));
 		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
 		       vma->vm_userfaultfd_ctx.ctx != ctx);
 		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
@@ -1535,7 +1538,7 @@  static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 		 * provides for more strict behavior to notice
 		 * unregistration errors.
 		 */
-		if (!vma_can_userfault(cur))
+		if (!vma_can_userfault(cur, cur->vm_flags))
 			goto out_unlock;
 
 		found = true;
@@ -1549,7 +1552,7 @@  static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 	do {
 		cond_resched();
 
-		BUG_ON(!vma_can_userfault(vma));
+		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
 		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
 
 		/*
@@ -1760,6 +1763,46 @@  static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
 	return ret;
 }
 
+static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
+				    unsigned long arg)
+{
+	int ret;
+	struct uffdio_writeprotect uffdio_wp;
+	struct uffdio_writeprotect __user *user_uffdio_wp;
+	struct userfaultfd_wake_range range;
+
+	user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
+
+	if (copy_from_user(&uffdio_wp, user_uffdio_wp,
+			   sizeof(struct uffdio_writeprotect)))
+		return -EFAULT;
+
+	ret = validate_range(ctx->mm, uffdio_wp.range.start,
+			     uffdio_wp.range.len);
+	if (ret)
+		return ret;
+
+	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
+			       UFFDIO_WRITEPROTECT_MODE_WP))
+		return -EINVAL;
+	if ((uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP) &&
+	     (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE))
+		return -EINVAL;
+
+	ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
+				  uffdio_wp.range.len, uffdio_wp.mode &
+				  UFFDIO_WRITEPROTECT_MODE_WP);
+	if (ret)
+		return ret;
+
+	if (!(uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE)) {
+		range.start = uffdio_wp.range.start;
+		range.len = uffdio_wp.range.len;
+		wake_userfault(ctx, &range);
+	}
+	return ret;
+}
+
 static inline unsigned int uffd_ctx_features(__u64 user_features)
 {
 	/*
@@ -1837,6 +1880,9 @@  static long userfaultfd_ioctl(struct file *file, unsigned cmd,
 	case UFFDIO_ZEROPAGE:
 		ret = userfaultfd_zeropage(ctx, arg);
 		break;
+	case UFFDIO_WRITEPROTECT:
+		ret = userfaultfd_writeprotect(ctx, arg);
+		break;
 	}
 	return ret;
 }
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 48f1a7c2f1f0..11517f796275 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -52,6 +52,7 @@ 
 #define _UFFDIO_WAKE			(0x02)
 #define _UFFDIO_COPY			(0x03)
 #define _UFFDIO_ZEROPAGE		(0x04)
+#define _UFFDIO_WRITEPROTECT		(0x06)
 #define _UFFDIO_API			(0x3F)
 
 /* userfaultfd ioctl ids */
@@ -68,6 +69,8 @@ 
 				      struct uffdio_copy)
 #define UFFDIO_ZEROPAGE		_IOWR(UFFDIO, _UFFDIO_ZEROPAGE,	\
 				      struct uffdio_zeropage)
+#define UFFDIO_WRITEPROTECT	_IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \
+				      struct uffdio_writeprotect)
 
 /* read() structure */
 struct uffd_msg {
@@ -231,4 +234,12 @@  struct uffdio_zeropage {
 	__s64 zeropage;
 };
 
+struct uffdio_writeprotect {
+	struct uffdio_range range;
+	/* !WP means undo writeprotect. DONTWAKE is valid only with !WP */
+#define UFFDIO_WRITEPROTECT_MODE_WP		((__u64)1<<0)
+#define UFFDIO_WRITEPROTECT_MODE_DONTWAKE	((__u64)1<<1)
+	__u64 mode;
+};
+
 #endif /* _LINUX_USERFAULTFD_H */