diff mbox series

[v3,7/9] userfaultfd: add UFFDIO_CONTINUE ioctl

Message ID 20210128224819.2651899-8-axelrasmussen@google.com (mailing list archive)
State New, archived
Headers show
Series userfaultfd: add minor fault handling | expand

Commit Message

Axel Rasmussen Jan. 28, 2021, 10:48 p.m. UTC
This ioctl is how userspace ought to resolve "minor" userfaults. The
idea is, userspace is notified that a minor fault has occurred. It might
change the contents of the page using its second non-UFFD mapping, or
not. Then, it calls UFFDIO_CONTINUE to tell the kernel "I have ensured
the page contents are correct, carry on setting up the mapping".

Note that it doesn't make much sense to use UFFDIO_{COPY,ZEROPAGE} for
MINOR registered VMAs. ZEROPAGE maps the VMA to the zero page; but in
the minor fault case, we already have some pre-existing underlying page.
Likewise, UFFDIO_COPY isn't useful if we have a second non-UFFD mapping.
We'd just use memcpy() or similar instead.

It turns out hugetlb_mcopy_atomic_pte() already does very close to what
we want, if an existing page is provided via `struct page **pagep`. We
already special-case the behavior a bit for the UFFDIO_ZEROPAGE case, so
just extend that design: add an enum for the three modes of operation,
and make the small adjustments needed for the MCOPY_ATOMIC_CONTINUE
case. (Basically, look up the existing page, and avoid adding the
existing page to the page cache or calling set_page_huge_active() on
it.)

Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
---
 fs/userfaultfd.c                 | 67 +++++++++++++++++++++++++++++++
 include/linux/hugetlb.h          |  3 ++
 include/linux/userfaultfd_k.h    | 18 +++++++++
 include/uapi/linux/userfaultfd.h | 21 +++++++++-
 mm/hugetlb.c                     | 26 +++++++-----
 mm/userfaultfd.c                 | 69 +++++++++++++++++++++-----------
 6 files changed, 170 insertions(+), 34 deletions(-)

Comments

Peter Xu Feb. 1, 2021, 7:21 p.m. UTC | #1
On Thu, Jan 28, 2021 at 02:48:17PM -0800, Axel Rasmussen wrote:
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index f94a35296618..79e1f0155afa 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -135,11 +135,14 @@ void hugetlb_show_meminfo(void);
>  unsigned long hugetlb_total_pages(void);
>  vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  			unsigned long address, unsigned int flags);
> +#ifdef CONFIG_USERFAULTFD

I'm confused why this is needed.. hugetlb_mcopy_atomic_pte() should only be
called in userfaultfd.c, but if without uffd config set it won't compile
either:

        obj-$(CONFIG_USERFAULTFD) += userfaultfd.o

>  int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
>  				struct vm_area_struct *dst_vma,
>  				unsigned long dst_addr,
>  				unsigned long src_addr,
> +				enum mcopy_atomic_mode mode,
>  				struct page **pagep);
> +#endif
>  int hugetlb_reserve_pages(struct inode *inode, long from, long to,
>  						struct vm_area_struct *vma,
>  						vm_flags_t vm_flags);
> diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> index fb9abaeb4194..2fcb686211e8 100644
> --- a/include/linux/userfaultfd_k.h
> +++ b/include/linux/userfaultfd_k.h
> @@ -37,6 +37,22 @@ extern int sysctl_unprivileged_userfaultfd;
>  
>  extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
>  
> +/*
> + * The mode of operation for __mcopy_atomic and its helpers.
> + *
> + * This is almost an implementation detail (mcopy_atomic below doesn't take this
> + * as a parameter), but it's exposed here because memory-kind-specific
> + * implementations (e.g. hugetlbfs) need to know the mode of operation.
> + */
> +enum mcopy_atomic_mode {
> +	/* A normal copy_from_user into the destination range. */
> +	MCOPY_ATOMIC_NORMAL,
> +	/* Don't copy; map the destination range to the zero page. */
> +	MCOPY_ATOMIC_ZEROPAGE,
> +	/* Just setup the dst_vma, without modifying the underlying page(s). */
> +	MCOPY_ATOMIC_CONTINUE,
> +};
> +

Maybe better to keep this to where it's used, e.g. hugetlb.h where we've
defined hugetlb_mcopy_atomic_pte()?

[...]

> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 6f9d8349f818..3d318ef3d180 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -4647,6 +4647,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  	return ret;
>  }
>  
> +#ifdef CONFIG_USERFAULTFD

So I feel like you added the header ifdef for this.

IMHO we can drop both since that's what we have had.  I agree maybe it's better
to not compile that without CONFIG_USERFAULTFD but that may worth a standalone
patch anyways.

>  /*
>   * Used by userfaultfd UFFDIO_COPY.  Based on mcopy_atomic_pte with
>   * modifications for huge pages.
> @@ -4656,6 +4657,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>  			    struct vm_area_struct *dst_vma,
>  			    unsigned long dst_addr,
>  			    unsigned long src_addr,
> +			    enum mcopy_atomic_mode mode,
>  			    struct page **pagep)
>  {
>  	struct address_space *mapping;
> @@ -4668,7 +4670,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>  	int ret;
>  	struct page *page;
>  
> -	if (!*pagep) {
> +	mapping = dst_vma->vm_file->f_mapping;
> +	idx = vma_hugecache_offset(h, dst_vma, dst_addr);
> +
> +	if (!*pagep && mode != MCOPY_ATOMIC_CONTINUE) {
>  		ret = -ENOMEM;
>  		page = alloc_huge_page(dst_vma, dst_addr, 0);
>  		if (IS_ERR(page))
> @@ -4685,6 +4690,12 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>  			/* don't free the page */
>  			goto out;
>  		}
> +	} else if (mode == MCOPY_ATOMIC_CONTINUE) {
> +		ret = -EFAULT;
> +		page = find_lock_page(mapping, idx);
> +		*pagep = NULL;
> +		if (!page)
> +			goto out;
>  	} else {
>  		page = *pagep;
>  		*pagep = NULL;

I would write this as:

    if (mode == MCOPY_ATOMIC_CONTINUE)
        ...
    else if (!*pagep)
        ...
    else 
        ...

No strong opinion, but that'll look slightly cleaner to me.

[...]

> @@ -408,7 +407,7 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
>  				      unsigned long dst_start,
>  				      unsigned long src_start,
>  				      unsigned long len,
> -				      bool zeropage);
> +				      enum mcopy_atomic_mode mode);
>  #endif /* CONFIG_HUGETLB_PAGE */
>  
>  static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
> @@ -417,7 +416,7 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
>  						unsigned long dst_addr,
>  						unsigned long src_addr,
>  						struct page **page,
> -						bool zeropage,
> +						enum mcopy_atomic_mode mode,
>  						bool wp_copy)
>  {
>  	ssize_t err;
> @@ -433,22 +432,38 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
>  	 * and not in the radix tree.
>  	 */
>  	if (!(dst_vma->vm_flags & VM_SHARED)) {
> -		if (!zeropage)
> +		switch (mode) {
> +		case MCOPY_ATOMIC_NORMAL:
>  			err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
>  					       dst_addr, src_addr, page,
>  					       wp_copy);
> -		else
> +			break;
> +		case MCOPY_ATOMIC_ZEROPAGE:
>  			err = mfill_zeropage_pte(dst_mm, dst_pmd,
>  						 dst_vma, dst_addr);
> +			break;
> +		/* It only makes sense to CONTINUE for shared memory. */
> +		case MCOPY_ATOMIC_CONTINUE:
> +			err = -EINVAL;
> +			break;
> +		}
>  	} else {
>  		VM_WARN_ON_ONCE(wp_copy);
> -		if (!zeropage)
> +		switch (mode) {
> +		case MCOPY_ATOMIC_NORMAL:
>  			err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
>  						     dst_vma, dst_addr,
>  						     src_addr, page);
> -		else
> +			break;
> +		case MCOPY_ATOMIC_ZEROPAGE:
>  			err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
>  						       dst_vma, dst_addr);
> +			break;
> +		case MCOPY_ATOMIC_CONTINUE:
> +			/* FIXME: Add minor fault interception for shmem. */
> +			err = -EINVAL;
> +			break;
> +		}
>  	}
>  
>  	return err;

The whole chunk above is not needed for hugetlbfs it seems - I'd avoid touching
the anon/shmem code path until it's being supported.

What you need is probably set zeropage as below in __mcopy_atomic():

    zeropage = (mode == MCOPY_ATOMIC_ZEROPAGE);

Before passing it over to mfill_atomic_pte().  As long as we reject
UFFDIO_CONTINUE with !hugetlbfs correctly that'll be enough iiuc.

Thanks,
Axel Rasmussen Feb. 1, 2021, 10:11 p.m. UTC | #2
On Mon, Feb 1, 2021 at 11:21 AM Peter Xu <peterx@redhat.com> wrote:
>
> On Thu, Jan 28, 2021 at 02:48:17PM -0800, Axel Rasmussen wrote:
> > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > index f94a35296618..79e1f0155afa 100644
> > --- a/include/linux/hugetlb.h
> > +++ b/include/linux/hugetlb.h
> > @@ -135,11 +135,14 @@ void hugetlb_show_meminfo(void);
> >  unsigned long hugetlb_total_pages(void);
> >  vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> >                       unsigned long address, unsigned int flags);
> > +#ifdef CONFIG_USERFAULTFD
>
> I'm confused why this is needed.. hugetlb_mcopy_atomic_pte() should only be
> called in userfaultfd.c, but if without uffd config set it won't compile
> either:
>
>         obj-$(CONFIG_USERFAULTFD) += userfaultfd.o

With this series as-is, but *without* the #ifdef CONFIG_USERFAULTFD
here, we introduce a bunch of build warnings like this:



In file included from ./include/linux/migrate.h:8:0,
                 from kernel/sched/sched.h:53,
                 from kernel/sched/isolation.c:10:
./include/linux/hugetlb.h:143:12: warning: 'enum mcopy_atomic_mode'
declared inside parameter list
     struct page **pagep);
            ^
./include/linux/hugetlb.h:143:12: warning: its scope is only this
definition or declaration, which is probably not what you want

And similarly we get an error about the "mode" parameter having an
incomplete type in hugetlb.c.



This is because enum mcopy_atomic_mode is defined in userfaultfd_k.h,
and that entire header is wrapped in a #ifdef CONFIG_USERFAULTFD. So
we either need to define enum mcopy_atomic_mode unconditionally, or we
need to #ifdef CONFIG_USERFAULTFD the references to it also.

- I opted not to move it outside the #ifdef CONFIG_USERFAULTFD in
userfaultfd_k.h (defining it unconditionally), because that seemed
messy to me.
- I opted not to define it unconditionally in hugetlb.h, because we'd
have to move it to userfaultfd_k.h anyway when shmem or other users
are introduced. I'm planning to send a series to add this a few days
or so after this series is merged, so it seems churn-y to move it
then.
- It seemed optimal to not compile hugetlb_mcopy_atomic_pte anyway
(even ignoring adding the continue ioctl), since as you point out
userfaultfd is the only caller.

Hopefully this clarifies this and the next two comments. Let me know
if you still feel strongly, I don't hate any of the alternatives, just
wanted to clarify that I had considered them and thought this approach
was best.

>
> >  int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
> >                               struct vm_area_struct *dst_vma,
> >                               unsigned long dst_addr,
> >                               unsigned long src_addr,
> > +                             enum mcopy_atomic_mode mode,
> >                               struct page **pagep);
> > +#endif
> >  int hugetlb_reserve_pages(struct inode *inode, long from, long to,
> >                                               struct vm_area_struct *vma,
> >                                               vm_flags_t vm_flags);
> > diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> > index fb9abaeb4194..2fcb686211e8 100644
> > --- a/include/linux/userfaultfd_k.h
> > +++ b/include/linux/userfaultfd_k.h
> > @@ -37,6 +37,22 @@ extern int sysctl_unprivileged_userfaultfd;
> >
> >  extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
> >
> > +/*
> > + * The mode of operation for __mcopy_atomic and its helpers.
> > + *
> > + * This is almost an implementation detail (mcopy_atomic below doesn't take this
> > + * as a parameter), but it's exposed here because memory-kind-specific
> > + * implementations (e.g. hugetlbfs) need to know the mode of operation.
> > + */
> > +enum mcopy_atomic_mode {
> > +     /* A normal copy_from_user into the destination range. */
> > +     MCOPY_ATOMIC_NORMAL,
> > +     /* Don't copy; map the destination range to the zero page. */
> > +     MCOPY_ATOMIC_ZEROPAGE,
> > +     /* Just setup the dst_vma, without modifying the underlying page(s). */
> > +     MCOPY_ATOMIC_CONTINUE,
> > +};
> > +
>
> Maybe better to keep this to where it's used, e.g. hugetlb.h where we've
> defined hugetlb_mcopy_atomic_pte()?
>
> [...]
>
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 6f9d8349f818..3d318ef3d180 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -4647,6 +4647,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> >       return ret;
> >  }
> >
> > +#ifdef CONFIG_USERFAULTFD
>
> So I feel like you added the header ifdef for this.
>
> IMHO we can drop both since that's what we have had.  I agree maybe it's better
> to not compile that without CONFIG_USERFAULTFD but that may worth a standalone
> patch anyways.
>
> >  /*
> >   * Used by userfaultfd UFFDIO_COPY.  Based on mcopy_atomic_pte with
> >   * modifications for huge pages.
> > @@ -4656,6 +4657,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
> >                           struct vm_area_struct *dst_vma,
> >                           unsigned long dst_addr,
> >                           unsigned long src_addr,
> > +                         enum mcopy_atomic_mode mode,
> >                           struct page **pagep)
> >  {
> >       struct address_space *mapping;
> > @@ -4668,7 +4670,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
> >       int ret;
> >       struct page *page;
> >
> > -     if (!*pagep) {
> > +     mapping = dst_vma->vm_file->f_mapping;
> > +     idx = vma_hugecache_offset(h, dst_vma, dst_addr);
> > +
> > +     if (!*pagep && mode != MCOPY_ATOMIC_CONTINUE) {
> >               ret = -ENOMEM;
> >               page = alloc_huge_page(dst_vma, dst_addr, 0);
> >               if (IS_ERR(page))
> > @@ -4685,6 +4690,12 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
> >                       /* don't free the page */
> >                       goto out;
> >               }
> > +     } else if (mode == MCOPY_ATOMIC_CONTINUE) {
> > +             ret = -EFAULT;
> > +             page = find_lock_page(mapping, idx);
> > +             *pagep = NULL;
> > +             if (!page)
> > +                     goto out;
> >       } else {
> >               page = *pagep;
> >               *pagep = NULL;
>
> I would write this as:
>
>     if (mode == MCOPY_ATOMIC_CONTINUE)
>         ...
>     else if (!*pagep)
>         ...
>     else
>         ...
>
> No strong opinion, but that'll look slightly cleaner to me.

Agreed, I like that better as well. :)

>
> [...]
>
> > @@ -408,7 +407,7 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
> >                                     unsigned long dst_start,
> >                                     unsigned long src_start,
> >                                     unsigned long len,
> > -                                   bool zeropage);
> > +                                   enum mcopy_atomic_mode mode);
> >  #endif /* CONFIG_HUGETLB_PAGE */
> >
> >  static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
> > @@ -417,7 +416,7 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
> >                                               unsigned long dst_addr,
> >                                               unsigned long src_addr,
> >                                               struct page **page,
> > -                                             bool zeropage,
> > +                                             enum mcopy_atomic_mode mode,
> >                                               bool wp_copy)
> >  {
> >       ssize_t err;
> > @@ -433,22 +432,38 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
> >        * and not in the radix tree.
> >        */
> >       if (!(dst_vma->vm_flags & VM_SHARED)) {
> > -             if (!zeropage)
> > +             switch (mode) {
> > +             case MCOPY_ATOMIC_NORMAL:
> >                       err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
> >                                              dst_addr, src_addr, page,
> >                                              wp_copy);
> > -             else
> > +                     break;
> > +             case MCOPY_ATOMIC_ZEROPAGE:
> >                       err = mfill_zeropage_pte(dst_mm, dst_pmd,
> >                                                dst_vma, dst_addr);
> > +                     break;
> > +             /* It only makes sense to CONTINUE for shared memory. */
> > +             case MCOPY_ATOMIC_CONTINUE:
> > +                     err = -EINVAL;
> > +                     break;
> > +             }
> >       } else {
> >               VM_WARN_ON_ONCE(wp_copy);
> > -             if (!zeropage)
> > +             switch (mode) {
> > +             case MCOPY_ATOMIC_NORMAL:
> >                       err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
> >                                                    dst_vma, dst_addr,
> >                                                    src_addr, page);
> > -             else
> > +                     break;
> > +             case MCOPY_ATOMIC_ZEROPAGE:
> >                       err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
> >                                                      dst_vma, dst_addr);
> > +                     break;
> > +             case MCOPY_ATOMIC_CONTINUE:
> > +                     /* FIXME: Add minor fault interception for shmem. */
> > +                     err = -EINVAL;
> > +                     break;
> > +             }
> >       }
> >
> >       return err;
>
> The whole chunk above is not needed for hugetlbfs it seems - I'd avoid touching
> the anon/shmem code path until it's being supported.
>
> What you need is probably set zeropage as below in __mcopy_atomic():
>
>     zeropage = (mode == MCOPY_ATOMIC_ZEROPAGE);
>
> Before passing it over to mfill_atomic_pte().  As long as we reject
> UFFDIO_CONTINUE with !hugetlbfs correctly that'll be enough iiuc.

Seems reasonable to me, I'll make this change in v4.

>
> Thanks,
>
> --
> Peter Xu
>
Peter Xu Feb. 1, 2021, 10:40 p.m. UTC | #3
On Mon, Feb 01, 2021 at 02:11:55PM -0800, Axel Rasmussen wrote:
> On Mon, Feb 1, 2021 at 11:21 AM Peter Xu <peterx@redhat.com> wrote:
> >
> > On Thu, Jan 28, 2021 at 02:48:17PM -0800, Axel Rasmussen wrote:
> > > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > > index f94a35296618..79e1f0155afa 100644
> > > --- a/include/linux/hugetlb.h
> > > +++ b/include/linux/hugetlb.h
> > > @@ -135,11 +135,14 @@ void hugetlb_show_meminfo(void);
> > >  unsigned long hugetlb_total_pages(void);
> > >  vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> > >                       unsigned long address, unsigned int flags);
> > > +#ifdef CONFIG_USERFAULTFD
> >
> > I'm confused why this is needed.. hugetlb_mcopy_atomic_pte() should only be
> > called in userfaultfd.c, but if without uffd config set it won't compile
> > either:
> >
> >         obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
> 
> With this series as-is, but *without* the #ifdef CONFIG_USERFAULTFD
> here, we introduce a bunch of build warnings like this:
> 
> 
> 
> In file included from ./include/linux/migrate.h:8:0,
>                  from kernel/sched/sched.h:53,
>                  from kernel/sched/isolation.c:10:
> ./include/linux/hugetlb.h:143:12: warning: 'enum mcopy_atomic_mode'
> declared inside parameter list
>      struct page **pagep);
>             ^
> ./include/linux/hugetlb.h:143:12: warning: its scope is only this
> definition or declaration, which is probably not what you want
> 
> And similarly we get an error about the "mode" parameter having an
> incomplete type in hugetlb.c.
> 
> 
> 
> This is because enum mcopy_atomic_mode is defined in userfaultfd_k.h,
> and that entire header is wrapped in a #ifdef CONFIG_USERFAULTFD. So
> we either need to define enum mcopy_atomic_mode unconditionally, or we
> need to #ifdef CONFIG_USERFAULTFD the references to it also.
> 
> - I opted not to move it outside the #ifdef CONFIG_USERFAULTFD in
> userfaultfd_k.h (defining it unconditionally), because that seemed
> messy to me.
> - I opted not to define it unconditionally in hugetlb.h, because we'd
> have to move it to userfaultfd_k.h anyway when shmem or other users
> are introduced. I'm planning to send a series to add this a few days
> or so after this series is merged, so it seems churn-y to move it
> then.
> - It seemed optimal to not compile hugetlb_mcopy_atomic_pte anyway
> (even ignoring adding the continue ioctl), since as you point out
> userfaultfd is the only caller.
> 
> Hopefully this clarifies this and the next two comments. Let me know
> if you still feel strongly, I don't hate any of the alternatives, just
> wanted to clarify that I had considered them and thought this approach
> was best.

Then I'd suggest you use a standalone patch to put hugetlb_mcopy_atomic_pte()
into CONFIG_USERFAULTFD blocks, then propose your change with the minor mode.
Note that there're two hugetlb_mcopy_atomic_pte() defined in hugetlb.h.
Although I don't think it a problem since the other one is inlined - I think
you should still put that one into the same ifdef:

#ifdef CONFIG_USERFAULTFD
static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
						pte_t *dst_pte,
						struct vm_area_struct *dst_vma,
						unsigned long dst_addr,
						unsigned long src_addr,
						struct page **pagep)
{
	BUG();
	return 0;
}
#endif /* CONFIG_USERFAULTFD */

Let's also see whether Mike would have a preference on this.

Thanks,
Lokesh Gidra Feb. 1, 2021, 10:41 p.m. UTC | #4
On Thu, Jan 28, 2021 at 2:48 PM Axel Rasmussen <axelrasmussen@google.com> wrote:
>
> This ioctl is how userspace ought to resolve "minor" userfaults. The
> idea is, userspace is notified that a minor fault has occurred. It might
> change the contents of the page using its second non-UFFD mapping, or
> not. Then, it calls UFFDIO_CONTINUE to tell the kernel "I have ensured
> the page contents are correct, carry on setting up the mapping".
>
> Note that it doesn't make much sense to use UFFDIO_{COPY,ZEROPAGE} for
> MINOR registered VMAs. ZEROPAGE maps the VMA to the zero page; but in
> the minor fault case, we already have some pre-existing underlying page.
> Likewise, UFFDIO_COPY isn't useful if we have a second non-UFFD mapping.
> We'd just use memcpy() or similar instead.
>
> It turns out hugetlb_mcopy_atomic_pte() already does very close to what
> we want, if an existing page is provided via `struct page **pagep`. We
> already special-case the behavior a bit for the UFFDIO_ZEROPAGE case, so
> just extend that design: add an enum for the three modes of operation,
> and make the small adjustments needed for the MCOPY_ATOMIC_CONTINUE
> case. (Basically, look up the existing page, and avoid adding the
> existing page to the page cache or calling set_page_huge_active() on
> it.)
>
> Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
> ---
>  fs/userfaultfd.c                 | 67 +++++++++++++++++++++++++++++++
>  include/linux/hugetlb.h          |  3 ++
>  include/linux/userfaultfd_k.h    | 18 +++++++++
>  include/uapi/linux/userfaultfd.h | 21 +++++++++-
>  mm/hugetlb.c                     | 26 +++++++-----
>  mm/userfaultfd.c                 | 69 +++++++++++++++++++++-----------
>  6 files changed, 170 insertions(+), 34 deletions(-)
>
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 968aca3e3ee9..80a3fca389b8 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -1530,6 +1530,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
>                 if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
>                         ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);
>
> +               /* CONTINUE ioctl is only supported for MINOR ranges. */
> +               if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
> +                       ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);
> +
>                 /*
>                  * Now that we scanned all vmas we can already tell
>                  * userland which ioctls methods are guaranteed to
> @@ -1883,6 +1887,66 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
>         return ret;
>  }
>
> +static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
> +{
> +       __s64 ret;
> +       struct uffdio_continue uffdio_continue;
> +       struct uffdio_continue __user *user_uffdio_continue;
> +       struct userfaultfd_wake_range range;
> +
> +       user_uffdio_continue = (struct uffdio_continue __user *)arg;
> +
> +       ret = -EAGAIN;
> +       if (READ_ONCE(ctx->mmap_changing))
> +               goto out;
> +
> +       ret = -EFAULT;
> +       if (copy_from_user(&uffdio_continue, user_uffdio_continue,
> +                          /* don't copy the output fields */
> +                          sizeof(uffdio_continue) - (sizeof(__s64))))
> +               goto out;
> +
> +       ret = validate_range(ctx->mm, &uffdio_continue.range.start,
> +                            uffdio_continue.range.len);
> +       if (ret)
> +               goto out;
> +
> +       ret = -EINVAL;
> +       /* double check for wraparound just in case. */
> +       if (uffdio_continue.range.start + uffdio_continue.range.len <=
> +           uffdio_continue.range.start) {
> +               goto out;
> +       }
> +       if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE)
> +               goto out;
> +
> +       if (mmget_not_zero(ctx->mm)) {
> +               ret = mcopy_continue(ctx->mm, uffdio_continue.range.start,
> +                                    uffdio_continue.range.len,
> +                                    &ctx->mmap_changing);
> +               mmput(ctx->mm);
> +       } else {
> +               return -ESRCH;
> +       }
> +
> +       if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
> +               return -EFAULT;
> +       if (ret < 0)
> +               goto out;
> +
> +       /* len == 0 would wake all */
> +       BUG_ON(!ret);
> +       range.len = ret;
> +       if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
> +               range.start = uffdio_continue.range.start;
> +               wake_userfault(ctx, &range);
> +       }
> +       ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;
> +
> +out:
> +       return ret;
> +}
> +
>  static inline unsigned int uffd_ctx_features(__u64 user_features)
>  {
>         /*
> @@ -1967,6 +2031,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
>         case UFFDIO_WRITEPROTECT:
>                 ret = userfaultfd_writeprotect(ctx, arg);
>                 break;
> +       case UFFDIO_CONTINUE:
> +               ret = userfaultfd_continue(ctx, arg);
> +               break;
>         }
>         return ret;
>  }
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index f94a35296618..79e1f0155afa 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -135,11 +135,14 @@ void hugetlb_show_meminfo(void);
>  unsigned long hugetlb_total_pages(void);
>  vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>                         unsigned long address, unsigned int flags);
> +#ifdef CONFIG_USERFAULTFD
>  int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
>                                 struct vm_area_struct *dst_vma,
>                                 unsigned long dst_addr,
>                                 unsigned long src_addr,
> +                               enum mcopy_atomic_mode mode,
>                                 struct page **pagep);
> +#endif
>  int hugetlb_reserve_pages(struct inode *inode, long from, long to,
>                                                 struct vm_area_struct *vma,
>                                                 vm_flags_t vm_flags);
> diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> index fb9abaeb4194..2fcb686211e8 100644
> --- a/include/linux/userfaultfd_k.h
> +++ b/include/linux/userfaultfd_k.h
> @@ -37,6 +37,22 @@ extern int sysctl_unprivileged_userfaultfd;
>
>  extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
>
> +/*
> + * The mode of operation for __mcopy_atomic and its helpers.
> + *
> + * This is almost an implementation detail (mcopy_atomic below doesn't take this
> + * as a parameter), but it's exposed here because memory-kind-specific
> + * implementations (e.g. hugetlbfs) need to know the mode of operation.
> + */
> +enum mcopy_atomic_mode {
> +       /* A normal copy_from_user into the destination range. */
> +       MCOPY_ATOMIC_NORMAL,
> +       /* Don't copy; map the destination range to the zero page. */
> +       MCOPY_ATOMIC_ZEROPAGE,
> +       /* Just setup the dst_vma, without modifying the underlying page(s). */
> +       MCOPY_ATOMIC_CONTINUE,
> +};
> +
>  extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
>                             unsigned long src_start, unsigned long len,
>                             bool *mmap_changing, __u64 mode);
> @@ -44,6 +60,8 @@ extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
>                               unsigned long dst_start,
>                               unsigned long len,
>                               bool *mmap_changing);
> +extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start,
> +                             unsigned long len, bool *mmap_changing);
>  extern int mwriteprotect_range(struct mm_struct *dst_mm,
>                                unsigned long start, unsigned long len,
>                                bool enable_wp, bool *mmap_changing);
> diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
> index f24dd4fcbad9..bafbeb1a2624 100644
> --- a/include/uapi/linux/userfaultfd.h
> +++ b/include/uapi/linux/userfaultfd.h
> @@ -40,10 +40,12 @@
>         ((__u64)1 << _UFFDIO_WAKE |             \
>          (__u64)1 << _UFFDIO_COPY |             \
>          (__u64)1 << _UFFDIO_ZEROPAGE |         \
> -        (__u64)1 << _UFFDIO_WRITEPROTECT)
> +        (__u64)1 << _UFFDIO_WRITEPROTECT |     \
> +        (__u64)1 << _UFFDIO_CONTINUE)
>  #define UFFD_API_RANGE_IOCTLS_BASIC            \
>         ((__u64)1 << _UFFDIO_WAKE |             \
> -        (__u64)1 << _UFFDIO_COPY)
> +        (__u64)1 << _UFFDIO_COPY |             \
> +        (__u64)1 << _UFFDIO_CONTINUE)
>
>  /*
>   * Valid ioctl command number range with this API is from 0x00 to
> @@ -59,6 +61,7 @@
>  #define _UFFDIO_COPY                   (0x03)
>  #define _UFFDIO_ZEROPAGE               (0x04)
>  #define _UFFDIO_WRITEPROTECT           (0x06)
> +#define _UFFDIO_CONTINUE               (0x07)
>  #define _UFFDIO_API                    (0x3F)
>
>  /* userfaultfd ioctl ids */
> @@ -77,6 +80,8 @@
>                                       struct uffdio_zeropage)
>  #define UFFDIO_WRITEPROTECT    _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \
>                                       struct uffdio_writeprotect)
> +#define UFFDIO_CONTINUE                _IOR(UFFDIO, _UFFDIO_CONTINUE,  \
> +                                    struct uffdio_continue)
>
>  /* read() structure */
>  struct uffd_msg {
> @@ -268,6 +273,18 @@ struct uffdio_writeprotect {
>         __u64 mode;
>  };
>
> +struct uffdio_continue {
> +       struct uffdio_range range;
> +#define UFFDIO_CONTINUE_MODE_DONTWAKE          ((__u64)1<<0)
> +       __u64 mode;
> +
> +       /*
> +        * Fields below here are written by the ioctl and must be at the end:
> +        * the copy_from_user will not read past here.
> +        */
> +       __s64 mapped;
> +};
> +
>  /*
>   * Flags for the userfaultfd(2) system call itself.
>   */
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 6f9d8349f818..3d318ef3d180 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -4647,6 +4647,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>         return ret;
>  }
>
> +#ifdef CONFIG_USERFAULTFD
>  /*
>   * Used by userfaultfd UFFDIO_COPY.  Based on mcopy_atomic_pte with
>   * modifications for huge pages.
> @@ -4656,6 +4657,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>                             struct vm_area_struct *dst_vma,
>                             unsigned long dst_addr,
>                             unsigned long src_addr,
> +                           enum mcopy_atomic_mode mode,
>                             struct page **pagep)
>  {
>         struct address_space *mapping;
> @@ -4668,7 +4670,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>         int ret;
>         struct page *page;
>
> -       if (!*pagep) {
> +       mapping = dst_vma->vm_file->f_mapping;
> +       idx = vma_hugecache_offset(h, dst_vma, dst_addr);
> +
> +       if (!*pagep && mode != MCOPY_ATOMIC_CONTINUE) {
>                 ret = -ENOMEM;
>                 page = alloc_huge_page(dst_vma, dst_addr, 0);
>                 if (IS_ERR(page))
> @@ -4685,6 +4690,12 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>                         /* don't free the page */
>                         goto out;
>                 }
> +       } else if (mode == MCOPY_ATOMIC_CONTINUE) {
> +               ret = -EFAULT;
> +               page = find_lock_page(mapping, idx);
> +               *pagep = NULL;
> +               if (!page)
> +                       goto out;
>         } else {
>                 page = *pagep;
>                 *pagep = NULL;
> @@ -4697,13 +4708,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>          */
>         __SetPageUptodate(page);
>
> -       mapping = dst_vma->vm_file->f_mapping;
> -       idx = vma_hugecache_offset(h, dst_vma, dst_addr);
> -
> -       /*
> -        * If shared, add to page cache
> -        */
> -       if (vm_shared) {
> +       /* Add shared, newly allocated pages to the page cache. */
> +       if (vm_shared && mode != MCOPY_ATOMIC_CONTINUE) {
>                 size = i_size_read(mapping->host) >> huge_page_shift(h);
>                 ret = -EFAULT;
>                 if (idx >= size)
> @@ -4763,7 +4769,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>         update_mmu_cache(dst_vma, dst_addr, dst_pte);
>
>         spin_unlock(ptl);
> -       set_page_huge_active(page);
> +       if (mode != MCOPY_ATOMIC_CONTINUE)
> +               set_page_huge_active(page);
>         if (vm_shared)
>                 unlock_page(page);
>         ret = 0;
> @@ -4777,6 +4784,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>         put_page(page);
>         goto out;
>  }
> +#endif
>
>  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>                          struct page **pages, struct vm_area_struct **vmas,
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index b2ce61c1b50d..a762b9cefaea 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -207,7 +207,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
>                                               unsigned long dst_start,
>                                               unsigned long src_start,
>                                               unsigned long len,
> -                                             bool zeropage)
> +                                             enum mcopy_atomic_mode mode)
>  {
>         int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
>         int vm_shared = dst_vma->vm_flags & VM_SHARED;
> @@ -227,7 +227,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
>          * by THP.  Since we can not reliably insert a zero page, this
>          * feature is not supported.
>          */
> -       if (zeropage) {
> +       if (mode == MCOPY_ATOMIC_ZEROPAGE) {
>                 mmap_read_unlock(dst_mm);
>                 return -EINVAL;
>         }
> @@ -273,8 +273,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
>         }
>
>         while (src_addr < src_start + len) {
> -               pte_t dst_pteval;
> -
>                 BUG_ON(dst_addr >= dst_start + len);
>
>                 /*
> @@ -297,16 +295,17 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
>                         goto out_unlock;
>                 }
>
> -               err = -EEXIST;
> -               dst_pteval = huge_ptep_get(dst_pte);
> -               if (!huge_pte_none(dst_pteval)) {
> -                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
> -                       i_mmap_unlock_read(mapping);
> -                       goto out_unlock;
> +               if (mode != MCOPY_ATOMIC_CONTINUE) {
> +                       if (!huge_pte_none(huge_ptep_get(dst_pte))) {
> +                               err = -EEXIST;
> +                               mutex_unlock(&hugetlb_fault_mutex_table[hash]);
> +                               i_mmap_unlock_read(mapping);
> +                               goto out_unlock;
> +                       }
>                 }
>
>                 err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
> -                                               dst_addr, src_addr, &page);
> +                                              dst_addr, src_addr, mode, &page);
>
>                 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
>                 i_mmap_unlock_read(mapping);
> @@ -408,7 +407,7 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
>                                       unsigned long dst_start,
>                                       unsigned long src_start,
>                                       unsigned long len,
> -                                     bool zeropage);
> +                                     enum mcopy_atomic_mode mode);
>  #endif /* CONFIG_HUGETLB_PAGE */
>
>  static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
> @@ -417,7 +416,7 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
>                                                 unsigned long dst_addr,
>                                                 unsigned long src_addr,
>                                                 struct page **page,
> -                                               bool zeropage,
> +                                               enum mcopy_atomic_mode mode,
>                                                 bool wp_copy)
>  {
>         ssize_t err;
> @@ -433,22 +432,38 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
>          * and not in the radix tree.
>          */
>         if (!(dst_vma->vm_flags & VM_SHARED)) {
> -               if (!zeropage)
> +               switch (mode) {
> +               case MCOPY_ATOMIC_NORMAL:
>                         err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
>                                                dst_addr, src_addr, page,
>                                                wp_copy);
> -               else
> +                       break;
> +               case MCOPY_ATOMIC_ZEROPAGE:
>                         err = mfill_zeropage_pte(dst_mm, dst_pmd,
>                                                  dst_vma, dst_addr);
> +                       break;
> +               /* It only makes sense to CONTINUE for shared memory. */
> +               case MCOPY_ATOMIC_CONTINUE:
> +                       err = -EINVAL;
> +                       break;
> +               }
>         } else {
>                 VM_WARN_ON_ONCE(wp_copy);
> -               if (!zeropage)
> +               switch (mode) {
> +               case MCOPY_ATOMIC_NORMAL:
>                         err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
>                                                      dst_vma, dst_addr,
>                                                      src_addr, page);
> -               else
> +                       break;
> +               case MCOPY_ATOMIC_ZEROPAGE:
>                         err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
>                                                        dst_vma, dst_addr);
> +                       break;
> +               case MCOPY_ATOMIC_CONTINUE:
> +                       /* FIXME: Add minor fault interception for shmem. */

I hope you plan to implement the non-hugepage case soon. Looking
forward to using the feature. :)
> +                       err = -EINVAL;
> +                       break;
> +               }
>         }
>
>         return err;
> @@ -458,7 +473,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
>                                               unsigned long dst_start,
>                                               unsigned long src_start,
>                                               unsigned long len,
> -                                             bool zeropage,
> +                                             enum mcopy_atomic_mode mcopy_mode,
>                                               bool *mmap_changing,
>                                               __u64 mode)
>  {
> @@ -527,7 +542,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
>          */
>         if (is_vm_hugetlb_page(dst_vma))
>                 return  __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
> -                                               src_start, len, zeropage);
> +                                               src_start, len, mcopy_mode);
>
>         if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
>                 goto out_unlock;
> @@ -577,7 +592,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
>                 BUG_ON(pmd_trans_huge(*dst_pmd));
>
>                 err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
> -                                      src_addr, &page, zeropage, wp_copy);
> +                                      src_addr, &page, mcopy_mode, wp_copy);
>                 cond_resched();
>
>                 if (unlikely(err == -ENOENT)) {
> @@ -626,14 +641,22 @@ ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
>                      unsigned long src_start, unsigned long len,
>                      bool *mmap_changing, __u64 mode)
>  {
> -       return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
> -                             mmap_changing, mode);
> +       return __mcopy_atomic(dst_mm, dst_start, src_start, len,
> +                             MCOPY_ATOMIC_NORMAL, mmap_changing, mode);
>  }
>
>  ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
>                        unsigned long len, bool *mmap_changing)
>  {
> -       return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing, 0);
> +       return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
> +                             mmap_changing, 0);
> +}
> +
> +ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
> +                      unsigned long len, bool *mmap_changing)
> +{
> +       return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
> +                             mmap_changing, 0);
>  }
>
>  int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
> --
> 2.30.0.365.g02bc693789-goog
>
Mike Kravetz Feb. 1, 2021, 11:42 p.m. UTC | #5
On 2/1/21 2:40 PM, Peter Xu wrote:
> On Mon, Feb 01, 2021 at 02:11:55PM -0800, Axel Rasmussen wrote:
>> On Mon, Feb 1, 2021 at 11:21 AM Peter Xu <peterx@redhat.com> wrote:
>>>
>>> On Thu, Jan 28, 2021 at 02:48:17PM -0800, Axel Rasmussen wrote:
>>>> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
>>>> index f94a35296618..79e1f0155afa 100644
>>>> --- a/include/linux/hugetlb.h
>>>> +++ b/include/linux/hugetlb.h
>>>> @@ -135,11 +135,14 @@ void hugetlb_show_meminfo(void);
>>>>  unsigned long hugetlb_total_pages(void);
>>>>  vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>>>>                       unsigned long address, unsigned int flags);
>>>> +#ifdef CONFIG_USERFAULTFD
>>>
>>> I'm confused why this is needed.. hugetlb_mcopy_atomic_pte() should only be
>>> called in userfaultfd.c, but if without uffd config set it won't compile
>>> either:
>>>
>>>         obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
>>
>> With this series as-is, but *without* the #ifdef CONFIG_USERFAULTFD
>> here, we introduce a bunch of build warnings like this:
>>
>>
>>
>> In file included from ./include/linux/migrate.h:8:0,
>>                  from kernel/sched/sched.h:53,
>>                  from kernel/sched/isolation.c:10:
>> ./include/linux/hugetlb.h:143:12: warning: 'enum mcopy_atomic_mode'
>> declared inside parameter list
>>      struct page **pagep);
>>             ^
>> ./include/linux/hugetlb.h:143:12: warning: its scope is only this
>> definition or declaration, which is probably not what you want
>>
>> And similarly we get an error about the "mode" parameter having an
>> incomplete type in hugetlb.c.
>>
>>
>>
>> This is because enum mcopy_atomic_mode is defined in userfaultfd_k.h,
>> and that entire header is wrapped in a #ifdef CONFIG_USERFAULTFD. So
>> we either need to define enum mcopy_atomic_mode unconditionally, or we
>> need to #ifdef CONFIG_USERFAULTFD the references to it also.
>>
>> - I opted not to move it outside the #ifdef CONFIG_USERFAULTFD in
>> userfaultfd_k.h (defining it unconditionally), because that seemed
>> messy to me.
>> - I opted not to define it unconditionally in hugetlb.h, because we'd
>> have to move it to userfaultfd_k.h anyway when shmem or other users
>> are introduced. I'm planning to send a series to add this a few days
>> or so after this series is merged, so it seems churn-y to move it
>> then.
>> - It seemed optimal to not compile hugetlb_mcopy_atomic_pte anyway
>> (even ignoring adding the continue ioctl), since as you point out
>> userfaultfd is the only caller.
>>
>> Hopefully this clarifies this and the next two comments. Let me know
>> if you still feel strongly, I don't hate any of the alternatives, just
>> wanted to clarify that I had considered them and thought this approach
>> was best.
> 
> Then I'd suggest you use a standalone patch to put hugetlb_mcopy_atomic_pte()
> into CONFIG_USERFAULTFD blocks, then propose your change with the minor mode.
> Note that there're two hugetlb_mcopy_atomic_pte() defined in hugetlb.h.
> Although I don't think it a problem since the other one is inlined - I think
> you should still put that one into the same ifdef:
> 
> #ifdef CONFIG_USERFAULTFD
> static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
> 						pte_t *dst_pte,
> 						struct vm_area_struct *dst_vma,
> 						unsigned long dst_addr,
> 						unsigned long src_addr,
> 						struct page **pagep)
> {
> 	BUG();
> 	return 0;
> }
> #endif /* CONFIG_USERFAULTFD */
> 
> Let's also see whether Mike would have a preference on this.
> 

No real preference.  Just need to fix up the argument list in that
second definition.
diff mbox series

Patch

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 968aca3e3ee9..80a3fca389b8 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1530,6 +1530,10 @@  static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
 			ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);
 
+		/* CONTINUE ioctl is only supported for MINOR ranges. */
+		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
+			ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);
+
 		/*
 		 * Now that we scanned all vmas we can already tell
 		 * userland which ioctls methods are guaranteed to
@@ -1883,6 +1887,66 @@  static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
 	return ret;
 }
 
+static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
+{
+	__s64 ret;
+	struct uffdio_continue uffdio_continue;
+	struct uffdio_continue __user *user_uffdio_continue;
+	struct userfaultfd_wake_range range;
+
+	user_uffdio_continue = (struct uffdio_continue __user *)arg;
+
+	ret = -EAGAIN;
+	if (READ_ONCE(ctx->mmap_changing))
+		goto out;
+
+	ret = -EFAULT;
+	if (copy_from_user(&uffdio_continue, user_uffdio_continue,
+			   /* don't copy the output fields */
+			   sizeof(uffdio_continue) - (sizeof(__s64))))
+		goto out;
+
+	ret = validate_range(ctx->mm, &uffdio_continue.range.start,
+			     uffdio_continue.range.len);
+	if (ret)
+		goto out;
+
+	ret = -EINVAL;
+	/* double check for wraparound just in case. */
+	if (uffdio_continue.range.start + uffdio_continue.range.len <=
+	    uffdio_continue.range.start) {
+		goto out;
+	}
+	if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE)
+		goto out;
+
+	if (mmget_not_zero(ctx->mm)) {
+		ret = mcopy_continue(ctx->mm, uffdio_continue.range.start,
+				     uffdio_continue.range.len,
+				     &ctx->mmap_changing);
+		mmput(ctx->mm);
+	} else {
+		return -ESRCH;
+	}
+
+	if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
+		return -EFAULT;
+	if (ret < 0)
+		goto out;
+
+	/* len == 0 would wake all */
+	BUG_ON(!ret);
+	range.len = ret;
+	if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
+		range.start = uffdio_continue.range.start;
+		wake_userfault(ctx, &range);
+	}
+	ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;
+
+out:
+	return ret;
+}
+
 static inline unsigned int uffd_ctx_features(__u64 user_features)
 {
 	/*
@@ -1967,6 +2031,9 @@  static long userfaultfd_ioctl(struct file *file, unsigned cmd,
 	case UFFDIO_WRITEPROTECT:
 		ret = userfaultfd_writeprotect(ctx, arg);
 		break;
+	case UFFDIO_CONTINUE:
+		ret = userfaultfd_continue(ctx, arg);
+		break;
 	}
 	return ret;
 }
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index f94a35296618..79e1f0155afa 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -135,11 +135,14 @@  void hugetlb_show_meminfo(void);
 unsigned long hugetlb_total_pages(void);
 vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, unsigned int flags);
+#ifdef CONFIG_USERFAULTFD
 int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
 				struct vm_area_struct *dst_vma,
 				unsigned long dst_addr,
 				unsigned long src_addr,
+				enum mcopy_atomic_mode mode,
 				struct page **pagep);
+#endif
 int hugetlb_reserve_pages(struct inode *inode, long from, long to,
 						struct vm_area_struct *vma,
 						vm_flags_t vm_flags);
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index fb9abaeb4194..2fcb686211e8 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -37,6 +37,22 @@  extern int sysctl_unprivileged_userfaultfd;
 
 extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
 
+/*
+ * The mode of operation for __mcopy_atomic and its helpers.
+ *
+ * This is almost an implementation detail (mcopy_atomic below doesn't take this
+ * as a parameter), but it's exposed here because memory-kind-specific
+ * implementations (e.g. hugetlbfs) need to know the mode of operation.
+ */
+enum mcopy_atomic_mode {
+	/* A normal copy_from_user into the destination range. */
+	MCOPY_ATOMIC_NORMAL,
+	/* Don't copy; map the destination range to the zero page. */
+	MCOPY_ATOMIC_ZEROPAGE,
+	/* Just setup the dst_vma, without modifying the underlying page(s). */
+	MCOPY_ATOMIC_CONTINUE,
+};
+
 extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
 			    unsigned long src_start, unsigned long len,
 			    bool *mmap_changing, __u64 mode);
@@ -44,6 +60,8 @@  extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
 			      unsigned long dst_start,
 			      unsigned long len,
 			      bool *mmap_changing);
+extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start,
+			      unsigned long len, bool *mmap_changing);
 extern int mwriteprotect_range(struct mm_struct *dst_mm,
 			       unsigned long start, unsigned long len,
 			       bool enable_wp, bool *mmap_changing);
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index f24dd4fcbad9..bafbeb1a2624 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -40,10 +40,12 @@ 
 	((__u64)1 << _UFFDIO_WAKE |		\
 	 (__u64)1 << _UFFDIO_COPY |		\
 	 (__u64)1 << _UFFDIO_ZEROPAGE |		\
-	 (__u64)1 << _UFFDIO_WRITEPROTECT)
+	 (__u64)1 << _UFFDIO_WRITEPROTECT |	\
+	 (__u64)1 << _UFFDIO_CONTINUE)
 #define UFFD_API_RANGE_IOCTLS_BASIC		\
 	((__u64)1 << _UFFDIO_WAKE |		\
-	 (__u64)1 << _UFFDIO_COPY)
+	 (__u64)1 << _UFFDIO_COPY |		\
+	 (__u64)1 << _UFFDIO_CONTINUE)
 
 /*
  * Valid ioctl command number range with this API is from 0x00 to
@@ -59,6 +61,7 @@ 
 #define _UFFDIO_COPY			(0x03)
 #define _UFFDIO_ZEROPAGE		(0x04)
 #define _UFFDIO_WRITEPROTECT		(0x06)
+#define _UFFDIO_CONTINUE		(0x07)
 #define _UFFDIO_API			(0x3F)
 
 /* userfaultfd ioctl ids */
@@ -77,6 +80,8 @@ 
 				      struct uffdio_zeropage)
 #define UFFDIO_WRITEPROTECT	_IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \
 				      struct uffdio_writeprotect)
+#define UFFDIO_CONTINUE		_IOR(UFFDIO, _UFFDIO_CONTINUE,	\
+				     struct uffdio_continue)
 
 /* read() structure */
 struct uffd_msg {
@@ -268,6 +273,18 @@  struct uffdio_writeprotect {
 	__u64 mode;
 };
 
+struct uffdio_continue {
+	struct uffdio_range range;
+#define UFFDIO_CONTINUE_MODE_DONTWAKE		((__u64)1<<0)
+	__u64 mode;
+
+	/*
+	 * Fields below here are written by the ioctl and must be at the end:
+	 * the copy_from_user will not read past here.
+	 */
+	__s64 mapped;
+};
+
 /*
  * Flags for the userfaultfd(2) system call itself.
  */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6f9d8349f818..3d318ef3d180 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4647,6 +4647,7 @@  vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	return ret;
 }
 
+#ifdef CONFIG_USERFAULTFD
 /*
  * Used by userfaultfd UFFDIO_COPY.  Based on mcopy_atomic_pte with
  * modifications for huge pages.
@@ -4656,6 +4657,7 @@  int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 			    struct vm_area_struct *dst_vma,
 			    unsigned long dst_addr,
 			    unsigned long src_addr,
+			    enum mcopy_atomic_mode mode,
 			    struct page **pagep)
 {
 	struct address_space *mapping;
@@ -4668,7 +4670,10 @@  int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	int ret;
 	struct page *page;
 
-	if (!*pagep) {
+	mapping = dst_vma->vm_file->f_mapping;
+	idx = vma_hugecache_offset(h, dst_vma, dst_addr);
+
+	if (!*pagep && mode != MCOPY_ATOMIC_CONTINUE) {
 		ret = -ENOMEM;
 		page = alloc_huge_page(dst_vma, dst_addr, 0);
 		if (IS_ERR(page))
@@ -4685,6 +4690,12 @@  int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 			/* don't free the page */
 			goto out;
 		}
+	} else if (mode == MCOPY_ATOMIC_CONTINUE) {
+		ret = -EFAULT;
+		page = find_lock_page(mapping, idx);
+		*pagep = NULL;
+		if (!page)
+			goto out;
 	} else {
 		page = *pagep;
 		*pagep = NULL;
@@ -4697,13 +4708,8 @@  int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	 */
 	__SetPageUptodate(page);
 
-	mapping = dst_vma->vm_file->f_mapping;
-	idx = vma_hugecache_offset(h, dst_vma, dst_addr);
-
-	/*
-	 * If shared, add to page cache
-	 */
-	if (vm_shared) {
+	/* Add shared, newly allocated pages to the page cache. */
+	if (vm_shared && mode != MCOPY_ATOMIC_CONTINUE) {
 		size = i_size_read(mapping->host) >> huge_page_shift(h);
 		ret = -EFAULT;
 		if (idx >= size)
@@ -4763,7 +4769,8 @@  int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	update_mmu_cache(dst_vma, dst_addr, dst_pte);
 
 	spin_unlock(ptl);
-	set_page_huge_active(page);
+	if (mode != MCOPY_ATOMIC_CONTINUE)
+		set_page_huge_active(page);
 	if (vm_shared)
 		unlock_page(page);
 	ret = 0;
@@ -4777,6 +4784,7 @@  int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	put_page(page);
 	goto out;
 }
+#endif
 
 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			 struct page **pages, struct vm_area_struct **vmas,
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index b2ce61c1b50d..a762b9cefaea 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -207,7 +207,7 @@  static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 					      unsigned long dst_start,
 					      unsigned long src_start,
 					      unsigned long len,
-					      bool zeropage)
+					      enum mcopy_atomic_mode mode)
 {
 	int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
 	int vm_shared = dst_vma->vm_flags & VM_SHARED;
@@ -227,7 +227,7 @@  static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 	 * by THP.  Since we can not reliably insert a zero page, this
 	 * feature is not supported.
 	 */
-	if (zeropage) {
+	if (mode == MCOPY_ATOMIC_ZEROPAGE) {
 		mmap_read_unlock(dst_mm);
 		return -EINVAL;
 	}
@@ -273,8 +273,6 @@  static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 	}
 
 	while (src_addr < src_start + len) {
-		pte_t dst_pteval;
-
 		BUG_ON(dst_addr >= dst_start + len);
 
 		/*
@@ -297,16 +295,17 @@  static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 			goto out_unlock;
 		}
 
-		err = -EEXIST;
-		dst_pteval = huge_ptep_get(dst_pte);
-		if (!huge_pte_none(dst_pteval)) {
-			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-			i_mmap_unlock_read(mapping);
-			goto out_unlock;
+		if (mode != MCOPY_ATOMIC_CONTINUE) {
+			if (!huge_pte_none(huge_ptep_get(dst_pte))) {
+				err = -EEXIST;
+				mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+				i_mmap_unlock_read(mapping);
+				goto out_unlock;
+			}
 		}
 
 		err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
-						dst_addr, src_addr, &page);
+					       dst_addr, src_addr, mode, &page);
 
 		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 		i_mmap_unlock_read(mapping);
@@ -408,7 +407,7 @@  extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 				      unsigned long dst_start,
 				      unsigned long src_start,
 				      unsigned long len,
-				      bool zeropage);
+				      enum mcopy_atomic_mode mode);
 #endif /* CONFIG_HUGETLB_PAGE */
 
 static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
@@ -417,7 +416,7 @@  static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
 						unsigned long dst_addr,
 						unsigned long src_addr,
 						struct page **page,
-						bool zeropage,
+						enum mcopy_atomic_mode mode,
 						bool wp_copy)
 {
 	ssize_t err;
@@ -433,22 +432,38 @@  static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
 	 * and not in the radix tree.
 	 */
 	if (!(dst_vma->vm_flags & VM_SHARED)) {
-		if (!zeropage)
+		switch (mode) {
+		case MCOPY_ATOMIC_NORMAL:
 			err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
 					       dst_addr, src_addr, page,
 					       wp_copy);
-		else
+			break;
+		case MCOPY_ATOMIC_ZEROPAGE:
 			err = mfill_zeropage_pte(dst_mm, dst_pmd,
 						 dst_vma, dst_addr);
+			break;
+		/* It only makes sense to CONTINUE for shared memory. */
+		case MCOPY_ATOMIC_CONTINUE:
+			err = -EINVAL;
+			break;
+		}
 	} else {
 		VM_WARN_ON_ONCE(wp_copy);
-		if (!zeropage)
+		switch (mode) {
+		case MCOPY_ATOMIC_NORMAL:
 			err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
 						     dst_vma, dst_addr,
 						     src_addr, page);
-		else
+			break;
+		case MCOPY_ATOMIC_ZEROPAGE:
 			err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
 						       dst_vma, dst_addr);
+			break;
+		case MCOPY_ATOMIC_CONTINUE:
+			/* FIXME: Add minor fault interception for shmem. */
+			err = -EINVAL;
+			break;
+		}
 	}
 
 	return err;
@@ -458,7 +473,7 @@  static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 					      unsigned long dst_start,
 					      unsigned long src_start,
 					      unsigned long len,
-					      bool zeropage,
+					      enum mcopy_atomic_mode mcopy_mode,
 					      bool *mmap_changing,
 					      __u64 mode)
 {
@@ -527,7 +542,7 @@  static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 	 */
 	if (is_vm_hugetlb_page(dst_vma))
 		return  __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
-						src_start, len, zeropage);
+						src_start, len, mcopy_mode);
 
 	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
 		goto out_unlock;
@@ -577,7 +592,7 @@  static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 		BUG_ON(pmd_trans_huge(*dst_pmd));
 
 		err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
-				       src_addr, &page, zeropage, wp_copy);
+				       src_addr, &page, mcopy_mode, wp_copy);
 		cond_resched();
 
 		if (unlikely(err == -ENOENT)) {
@@ -626,14 +641,22 @@  ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
 		     unsigned long src_start, unsigned long len,
 		     bool *mmap_changing, __u64 mode)
 {
-	return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
-			      mmap_changing, mode);
+	return __mcopy_atomic(dst_mm, dst_start, src_start, len,
+			      MCOPY_ATOMIC_NORMAL, mmap_changing, mode);
 }
 
 ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
 		       unsigned long len, bool *mmap_changing)
 {
-	return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing, 0);
+	return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
+			      mmap_changing, 0);
+}
+
+ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
+		       unsigned long len, bool *mmap_changing)
+{
+	return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
+			      mmap_changing, 0);
 }
 
 int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,