
[16/23] hugetlb/userfaultfd: Take care of UFFDIO_COPY_MODE_WP

Message ID 20210323004912.35132-17-peterx@redhat.com (mailing list archive)
State New, archived
Series userfaultfd-wp: Support shmem and hugetlbfs

Commit Message

Peter Xu March 23, 2021, 12:49 a.m. UTC
Firstly, pass the wp_copy variable into hugetlb_mcopy_atomic_pte() throughout
the stack.  Then, apply the UFFD_WP bit if UFFDIO_COPY_MODE_WP is specified
with UFFDIO_COPY.  Introduce huge_pte_mkuffd_wp() for it.

Note that similar to how we've handled shmem, we'd better keep setting the
dirty bit even if UFFDIO_COPY_MODE_WP is provided, so that the core mm will
know this page contains valid data and never drop it.
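
For context, the userspace side this enables looks roughly like below (an
illustrative sketch, not part of this series; it assumes "uffd" is a
userfaultfd registered over a hugetlbfs range with
UFFDIO_REGISTER_MODE_MISSING | UFFDIO_REGISTER_MODE_WP):

#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

static int uffd_copy_wp(int uffd, void *dst, const void *src,
			unsigned long len)
{
	struct uffdio_copy copy = {
		.dst  = (unsigned long)dst,	/* hugepage-aligned destination */
		.src  = (unsigned long)src,	/* buffer holding the page data */
		.len  = len,			/* multiple of the huge page size */
		.mode = UFFDIO_COPY_MODE_WP,	/* resolve the fault, but keep
						 * the pte write-protected */
	};

	/* On return, copy.copy holds the number of bytes copied (or -errno) */
	return ioctl(uffd, UFFDIO_COPY, &copy);
}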

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 include/asm-generic/hugetlb.h |  5 +++++
 include/linux/hugetlb.h       |  6 ++++--
 mm/hugetlb.c                  | 22 +++++++++++++++++-----
 mm/userfaultfd.c              | 12 ++++++++----
 4 files changed, 34 insertions(+), 11 deletions(-)

Comments

Mike Kravetz April 21, 2021, 11:06 p.m. UTC | #1
On 3/22/21 5:49 PM, Peter Xu wrote:
> Firstly, pass the wp_copy variable into hugetlb_mcopy_atomic_pte() throughout
> the stack.  Then, apply the UFFD_WP bit if UFFDIO_COPY_MODE_WP is specified
> with UFFDIO_COPY.  Introduce huge_pte_mkuffd_wp() for it.
> 
> Note that similar to how we've handled shmem, we'd better keep setting the
> dirty bit even if UFFDIO_COPY_MODE_WP is provided, so that the core mm will
> know this page contains valid data and never drop it.

There is nothing wrong with setting the dirty bit in this manner to be
consistent.  But, since hugetlb pages are only managed by hugetlbfs, the
core mm will not drop them.

> 
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  include/asm-generic/hugetlb.h |  5 +++++
>  include/linux/hugetlb.h       |  6 ++++--
>  mm/hugetlb.c                  | 22 +++++++++++++++++-----
>  mm/userfaultfd.c              | 12 ++++++++----
>  4 files changed, 34 insertions(+), 11 deletions(-)
> 
> diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h
> index 8e1e6244a89d..548212eccbd6 100644
> --- a/include/asm-generic/hugetlb.h
> +++ b/include/asm-generic/hugetlb.h
> @@ -27,6 +27,11 @@ static inline pte_t huge_pte_mkdirty(pte_t pte)
>  	return pte_mkdirty(pte);
>  }
>  
> +static inline pte_t huge_pte_mkuffd_wp(pte_t pte)
> +{
> +	return pte_mkuffd_wp(pte);
> +}
> +

Just want to verify that userfaultfd wp support is only enabled for
x86_64 now?  I only ask because there are arch-specific hugetlb pte
manipulation routines for some architectures.

>  static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot)
>  {
>  	return pte_modify(pte, newprot);
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index a7f7d5f328dc..ef8d2b8427b1 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -141,7 +141,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
>  				unsigned long dst_addr,
>  				unsigned long src_addr,
>  				enum mcopy_atomic_mode mode,
> -				struct page **pagep);
> +				struct page **pagep,
> +				bool wp_copy);
>  #endif /* CONFIG_USERFAULTFD */
>  bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
>  						struct vm_area_struct *vma,
> @@ -321,7 +322,8 @@ static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>  						unsigned long dst_addr,
>  						unsigned long src_addr,
>  						enum mcopy_atomic_mode mode,
> -						struct page **pagep)
> +						struct page **pagep,
> +						bool wp_copy)
>  {
>  	BUG();
>  	return 0;
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index def2c7ddf3ae..f0e55b341ebd 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -4725,7 +4725,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>  			    unsigned long dst_addr,
>  			    unsigned long src_addr,
>  			    enum mcopy_atomic_mode mode,
> -			    struct page **pagep)
> +			    struct page **pagep,
> +			    bool wp_copy)
>  {
>  	bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
>  	struct address_space *mapping;
> @@ -4822,17 +4823,28 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>  		hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
>  	}
>  
> -	/* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */
> -	if (is_continue && !vm_shared)
> +	/*
> +	 * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
> +	 * with wp flag set, don't set pte write bit.
> +	 */
> +	if (wp_copy || (is_continue && !vm_shared))
>  		writable = 0;
>  	else
>  		writable = dst_vma->vm_flags & VM_WRITE;
>  
>  	_dst_pte = make_huge_pte(dst_vma, page, writable);
> -	if (writable)
> -		_dst_pte = huge_pte_mkdirty(_dst_pte);
> +	/*
> +	 * Always mark UFFDIO_COPY page dirty; note that this may not be
> +	 * extremely important for hugetlbfs for now since swapping is not
> +	 * supported, but we should still be clear in that this page cannot be
> +	 * thrown away at will, even if write bit not set.

As mentioned earlier, there should not be any issue with hugetlb pages
being thrown away without the dirty bit set.  Perhaps the comment should
reflect that this is mostly for consistency.

Note to self: this may help when I get back to hugetlb soft dirty
support.

Other than that, patch looks good.
Peter Xu April 22, 2021, 1:14 a.m. UTC | #2
On Wed, Apr 21, 2021 at 04:06:39PM -0700, Mike Kravetz wrote:
> On 3/22/21 5:49 PM, Peter Xu wrote:
> > Firstly, pass the wp_copy variable into hugetlb_mcopy_atomic_pte() throughout
> > the stack.  Then, apply the UFFD_WP bit if UFFDIO_COPY_MODE_WP is specified
> > with UFFDIO_COPY.  Introduce huge_pte_mkuffd_wp() for it.
> > 
> > Note that similar to how we've handled shmem, we'd better keep setting the
> > dirty bit even if UFFDIO_COPY_MODE_WP is provided, so that the core mm will
> > know this page contains valid data and never drop it.
> 
> There is nothing wrong with setting the dirty bit in this manner to be
> consistent.  But, since hugetlb pages are only managed by hugetlbfs, the
> core mm will not drop them.

Right, for this paragraph, how about I rephrase it as below?

  Hugetlb pages are only managed by hugetlbfs, so we're safe even without
  setting the dirty bit in the huge pte if the page is installed as read-only.
  However, we'd better still keep the dirty bit set for a read-only UFFDIO_COPY
  pte (when the UFFDIO_COPY_MODE_WP bit is set), not only to match what we do
  with shmem, but also because the page does contain dirty data that the kernel
  just copied from userspace.

> 
> > 
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> >  include/asm-generic/hugetlb.h |  5 +++++
> >  include/linux/hugetlb.h       |  6 ++++--
> >  mm/hugetlb.c                  | 22 +++++++++++++++++-----
> >  mm/userfaultfd.c              | 12 ++++++++----
> >  4 files changed, 34 insertions(+), 11 deletions(-)
> > 
> > diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h
> > index 8e1e6244a89d..548212eccbd6 100644
> > --- a/include/asm-generic/hugetlb.h
> > +++ b/include/asm-generic/hugetlb.h
> > @@ -27,6 +27,11 @@ static inline pte_t huge_pte_mkdirty(pte_t pte)
> >  	return pte_mkdirty(pte);
> >  }
> >  
> > +static inline pte_t huge_pte_mkuffd_wp(pte_t pte)
> > +{
> > +	return pte_mkuffd_wp(pte);
> > +}
> > +
> 
> Just want to verify that userfaultfd wp support is only enabled for
> x86_64 now?  I only ask because there are arch-specific hugetlb pte
> manipulation routines for some architectures.

Yes, it's x86_64-only, as we have:

	select HAVE_ARCH_USERFAULTFD_WP         if X86_64 && USERFAULTFD

Here the helper defines the huge pte uffd-wp bit to be the same as the
pte-level bit, across all archs.  However, it should be fine, since for any
arch that does not support uffd-wp it'll be a no-op:

static __always_inline pte_t pte_mkuffd_wp(pte_t pte)
{
	return pte;
}

Meanwhile, it shouldn't be called anywhere either.

I could also define __HAVE_ARCH_HUGE_PTE_MKUFFD_WP here, but I didn't, as I
thought the above should be enough.  We can define it when it becomes really
necessary.
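
If it ever does become necessary, it would just follow the usual asm-generic
convention (a hypothetical sketch, not in this patch): wrap the generic
version in a guard,

#ifndef __HAVE_ARCH_HUGE_PTE_MKUFFD_WP
static inline pte_t huge_pte_mkuffd_wp(pte_t pte)
{
	/* generic fallback: reuse the pte-level uffd-wp helper */
	return pte_mkuffd_wp(pte);
}
#endif

so that an arch can define __HAVE_ARCH_HUGE_PTE_MKUFFD_WP and provide its own
huge_pte_mkuffd_wp() in its asm/hugetlb.h before including the generic header.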

> 
> >  static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot)
> >  {
> >  	return pte_modify(pte, newprot);
> > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > index a7f7d5f328dc..ef8d2b8427b1 100644
> > --- a/include/linux/hugetlb.h
> > +++ b/include/linux/hugetlb.h
> > @@ -141,7 +141,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
> >  				unsigned long dst_addr,
> >  				unsigned long src_addr,
> >  				enum mcopy_atomic_mode mode,
> > -				struct page **pagep);
> > +				struct page **pagep,
> > +				bool wp_copy);
> >  #endif /* CONFIG_USERFAULTFD */
> >  bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
> >  						struct vm_area_struct *vma,
> > @@ -321,7 +322,8 @@ static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
> >  						unsigned long dst_addr,
> >  						unsigned long src_addr,
> >  						enum mcopy_atomic_mode mode,
> > -						struct page **pagep)
> > +						struct page **pagep,
> > +						bool wp_copy)
> >  {
> >  	BUG();
> >  	return 0;
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index def2c7ddf3ae..f0e55b341ebd 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -4725,7 +4725,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
> >  			    unsigned long dst_addr,
> >  			    unsigned long src_addr,
> >  			    enum mcopy_atomic_mode mode,
> > -			    struct page **pagep)
> > +			    struct page **pagep,
> > +			    bool wp_copy)
> >  {
> >  	bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
> >  	struct address_space *mapping;
> > @@ -4822,17 +4823,28 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
> >  		hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
> >  	}
> >  
> > -	/* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */
> > -	if (is_continue && !vm_shared)
> > +	/*
> > +	 * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
> > +	 * with wp flag set, don't set pte write bit.
> > +	 */
> > +	if (wp_copy || (is_continue && !vm_shared))
> >  		writable = 0;
> >  	else
> >  		writable = dst_vma->vm_flags & VM_WRITE;
> >  
> >  	_dst_pte = make_huge_pte(dst_vma, page, writable);
> > -	if (writable)
> > -		_dst_pte = huge_pte_mkdirty(_dst_pte);
> > +	/*
> > +	 * Always mark UFFDIO_COPY page dirty; note that this may not be
> > +	 * extremely important for hugetlbfs for now since swapping is not
> > +	 * supported, but we should still be clear in that this page cannot be
> > +	 * thrown away at will, even if write bit not set.
> 
> As mentioned earlier, there should not be any issue with hugetlb pages
> being thrown away without the dirty bit set.  Perhaps the comment should
> reflect that this is mostly for consistency.

Yes, function-wise it seems unnecessary; however, I also think it's a bit more
than being consistent with what we do with shmem (as also rephrased in the
commit message above): a UFFDIO_COPY page should be marked dirty by the
definition of the "dirty bit", imho, since the data is indeed dirty (the
kernel copied that data from userspace)?

> 
> Note to self: this may help when I get back to hugetlb soft dirty
> support.
> 
> Other than that, patch looks good.

Thanks!

Patch

diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h
index 8e1e6244a89d..548212eccbd6 100644
--- a/include/asm-generic/hugetlb.h
+++ b/include/asm-generic/hugetlb.h
@@ -27,6 +27,11 @@  static inline pte_t huge_pte_mkdirty(pte_t pte)
 	return pte_mkdirty(pte);
 }
 
+static inline pte_t huge_pte_mkuffd_wp(pte_t pte)
+{
+	return pte_mkuffd_wp(pte);
+}
+
 static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot)
 {
 	return pte_modify(pte, newprot);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index a7f7d5f328dc..ef8d2b8427b1 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -141,7 +141,8 @@  int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
 				unsigned long dst_addr,
 				unsigned long src_addr,
 				enum mcopy_atomic_mode mode,
-				struct page **pagep);
+				struct page **pagep,
+				bool wp_copy);
 #endif /* CONFIG_USERFAULTFD */
 bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
 						struct vm_area_struct *vma,
@@ -321,7 +322,8 @@  static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 						unsigned long dst_addr,
 						unsigned long src_addr,
 						enum mcopy_atomic_mode mode,
-						struct page **pagep)
+						struct page **pagep,
+						bool wp_copy)
 {
 	BUG();
 	return 0;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index def2c7ddf3ae..f0e55b341ebd 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4725,7 +4725,8 @@  int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 			    unsigned long dst_addr,
 			    unsigned long src_addr,
 			    enum mcopy_atomic_mode mode,
-			    struct page **pagep)
+			    struct page **pagep,
+			    bool wp_copy)
 {
 	bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
 	struct address_space *mapping;
@@ -4822,17 +4823,28 @@  int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 		hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
 	}
 
-	/* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */
-	if (is_continue && !vm_shared)
+	/*
+	 * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
+	 * with wp flag set, don't set pte write bit.
+	 */
+	if (wp_copy || (is_continue && !vm_shared))
 		writable = 0;
 	else
 		writable = dst_vma->vm_flags & VM_WRITE;
 
 	_dst_pte = make_huge_pte(dst_vma, page, writable);
-	if (writable)
-		_dst_pte = huge_pte_mkdirty(_dst_pte);
+	/*
+	 * Always mark UFFDIO_COPY page dirty; note that this may not be
+	 * extremely important for hugetlbfs for now since swapping is not
+	 * supported, but we should still be clear in that this page cannot be
+	 * thrown away at will, even if write bit not set.
+	 */
+	_dst_pte = huge_pte_mkdirty(_dst_pte);
 	_dst_pte = pte_mkyoung(_dst_pte);
 
+	if (wp_copy)
+		_dst_pte = huge_pte_mkuffd_wp(_dst_pte);
+
 	set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
 
 	(void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 0963e0d9ed20..78471ae3d25c 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -207,7 +207,8 @@  static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 					      unsigned long dst_start,
 					      unsigned long src_start,
 					      unsigned long len,
-					      enum mcopy_atomic_mode mode)
+					      enum mcopy_atomic_mode mode,
+					      bool wp_copy)
 {
 	int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
 	int vm_shared = dst_vma->vm_flags & VM_SHARED;
@@ -304,7 +305,8 @@  static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 		}
 
 		err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
-					       dst_addr, src_addr, mode, &page);
+					       dst_addr, src_addr, mode, &page,
+					       wp_copy);
 
 		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 		i_mmap_unlock_read(mapping);
@@ -406,7 +408,8 @@  extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 				      unsigned long dst_start,
 				      unsigned long src_start,
 				      unsigned long len,
-				      enum mcopy_atomic_mode mode);
+				      enum mcopy_atomic_mode mode,
+				      bool wp_copy);
 #endif /* CONFIG_HUGETLB_PAGE */
 
 static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
@@ -527,7 +530,8 @@  static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 	 */
 	if (is_vm_hugetlb_page(dst_vma))
 		return  __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
-						src_start, len, mcopy_mode);
+					       src_start, len, mcopy_mode,
+					       wp_copy);
 
 	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
 		goto out_unlock;