diff mbox series

[v5,18/26] hugetlb/userfaultfd: Take care of UFFDIO_COPY_MODE_WP

Message ID 20210715201626.211813-1-peterx@redhat.com (mailing list archive)
State New
Headers show
Series userfaultfd-wp: Support shmem and hugetlbfs | expand

Commit Message

Peter Xu July 15, 2021, 8:16 p.m. UTC
Firstly, pass the wp_copy variable into hugetlb_mcopy_atomic_pte() thoughout
the stack.  Then, apply the UFFD_WP bit if UFFDIO_COPY_MODE_WP is with
UFFDIO_COPY.  Introduce huge_pte_mkuffd_wp() for it.

Hugetlb pages are only managed by hugetlbfs, so we're safe even without setting
dirty bit in the huge pte if the page is installed as read-only.  However we'd
better still keep the dirty bit set for a read-only UFFDIO_COPY pte (when
UFFDIO_COPY_MODE_WP bit is set), not only to match what we do with shmem, but
also because the page does contain dirty data that the kernel just copied from
the userspace.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 include/linux/hugetlb.h |  6 ++++--
 mm/hugetlb.c            | 22 +++++++++++++++++-----
 mm/userfaultfd.c        | 12 ++++++++----
 3 files changed, 29 insertions(+), 11 deletions(-)

Comments

kernel test robot July 20, 2021, 11:59 p.m. UTC | #1
Hi Peter,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on kselftest/next]
[also build test ERROR on linus/master v5.14-rc2 next-20210720]
[cannot apply to hnaz-linux-mm/master asm-generic/master arm64/for-next/core linux/master tip/x86/core]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Peter-Xu/userfaultfd-wp-Support-shmem-and-hugetlbfs/20210716-041947
base:   https://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git next
config: s390-randconfig-r023-20210716 (attached as .config)
compiler: clang version 13.0.0 (https://github.com/llvm/llvm-project 5d5b08761f944d5b9822d582378333cc4b36a0a7)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install s390 cross compiling tool for clang build
        # apt-get install binutils-s390x-linux-gnu
        # https://github.com/0day-ci/linux/commit/2ad11be7ccbb4fada15c5ec48a35630d450fc9ca
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Peter-Xu/userfaultfd-wp-Support-shmem-and-hugetlbfs/20210716-041947
        git checkout 2ad11be7ccbb4fada15c5ec48a35630d450fc9ca
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=s390 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   In file included from mm/hugetlb.c:19:
   In file included from include/linux/memblock.h:14:
   In file included from arch/s390/include/asm/dma.h:5:
   In file included from arch/s390/include/asm/io.h:75:
   include/asm-generic/io.h:464:31: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           val = __raw_readb(PCI_IOBASE + addr);
                             ~~~~~~~~~~ ^
   include/asm-generic/io.h:477:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           val = __le16_to_cpu((__le16 __force)__raw_readw(PCI_IOBASE + addr));
                                                           ~~~~~~~~~~ ^
   include/uapi/linux/byteorder/big_endian.h:36:59: note: expanded from macro '__le16_to_cpu'
   #define __le16_to_cpu(x) __swab16((__force __u16)(__le16)(x))
                                                             ^
   include/uapi/linux/swab.h:102:54: note: expanded from macro '__swab16'
   #define __swab16(x) (__u16)__builtin_bswap16((__u16)(x))
                                                        ^
   In file included from mm/hugetlb.c:19:
   In file included from include/linux/memblock.h:14:
   In file included from arch/s390/include/asm/dma.h:5:
   In file included from arch/s390/include/asm/io.h:75:
   include/asm-generic/io.h:490:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           val = __le32_to_cpu((__le32 __force)__raw_readl(PCI_IOBASE + addr));
                                                           ~~~~~~~~~~ ^
   include/uapi/linux/byteorder/big_endian.h:34:59: note: expanded from macro '__le32_to_cpu'
   #define __le32_to_cpu(x) __swab32((__force __u32)(__le32)(x))
                                                             ^
   include/uapi/linux/swab.h:115:54: note: expanded from macro '__swab32'
   #define __swab32(x) (__u32)__builtin_bswap32((__u32)(x))
                                                        ^
   In file included from mm/hugetlb.c:19:
   In file included from include/linux/memblock.h:14:
   In file included from arch/s390/include/asm/dma.h:5:
   In file included from arch/s390/include/asm/io.h:75:
   include/asm-generic/io.h:501:33: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           __raw_writeb(value, PCI_IOBASE + addr);
                               ~~~~~~~~~~ ^
   include/asm-generic/io.h:511:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           __raw_writew((u16 __force)cpu_to_le16(value), PCI_IOBASE + addr);
                                                         ~~~~~~~~~~ ^
   include/asm-generic/io.h:521:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           __raw_writel((u32 __force)cpu_to_le32(value), PCI_IOBASE + addr);
                                                         ~~~~~~~~~~ ^
   include/asm-generic/io.h:609:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           readsb(PCI_IOBASE + addr, buffer, count);
                  ~~~~~~~~~~ ^
   include/asm-generic/io.h:617:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           readsw(PCI_IOBASE + addr, buffer, count);
                  ~~~~~~~~~~ ^
   include/asm-generic/io.h:625:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           readsl(PCI_IOBASE + addr, buffer, count);
                  ~~~~~~~~~~ ^
   include/asm-generic/io.h:634:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           writesb(PCI_IOBASE + addr, buffer, count);
                   ~~~~~~~~~~ ^
   include/asm-generic/io.h:643:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           writesw(PCI_IOBASE + addr, buffer, count);
                   ~~~~~~~~~~ ^
   include/asm-generic/io.h:652:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           writesl(PCI_IOBASE + addr, buffer, count);
                   ~~~~~~~~~~ ^
   mm/hugetlb.c:5063:29: error: implicit declaration of function 'huge_pte_uffd_wp' [-Werror,-Wimplicit-function-declaration]
           if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) &&
                                      ^
>> mm/hugetlb.c:5301:14: error: implicit declaration of function 'huge_pte_mkuffd_wp' [-Werror,-Wimplicit-function-declaration]
                   _dst_pte = huge_pte_mkuffd_wp(_dst_pte);
                              ^
   mm/hugetlb.c:5301:14: note: did you mean 'pte_mkuffd_wp'?
   include/asm-generic/pgtable_uffd.h:18:30: note: 'pte_mkuffd_wp' declared here
   static __always_inline pte_t pte_mkuffd_wp(pte_t pte)
                                ^
>> mm/hugetlb.c:5301:12: error: assigning to 'pte_t' from incompatible type 'int'
                   _dst_pte = huge_pte_mkuffd_wp(_dst_pte);
                            ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   12 warnings and 3 errors generated.


vim +/huge_pte_mkuffd_wp +5301 mm/hugetlb.c

  5132	
  5133	#ifdef CONFIG_USERFAULTFD
  5134	/*
  5135	 * Used by userfaultfd UFFDIO_COPY.  Based on mcopy_atomic_pte with
  5136	 * modifications for huge pages.
  5137	 */
  5138	int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
  5139				    pte_t *dst_pte,
  5140				    struct vm_area_struct *dst_vma,
  5141				    unsigned long dst_addr,
  5142				    unsigned long src_addr,
  5143				    enum mcopy_atomic_mode mode,
  5144				    struct page **pagep,
  5145				    bool wp_copy)
  5146	{
  5147		bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
  5148		struct hstate *h = hstate_vma(dst_vma);
  5149		struct address_space *mapping = dst_vma->vm_file->f_mapping;
  5150		pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
  5151		unsigned long size;
  5152		int vm_shared = dst_vma->vm_flags & VM_SHARED;
  5153		pte_t _dst_pte;
  5154		spinlock_t *ptl;
  5155		int ret = -ENOMEM;
  5156		struct page *page;
  5157		int writable;
  5158	
  5159		if (is_continue) {
  5160			ret = -EFAULT;
  5161			page = find_lock_page(mapping, idx);
  5162			if (!page)
  5163				goto out;
  5164		} else if (!*pagep) {
  5165			/* If a page already exists, then it's UFFDIO_COPY for
  5166			 * a non-missing case. Return -EEXIST.
  5167			 */
  5168			if (vm_shared &&
  5169			    hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
  5170				ret = -EEXIST;
  5171				goto out;
  5172			}
  5173	
  5174			page = alloc_huge_page(dst_vma, dst_addr, 0);
  5175			if (IS_ERR(page)) {
  5176				ret = -ENOMEM;
  5177				goto out;
  5178			}
  5179	
  5180			ret = copy_huge_page_from_user(page,
  5181							(const void __user *) src_addr,
  5182							pages_per_huge_page(h), false);
  5183	
  5184			/* fallback to copy_from_user outside mmap_lock */
  5185			if (unlikely(ret)) {
  5186				ret = -ENOENT;
  5187				/* Free the allocated page which may have
  5188				 * consumed a reservation.
  5189				 */
  5190				restore_reserve_on_error(h, dst_vma, dst_addr, page);
  5191				put_page(page);
  5192	
  5193				/* Allocate a temporary page to hold the copied
  5194				 * contents.
  5195				 */
  5196				page = alloc_huge_page_vma(h, dst_vma, dst_addr);
  5197				if (!page) {
  5198					ret = -ENOMEM;
  5199					goto out;
  5200				}
  5201				*pagep = page;
  5202				/* Set the outparam pagep and return to the caller to
  5203				 * copy the contents outside the lock. Don't free the
  5204				 * page.
  5205				 */
  5206				goto out;
  5207			}
  5208		} else {
  5209			if (vm_shared &&
  5210			    hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
  5211				put_page(*pagep);
  5212				ret = -EEXIST;
  5213				*pagep = NULL;
  5214				goto out;
  5215			}
  5216	
  5217			page = alloc_huge_page(dst_vma, dst_addr, 0);
  5218			if (IS_ERR(page)) {
  5219				ret = -ENOMEM;
  5220				*pagep = NULL;
  5221				goto out;
  5222			}
  5223			copy_huge_page(page, *pagep);
  5224			put_page(*pagep);
  5225			*pagep = NULL;
  5226		}
  5227	
  5228		/*
  5229		 * The memory barrier inside __SetPageUptodate makes sure that
  5230		 * preceding stores to the page contents become visible before
  5231		 * the set_pte_at() write.
  5232		 */
  5233		__SetPageUptodate(page);
  5234	
  5235		/* Add shared, newly allocated pages to the page cache. */
  5236		if (vm_shared && !is_continue) {
  5237			size = i_size_read(mapping->host) >> huge_page_shift(h);
  5238			ret = -EFAULT;
  5239			if (idx >= size)
  5240				goto out_release_nounlock;
  5241	
  5242			/*
  5243			 * Serialization between remove_inode_hugepages() and
  5244			 * huge_add_to_page_cache() below happens through the
  5245			 * hugetlb_fault_mutex_table that here must be hold by
  5246			 * the caller.
  5247			 */
  5248			ret = huge_add_to_page_cache(page, mapping, idx);
  5249			if (ret)
  5250				goto out_release_nounlock;
  5251		}
  5252	
  5253		ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
  5254		spin_lock(ptl);
  5255	
  5256		/*
  5257		 * Recheck the i_size after holding PT lock to make sure not
  5258		 * to leave any page mapped (as page_mapped()) beyond the end
  5259		 * of the i_size (remove_inode_hugepages() is strict about
  5260		 * enforcing that). If we bail out here, we'll also leave a
  5261		 * page in the radix tree in the vm_shared case beyond the end
  5262		 * of the i_size, but remove_inode_hugepages() will take care
  5263		 * of it as soon as we drop the hugetlb_fault_mutex_table.
  5264		 */
  5265		size = i_size_read(mapping->host) >> huge_page_shift(h);
  5266		ret = -EFAULT;
  5267		if (idx >= size)
  5268			goto out_release_unlock;
  5269	
  5270		ret = -EEXIST;
  5271		if (!huge_pte_none(huge_ptep_get(dst_pte)))
  5272			goto out_release_unlock;
  5273	
  5274		if (vm_shared) {
  5275			page_dup_rmap(page, true);
  5276		} else {
  5277			ClearHPageRestoreReserve(page);
  5278			hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
  5279		}
  5280	
  5281		/*
  5282		 * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
  5283		 * with wp flag set, don't set pte write bit.
  5284		 */
  5285		if (wp_copy || (is_continue && !vm_shared))
  5286			writable = 0;
  5287		else
  5288			writable = dst_vma->vm_flags & VM_WRITE;
  5289	
  5290		_dst_pte = make_huge_pte(dst_vma, page, writable);
  5291		/*
  5292		 * Always mark UFFDIO_COPY page dirty; note that this may not be
  5293		 * extremely important for hugetlbfs for now since swapping is not
  5294		 * supported, but we should still be clear in that this page cannot be
  5295		 * thrown away at will, even if write bit not set.
  5296		 */
  5297		_dst_pte = huge_pte_mkdirty(_dst_pte);
  5298		_dst_pte = pte_mkyoung(_dst_pte);
  5299	
  5300		if (wp_copy)
> 5301			_dst_pte = huge_pte_mkuffd_wp(_dst_pte);
  5302	
  5303		set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
  5304	
  5305		(void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
  5306						dst_vma->vm_flags & VM_WRITE);
  5307		hugetlb_count_add(pages_per_huge_page(h), dst_mm);
  5308	
  5309		/* No need to invalidate - it was non-present before */
  5310		update_mmu_cache(dst_vma, dst_addr, dst_pte);
  5311	
  5312		spin_unlock(ptl);
  5313		if (!is_continue)
  5314			SetHPageMigratable(page);
  5315		if (vm_shared || is_continue)
  5316			unlock_page(page);
  5317		ret = 0;
  5318	out:
  5319		return ret;
  5320	out_release_unlock:
  5321		spin_unlock(ptl);
  5322		if (vm_shared || is_continue)
  5323			unlock_page(page);
  5324	out_release_nounlock:
  5325		restore_reserve_on_error(h, dst_vma, dst_addr, page);
  5326		put_page(page);
  5327		goto out;
  5328	}
  5329	#endif /* CONFIG_USERFAULTFD */
  5330	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
diff mbox series

Patch

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index c30f39815e13..fcdbf9f46d85 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -155,7 +155,8 @@  int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
 				unsigned long dst_addr,
 				unsigned long src_addr,
 				enum mcopy_atomic_mode mode,
-				struct page **pagep);
+				struct page **pagep,
+				bool wp_copy);
 #endif /* CONFIG_USERFAULTFD */
 bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
 						struct vm_area_struct *vma,
@@ -336,7 +337,8 @@  static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 						unsigned long dst_addr,
 						unsigned long src_addr,
 						enum mcopy_atomic_mode mode,
-						struct page **pagep)
+						struct page **pagep,
+						bool wp_copy)
 {
 	BUG();
 	return 0;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d34636085eaf..880cb2137d04 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5141,7 +5141,8 @@  int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 			    unsigned long dst_addr,
 			    unsigned long src_addr,
 			    enum mcopy_atomic_mode mode,
-			    struct page **pagep)
+			    struct page **pagep,
+			    bool wp_copy)
 {
 	bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
 	struct hstate *h = hstate_vma(dst_vma);
@@ -5277,17 +5278,28 @@  int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 		hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
 	}
 
-	/* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */
-	if (is_continue && !vm_shared)
+	/*
+	 * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
+	 * with wp flag set, don't set pte write bit.
+	 */
+	if (wp_copy || (is_continue && !vm_shared))
 		writable = 0;
 	else
 		writable = dst_vma->vm_flags & VM_WRITE;
 
 	_dst_pte = make_huge_pte(dst_vma, page, writable);
-	if (writable)
-		_dst_pte = huge_pte_mkdirty(_dst_pte);
+	/*
+	 * Always mark UFFDIO_COPY page dirty; note that this may not be
+	 * extremely important for hugetlbfs for now since swapping is not
+	 * supported, but we should still be clear in that this page cannot be
+	 * thrown away at will, even if write bit not set.
+	 */
+	_dst_pte = huge_pte_mkdirty(_dst_pte);
 	_dst_pte = pte_mkyoung(_dst_pte);
 
+	if (wp_copy)
+		_dst_pte = huge_pte_mkuffd_wp(_dst_pte);
+
 	set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
 
 	(void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 0c7212dfb95d..501d6b9f7a5a 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -297,7 +297,8 @@  static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 					      unsigned long dst_start,
 					      unsigned long src_start,
 					      unsigned long len,
-					      enum mcopy_atomic_mode mode)
+					      enum mcopy_atomic_mode mode,
+					      bool wp_copy)
 {
 	int vm_shared = dst_vma->vm_flags & VM_SHARED;
 	ssize_t err;
@@ -393,7 +394,8 @@  static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 		}
 
 		err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
-					       dst_addr, src_addr, mode, &page);
+					       dst_addr, src_addr, mode, &page,
+					       wp_copy);
 
 		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 		i_mmap_unlock_read(mapping);
@@ -448,7 +450,8 @@  extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 				      unsigned long dst_start,
 				      unsigned long src_start,
 				      unsigned long len,
-				      enum mcopy_atomic_mode mode);
+				      enum mcopy_atomic_mode mode,
+				      bool wp_copy);
 #endif /* CONFIG_HUGETLB_PAGE */
 
 static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
@@ -568,7 +571,8 @@  static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 	 */
 	if (is_vm_hugetlb_page(dst_vma))
 		return  __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
-						src_start, len, mcopy_mode);
+					       src_start, len, mcopy_mode,
+					       wp_copy);
 
 	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
 		goto out_unlock;