
[1/3] mm/gup: fix races when looking up a CONT-PTE size hugetlb page

Message ID 0f3df6604059011bf78a286c2cf5da5c4b41ccb1.1660902741.git.baolin.wang@linux.alibaba.com (mailing list archive)
State New
Series Fix some issues when looking up hugetlb page

Commit Message

Baolin Wang Aug. 19, 2022, 10:12 a.m. UTC
Some architectures (like ARM64) support CONT-PTE/PMD size hugetlb, which
means they support not only PMD/PUD size hugetlb (2M and 1G), but also
CONT-PTE/PMD sizes (64K and 32M) when a 4K page size is specified.
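
As a rough illustration (assuming the usual ARM64 configuration where the
contiguous bit covers 16 entries with a 4K granule):

    CONT-PTE: 16 contiguous PTEs * 4K = 64K
    CONT-PMD: 16 contiguous PMDs * 2M = 32M

in addition to the regular PMD (2M) and PUD (1G) hugetlb sizes.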

When looking up a CONT-PTE size hugetlb page via follow_page(),
follow_page_pte() uses pte_offset_map_lock() to take the pte lock for it.
However, this pte lock is the wrong one for a CONT-PTE size hugetlb page:
the correct lock is mm->page_table_lock, obtained via huge_pte_lockptr().
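
For reference, huge_pte_lockptr() chooses the lock based on the huge page
size, so a CONT-PTE size hstate (64K) ends up on mm->page_table_lock rather
than the split pte lock that pte_offset_map_lock() takes. Roughly (a
paraphrase of include/linux/hugetlb.h, not an exact copy from this tree):

    static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
                                               struct mm_struct *mm, pte_t *pte)
    {
            /* Only PMD-sized hugetlb uses the split page table lock */
            if (huge_page_size(h) == PMD_SIZE)
                    return pmd_lockptr(mm, (pmd_t *) pte);
            /* Everything else, including CONT-PTE sizes, uses this */
            return &mm->page_table_lock;
    }

So the lock taken by follow_page_pte() and the lock taken by the hugetlb
paths do not serialize against each other for CONT-PTE size pages.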

That means the pte entry of the CONT-PTE size hugetlb page is unstable
under the lock that follow_page_pte() currently holds: the pte entry can
still be migrated or poisoned concurrently, which can cause races, and the
subsequent pte_xxx() checks in follow_page_pte() then operate on an
unstable entry even though they run under the 'pte lock'.
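
A possible interleaving (illustrative only; the exact remote call site
varies, e.g. migration or memory poisoning):

    CPU 0: follow_page_pte()              CPU 1: migration/poison path
    ------------------------              ----------------------------
    pte_offset_map_lock()
      takes the split pte lock
                                          huge_pte_lockptr() returns
                                          &mm->page_table_lock
                                          spin_lock(&mm->page_table_lock)
                                          rewrites the CONT-PTE entries
    pte = *ptep  (may observe a
                  changed/unstable entry)
    pte_present()/pte_pfn() etc. run
    on that unstable value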

To fix this issue, first check whether this is a CONT-PTE size hugetlb VMA,
and if so use huge_pte_lockptr() to get the correct pte lock and read the
pte value with huge_ptep_get(), so that the pte entry is stable under the
correct pte lock.

Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
---
 mm/gup.c | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

Comments

kernel test robot Aug. 19, 2022, 4:34 p.m. UTC | #1
Hi Baolin,

I love your patch! Yet something to improve:

[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on linus/master v6.0-rc1 next-20220819]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Baolin-Wang/Fix-some-issues-when-looking-up-hugetlb-page/20220819-182017
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
config: microblaze-randconfig-r003-20220819 (https://download.01.org/0day-ci/archive/20220820/202208200030.9j5HjdtZ-lkp@intel.com/config)
compiler: microblaze-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/c0add09e9de4b39c58633c89ea25ba10ed12d134
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Baolin-Wang/Fix-some-issues-when-looking-up-hugetlb-page/20220819-182017
        git checkout c0add09e9de4b39c58633c89ea25ba10ed12d134
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=microblaze SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   mm/gup.c: In function 'follow_page_pte':
>> mm/gup.c:551:23: error: implicit declaration of function 'huge_ptep_get' [-Werror=implicit-function-declaration]
     551 |                 pte = huge_ptep_get(ptep);
         |                       ^~~~~~~~~~~~~
>> mm/gup.c:551:23: error: incompatible types when assigning to type 'pte_t' from type 'int'
   cc1: some warnings being treated as errors
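
The errors above come from configs where huge_ptep_get() is simply not
declared (e.g. microblaze has no hugetlb support, or CONFIG_HUGETLB_PAGE is
disabled in the randconfig). One possible way to keep the new branch from
being parsed on such configs is a preprocessor guard; this is only a sketch
of one option, not the fix that was actually applied:

    #ifdef CONFIG_HUGETLB_PAGE
    	if (is_vm_hugetlb_page(vma)) {
    		struct hstate *hstate = hstate_vma(vma);
    		unsigned long size = huge_page_size(hstate);

    		ptep = huge_pte_offset(mm, address, size);
    		if (!ptep)
    			return no_page_table(vma, flags);

    		ptl = huge_pte_lockptr(hstate, mm, ptep);
    		spin_lock(ptl);
    		pte = huge_ptep_get(ptep);
    	} else
    #endif
    	{
    		/* Non-hugetlb (or hugetlb-disabled) path, unchanged */
    		ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
    		pte = *ptep;
    	}

With CONFIG_HUGETLB_PAGE disabled this reduces to the original bare block,
so the hugetlb helpers are never referenced and the build error goes away.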


vim +/huge_ptep_get +551 mm/gup.c

   518	
   519	static struct page *follow_page_pte(struct vm_area_struct *vma,
   520			unsigned long address, pmd_t *pmd, unsigned int flags,
   521			struct dev_pagemap **pgmap)
   522	{
   523		struct mm_struct *mm = vma->vm_mm;
   524		struct page *page;
   525		spinlock_t *ptl;
   526		pte_t *ptep, pte;
   527		int ret;
   528	
   529		/* FOLL_GET and FOLL_PIN are mutually exclusive. */
   530		if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
   531				 (FOLL_PIN | FOLL_GET)))
   532			return ERR_PTR(-EINVAL);
   533	retry:
   534		if (unlikely(pmd_bad(*pmd)))
   535			return no_page_table(vma, flags);
   536	
   537		/*
   538		 * Considering PTE level hugetlb, like continuous-PTE hugetlb on
   539		 * ARM64 architecture.
   540		 */
   541		if (is_vm_hugetlb_page(vma)) {
   542			struct hstate *hstate = hstate_vma(vma);
   543			unsigned long size = huge_page_size(hstate);
   544	
   545			ptep = huge_pte_offset(mm, address, size);
   546			if (!ptep)
   547				return no_page_table(vma, flags);
   548	
   549			ptl = huge_pte_lockptr(hstate, mm, ptep);
   550			spin_lock(ptl);
 > 551			pte = huge_ptep_get(ptep);
   552		} else {
   553			ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
   554			pte = *ptep;
   555		}
   556	
   557		if (!pte_present(pte)) {
   558			swp_entry_t entry;
   559			/*
   560			 * KSM's break_ksm() relies upon recognizing a ksm page
   561			 * even while it is being migrated, so for that case we
   562			 * need migration_entry_wait().
   563			 */
   564			if (likely(!(flags & FOLL_MIGRATION)))
   565				goto no_page;
   566			if (pte_none(pte))
   567				goto no_page;
   568			entry = pte_to_swp_entry(pte);
   569			if (!is_migration_entry(entry))
   570				goto no_page;
   571			pte_unmap_unlock(ptep, ptl);
   572			migration_entry_wait(mm, pmd, address);
   573			goto retry;
   574		}
   575		if ((flags & FOLL_NUMA) && pte_protnone(pte))
   576			goto no_page;
   577	
   578		page = vm_normal_page(vma, address, pte);
   579	
   580		/*
   581		 * We only care about anon pages in can_follow_write_pte() and don't
   582		 * have to worry about pte_devmap() because they are never anon.
   583		 */
   584		if ((flags & FOLL_WRITE) &&
   585		    !can_follow_write_pte(pte, page, vma, flags)) {
   586			page = NULL;
   587			goto out;
   588		}
   589	
   590		if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
   591			/*
   592			 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
   593			 * case since they are only valid while holding the pgmap
   594			 * reference.
   595			 */
   596			*pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
   597			if (*pgmap)
   598				page = pte_page(pte);
   599			else
   600				goto no_page;
   601		} else if (unlikely(!page)) {
   602			if (flags & FOLL_DUMP) {
   603				/* Avoid special (like zero) pages in core dumps */
   604				page = ERR_PTR(-EFAULT);
   605				goto out;
   606			}
   607	
   608			if (is_zero_pfn(pte_pfn(pte))) {
   609				page = pte_page(pte);
   610			} else {
   611				ret = follow_pfn_pte(vma, address, ptep, flags);
   612				page = ERR_PTR(ret);
   613				goto out;
   614			}
   615		}
   616	
   617		if (!pte_write(pte) && gup_must_unshare(flags, page)) {
   618			page = ERR_PTR(-EMLINK);
   619			goto out;
   620		}
   621	
   622		VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
   623			       !PageAnonExclusive(page), page);
   624	
   625		/* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
   626		if (unlikely(!try_grab_page(page, flags))) {
   627			page = ERR_PTR(-ENOMEM);
   628			goto out;
   629		}
   630		/*
   631		 * We need to make the page accessible if and only if we are going
   632		 * to access its content (the FOLL_PIN case).  Please see
   633		 * Documentation/core-api/pin_user_pages.rst for details.
   634		 */
   635		if (flags & FOLL_PIN) {
   636			ret = arch_make_page_accessible(page);
   637			if (ret) {
   638				unpin_user_page(page);
   639				page = ERR_PTR(ret);
   640				goto out;
   641			}
   642		}
   643		if (flags & FOLL_TOUCH) {
   644			if ((flags & FOLL_WRITE) &&
   645			    !pte_dirty(pte) && !PageDirty(page))
   646				set_page_dirty(page);
   647			/*
   648			 * pte_mkyoung() would be more correct here, but atomic care
   649			 * is needed to avoid losing the dirty bit: it is easier to use
   650			 * mark_page_accessed().
   651			 */
   652			mark_page_accessed(page);
   653		}
   654	out:
   655		pte_unmap_unlock(ptep, ptl);
   656		return page;
   657	no_page:
   658		pte_unmap_unlock(ptep, ptl);
   659		if (!pte_none(pte))
   660			return NULL;
   661		return no_page_table(vma, flags);
   662	}
   663
kernel test robot Aug. 19, 2022, 5:46 p.m. UTC | #2
Hi Baolin,

I love your patch! Yet something to improve:

[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on linus/master v6.0-rc1 next-20220819]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Baolin-Wang/Fix-some-issues-when-looking-up-hugetlb-page/20220819-182017
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
config: riscv-randconfig-r042-20220819 (https://download.01.org/0day-ci/archive/20220820/202208200109.XEXN0wPy-lkp@intel.com/config)
compiler: clang version 16.0.0 (https://github.com/llvm/llvm-project 0ac597f3cacf60479ffd36b03766fa7462dabd78)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install riscv cross compiling tool for clang build
        # apt-get install binutils-riscv64-linux-gnu
        # https://github.com/intel-lab-lkp/linux/commit/c0add09e9de4b39c58633c89ea25ba10ed12d134
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Baolin-Wang/Fix-some-issues-when-looking-up-hugetlb-page/20220819-182017
        git checkout c0add09e9de4b39c58633c89ea25ba10ed12d134
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=riscv SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

>> mm/gup.c:551:9: error: implicit declaration of function 'huge_ptep_get' is invalid in C99 [-Werror,-Wimplicit-function-declaration]
                   pte = huge_ptep_get(ptep);
                         ^
>> mm/gup.c:551:7: error: assigning to 'pte_t' from incompatible type 'int'
                   pte = huge_ptep_get(ptep);
                       ^ ~~~~~~~~~~~~~~~~~~~
   2 errors generated.


vim +/huge_ptep_get +551 mm/gup.c

   518	
   519	static struct page *follow_page_pte(struct vm_area_struct *vma,
   520			unsigned long address, pmd_t *pmd, unsigned int flags,
   521			struct dev_pagemap **pgmap)
   522	{
   523		struct mm_struct *mm = vma->vm_mm;
   524		struct page *page;
   525		spinlock_t *ptl;
   526		pte_t *ptep, pte;
   527		int ret;
   528	
   529		/* FOLL_GET and FOLL_PIN are mutually exclusive. */
   530		if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
   531				 (FOLL_PIN | FOLL_GET)))
   532			return ERR_PTR(-EINVAL);
   533	retry:
   534		if (unlikely(pmd_bad(*pmd)))
   535			return no_page_table(vma, flags);
   536	
   537		/*
   538		 * Considering PTE level hugetlb, like continuous-PTE hugetlb on
   539		 * ARM64 architecture.
   540		 */
   541		if (is_vm_hugetlb_page(vma)) {
   542			struct hstate *hstate = hstate_vma(vma);
   543			unsigned long size = huge_page_size(hstate);
   544	
   545			ptep = huge_pte_offset(mm, address, size);
   546			if (!ptep)
   547				return no_page_table(vma, flags);
   548	
   549			ptl = huge_pte_lockptr(hstate, mm, ptep);
   550			spin_lock(ptl);
 > 551			pte = huge_ptep_get(ptep);
   552		} else {
   553			ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
   554			pte = *ptep;
   555		}
   556	
   557		if (!pte_present(pte)) {
   558			swp_entry_t entry;
   559			/*
   560			 * KSM's break_ksm() relies upon recognizing a ksm page
   561			 * even while it is being migrated, so for that case we
   562			 * need migration_entry_wait().
   563			 */
   564			if (likely(!(flags & FOLL_MIGRATION)))
   565				goto no_page;
   566			if (pte_none(pte))
   567				goto no_page;
   568			entry = pte_to_swp_entry(pte);
   569			if (!is_migration_entry(entry))
   570				goto no_page;
   571			pte_unmap_unlock(ptep, ptl);
   572			migration_entry_wait(mm, pmd, address);
   573			goto retry;
   574		}
   575		if ((flags & FOLL_NUMA) && pte_protnone(pte))
   576			goto no_page;
   577	
   578		page = vm_normal_page(vma, address, pte);
   579	
   580		/*
   581		 * We only care about anon pages in can_follow_write_pte() and don't
   582		 * have to worry about pte_devmap() because they are never anon.
   583		 */
   584		if ((flags & FOLL_WRITE) &&
   585		    !can_follow_write_pte(pte, page, vma, flags)) {
   586			page = NULL;
   587			goto out;
   588		}
   589	
   590		if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
   591			/*
   592			 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
   593			 * case since they are only valid while holding the pgmap
   594			 * reference.
   595			 */
   596			*pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
   597			if (*pgmap)
   598				page = pte_page(pte);
   599			else
   600				goto no_page;
   601		} else if (unlikely(!page)) {
   602			if (flags & FOLL_DUMP) {
   603				/* Avoid special (like zero) pages in core dumps */
   604				page = ERR_PTR(-EFAULT);
   605				goto out;
   606			}
   607	
   608			if (is_zero_pfn(pte_pfn(pte))) {
   609				page = pte_page(pte);
   610			} else {
   611				ret = follow_pfn_pte(vma, address, ptep, flags);
   612				page = ERR_PTR(ret);
   613				goto out;
   614			}
   615		}
   616	
   617		if (!pte_write(pte) && gup_must_unshare(flags, page)) {
   618			page = ERR_PTR(-EMLINK);
   619			goto out;
   620		}
   621	
   622		VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
   623			       !PageAnonExclusive(page), page);
   624	
   625		/* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
   626		if (unlikely(!try_grab_page(page, flags))) {
   627			page = ERR_PTR(-ENOMEM);
   628			goto out;
   629		}
   630		/*
   631		 * We need to make the page accessible if and only if we are going
   632		 * to access its content (the FOLL_PIN case).  Please see
   633		 * Documentation/core-api/pin_user_pages.rst for details.
   634		 */
   635		if (flags & FOLL_PIN) {
   636			ret = arch_make_page_accessible(page);
   637			if (ret) {
   638				unpin_user_page(page);
   639				page = ERR_PTR(ret);
   640				goto out;
   641			}
   642		}
   643		if (flags & FOLL_TOUCH) {
   644			if ((flags & FOLL_WRITE) &&
   645			    !pte_dirty(pte) && !PageDirty(page))
   646				set_page_dirty(page);
   647			/*
   648			 * pte_mkyoung() would be more correct here, but atomic care
   649			 * is needed to avoid losing the dirty bit: it is easier to use
   650			 * mark_page_accessed().
   651			 */
   652			mark_page_accessed(page);
   653		}
   654	out:
   655		pte_unmap_unlock(ptep, ptl);
   656		return page;
   657	no_page:
   658		pte_unmap_unlock(ptep, ptl);
   659		if (!pte_none(pte))
   660			return NULL;
   661		return no_page_table(vma, flags);
   662	}
   663

Patch

diff --git a/mm/gup.c b/mm/gup.c
index 5aa7531..3b2fa86 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -534,8 +534,26 @@  static struct page *follow_page_pte(struct vm_area_struct *vma,
 	if (unlikely(pmd_bad(*pmd)))
 		return no_page_table(vma, flags);
 
-	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
-	pte = *ptep;
+	/*
+	 * Considering PTE level hugetlb, like continuous-PTE hugetlb on
+	 * ARM64 architecture.
+	 */
+	if (is_vm_hugetlb_page(vma)) {
+		struct hstate *hstate = hstate_vma(vma);
+		unsigned long size = huge_page_size(hstate);
+
+		ptep = huge_pte_offset(mm, address, size);
+		if (!ptep)
+			return no_page_table(vma, flags);
+
+		ptl = huge_pte_lockptr(hstate, mm, ptep);
+		spin_lock(ptl);
+		pte = huge_ptep_get(ptep);
+	} else {
+		ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+		pte = *ptep;
+	}
+
 	if (!pte_present(pte)) {
 		swp_entry_t entry;
 		/*