diff mbox series

[-next,v4,3/4] arm64: mm: add support for page table check

Message ID 20220418034444.520928-4-tongtiangen@huawei.com (mailing list archive)
State New, archived
Headers show
Series mm: page_table_check: add support on arm64 and riscv | expand

Commit Message

Tong Tiangen April 18, 2022, 3:44 a.m. UTC
From: Kefeng Wang <wangkefeng.wang@huawei.com>

As was done for x86_64 in commit d283d422c6c4 ("x86: mm: add x86_64 support
for page table check"), add the necessary page table check hooks into the
routines that modify user page tables.

Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
---
 arch/arm64/Kconfig               |  1 +
 arch/arm64/include/asm/pgtable.h | 65 +++++++++++++++++++++++++++++---
 2 files changed, 61 insertions(+), 5 deletions(-)

Comments

Anshuman Khandual April 18, 2022, 9:28 a.m. UTC | #1
On 4/18/22 09:14, Tong Tiangen wrote:
> From: Kefeng Wang <wangkefeng.wang@huawei.com>
> 
> As commit d283d422c6c4 ("x86: mm: add x86_64 support for page table
> check"), add some necessary page table check hooks into routines that
> modify user page tables.
> 
> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
> Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
> Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
> ---
>  arch/arm64/Kconfig               |  1 +
>  arch/arm64/include/asm/pgtable.h | 65 +++++++++++++++++++++++++++++---
>  2 files changed, 61 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index e80fd2372f02..7114d2d5155e 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -92,6 +92,7 @@ config ARM64
>  	select ARCH_SUPPORTS_ATOMIC_RMW
>  	select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
>  	select ARCH_SUPPORTS_NUMA_BALANCING
> +	select ARCH_SUPPORTS_PAGE_TABLE_CHECK
>  	select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
>  	select ARCH_WANT_DEFAULT_BPF_JIT
>  	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 930077f7b572..9f8f97a7cc7c 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -33,6 +33,7 @@
>  #include <linux/mmdebug.h>
>  #include <linux/mm_types.h>
>  #include <linux/sched.h>
> +#include <linux/page_table_check.h>
>  
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>  #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
> @@ -96,6 +97,7 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
>  #define pte_young(pte)		(!!(pte_val(pte) & PTE_AF))
>  #define pte_special(pte)	(!!(pte_val(pte) & PTE_SPECIAL))
>  #define pte_write(pte)		(!!(pte_val(pte) & PTE_WRITE))
> +#define pte_user(pte)		(!!(pte_val(pte) & PTE_USER))
>  #define pte_user_exec(pte)	(!(pte_val(pte) & PTE_UXN))
>  #define pte_cont(pte)		(!!(pte_val(pte) & PTE_CONT))
>  #define pte_devmap(pte)		(!!(pte_val(pte) & PTE_DEVMAP))
> @@ -312,7 +314,7 @@ static inline void __check_racy_pte_update(struct mm_struct *mm, pte_t *ptep,
>  		     __func__, pte_val(old_pte), pte_val(pte));
>  }
>  
> -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
> +static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
>  			      pte_t *ptep, pte_t pte)
>  {
>  	if (pte_present(pte) && pte_user_exec(pte) && !pte_special(pte))
> @@ -343,6 +345,13 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
>  	set_pte(ptep, pte);
>  }
>  
> +static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
> +			      pte_t *ptep, pte_t pte)
> +{
> +	page_table_check_pte_set(mm, addr, ptep, pte);
> +	return __set_pte_at(mm, addr, ptep, pte);
> +}
> +
>  /*
>   * Huge pte definitions.
>   */
> @@ -454,6 +463,8 @@ static inline int pmd_trans_huge(pmd_t pmd)
>  #define pmd_dirty(pmd)		pte_dirty(pmd_pte(pmd))
>  #define pmd_young(pmd)		pte_young(pmd_pte(pmd))
>  #define pmd_valid(pmd)		pte_valid(pmd_pte(pmd))
> +#define pmd_user(pmd)		pte_user(pmd_pte(pmd))
> +#define pmd_user_exec(pmd)	pte_user_exec(pmd_pte(pmd))
>  #define pmd_cont(pmd)		pte_cont(pmd_pte(pmd))
>  #define pmd_wrprotect(pmd)	pte_pmd(pte_wrprotect(pmd_pte(pmd)))
>  #define pmd_mkold(pmd)		pte_pmd(pte_mkold(pmd_pte(pmd)))
> @@ -501,8 +512,19 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd)
>  #define pud_pfn(pud)		((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT)
>  #define pfn_pud(pfn,prot)	__pud(__phys_to_pud_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))
>  
> -#define set_pmd_at(mm, addr, pmdp, pmd)	set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd))
> -#define set_pud_at(mm, addr, pudp, pud)	set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud))
> +static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
> +			      pmd_t *pmdp, pmd_t pmd)
> +{
> +	page_table_check_pmd_set(mm, addr, pmdp, pmd);
> +	return __set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd));
> +}
> +
> +static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
> +			      pud_t *pudp, pud_t pud)
> +{
> +	page_table_check_pud_set(mm, addr, pudp, pud);
> +	return __set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud));
> +}
>  
>  #define __p4d_to_phys(p4d)	__pte_to_phys(p4d_pte(p4d))
>  #define __phys_to_p4d_val(phys)	__phys_to_pte_val(phys)
> @@ -643,6 +665,24 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
>  #define pud_present(pud)	pte_present(pud_pte(pud))
>  #define pud_leaf(pud)		pud_sect(pud)
>  #define pud_valid(pud)		pte_valid(pud_pte(pud))
> +#define pud_user(pud)		pte_user(pud_pte(pud))
> +
> +#ifdef CONFIG_PAGE_TABLE_CHECK
> +static inline bool pte_user_accessible_page(pte_t pte)
> +{
> +	return pte_present(pte) && (pte_user(pte) || pte_user_exec(pte));
> +}
> +
> +static inline bool pmd_user_accessible_page(pmd_t pmd)
> +{
> +	return pmd_present(pmd) && (pmd_user(pmd) || pmd_user_exec(pmd));
> +}
> +
> +static inline bool pud_user_accessible_page(pud_t pud)
> +{
> +	return pud_present(pud) && pud_user(pud);
> +}
> +#endif
>  
>  static inline void set_pud(pud_t *pudp, pud_t pud)
>  {
> @@ -872,11 +912,21 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
>  }
>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>  
> +static inline pte_t __ptep_get_and_clear(struct mm_struct *mm,
> +				       unsigned long address, pte_t *ptep)
> +{
> +	return __pte(xchg_relaxed(&pte_val(*ptep), 0));
> +}
> +
>  #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
>  static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
>  				       unsigned long address, pte_t *ptep)
>  {
> -	return __pte(xchg_relaxed(&pte_val(*ptep), 0));
> +	pte_t pte = __ptep_get_and_clear(mm, address, ptep);
> +
> +	page_table_check_pte_clear(mm, address, pte);
> +
> +	return pte;
>  }
>  
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> @@ -884,7 +934,11 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
>  static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
>  					    unsigned long address, pmd_t *pmdp)
>  {
> -	return pte_pmd(ptep_get_and_clear(mm, address, (pte_t *)pmdp));
> +	pmd_t pmd = pte_pmd(__ptep_get_and_clear(mm, address, (pte_t *)pmdp));
> +
> +	page_table_check_pmd_clear(mm, address, pmd);
> +
> +	return pmd;
>  }
>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>  
> @@ -918,6 +972,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
>  static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
>  		unsigned long address, pmd_t *pmdp, pmd_t pmd)
>  {
> +	page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd);
>  	return __pmd(xchg_relaxed(&pmd_val(*pmdp), pmd_val(pmd)));
>  }
>  #endif

I ran this series on an arm64 platform after enabling

- CONFIG_PAGE_TABLE_CHECK
- CONFIG_PAGE_TABLE_CHECK_ENFORCED (avoiding kernel command line option)

After some time, the following error came up

[   23.266013] ------------[ cut here ]------------
[   23.266807] kernel BUG at mm/page_table_check.c:90!
[   23.267609] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
[   23.268503] Modules linked in:                                                                    
[   23.269012] CPU: 1 PID: 30 Comm: khugepaged Not tainted 5.18.0-rc3-00004-g60aa8e363a91 #2
[   23.270383] Hardware name: linux,dummy-virt (DT)
[   23.271210] pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[   23.272445] pc : page_table_check_clear.isra.6+0x114/0x148
[   23.273429] lr : page_table_check_clear.isra.6+0x64/0x148
[   23.274395] sp : ffff80000afb3ca0
[   23.274994] x29: ffff80000afb3ca0 x28: fffffc00022558e8 x27: ffff80000a27f628
[   23.276260] x26: ffff800009f9f2b0 x25: ffff00008a8d5000 x24: ffff800009f09fa0                     
[   23.277527] x23: 0000ffff89e00000 x22: ffff800009f09fb8 x21: ffff000089414cc0
[   23.278798] x20: 0000000000000200 x19: fffffc00022a0000 x18: 0000000000000001
[   23.280066] x17: 0000000000000001 x16: 0000000000000000 x15: 0000000000000003
[   23.281331] x14: 0000000000000068 x13: 00000000000000c0 x12: 0000000000000010
[   23.282602] x11: fffffc0002320008 x10: fffffc0002320000 x9 : ffff800009fa1000
[   23.283868] x8 : 00000000ffffffff x7 : 0000000000000001 x6 : ffff800009fa1f08
[   23.285135] x5 : 0000000000000000 x4 : 0000000000000000 x3 : 0000000000000000
[   23.286406] x2 : 00000000ffffffff x1 : ffff000080f2800c x0 : ffff000080f28000
[   23.287673] Call trace:
[   23.288123]  page_table_check_clear.isra.6+0x114/0x148
[   23.289043]  __page_table_check_pmd_clear+0x3c/0x50
[   23.289918]  pmdp_collapse_flush+0x114/0x370
[   23.290692]  khugepaged+0x1170/0x19e0
[   23.291356]  kthread+0x110/0x120
[   23.291945]  ret_from_fork+0x10/0x20
[   23.292596] Code: 91001041 b8e80024 51000482 36fffd62 (d4210000) 
[   23.293678] ---[ end trace 0000000000000000 ]---
[   23.294511] note: khugepaged[30] exited with preempt_count 2

Looking into the file mm/page_table_check.c, where this problem occurred.

/*
 * An entry is removed from the page table, decrement the counters for that page
 * verify that it is of correct type and counters do not become negative.
 */
static void page_table_check_clear(struct mm_struct *mm, unsigned long addr,
                                   unsigned long pfn, unsigned long pgcnt)
{
        struct page_ext *page_ext;
        struct page *page;
        unsigned long i;
        bool anon;

        if (!pfn_valid(pfn))
                return;

        page = pfn_to_page(pfn);
        page_ext = lookup_page_ext(page);
        anon = PageAnon(page);

        for (i = 0; i < pgcnt; i++) {
                struct page_table_check *ptc = get_page_table_check(page_ext);

                if (anon) {
                        BUG_ON(atomic_read(&ptc->file_map_count));
                        BUG_ON(atomic_dec_return(&ptc->anon_map_count) < 0);
                } else {
                        BUG_ON(atomic_read(&ptc->anon_map_count));
 Triggered here ====>>  BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0);
                }
                page_ext = page_ext_next(page_ext);
        }
}

Could you explain what was expected during pmdp_collapse_flush(), the failure
of which triggered this BUG_ON()? This counter seems to be specific to page
table check; could it simply have gone wrong? I have not looked into the
details of the page table check mechanism.

- Anshuman
Tong Tiangen April 18, 2022, 3:47 p.m. UTC | #2
在 2022/4/18 17:28, Anshuman Khandual 写道:
> On 4/18/22 09:14, Tong Tiangen wrote:
>> From: Kefeng Wang <wangkefeng.wang@huawei.com>
>>
[...]
>>   #endif
> 
> Ran this series on arm64 platform after enabling
> 
> - CONFIG_PAGE_TABLE_CHECK
> - CONFIG_PAGE_TABLE_CHECK_ENFORCED (avoiding kernel command line option)
> 
> After some time, the following error came up
> 
> [   23.266013] ------------[ cut here ]------------
> [   23.266807] kernel BUG at mm/page_table_check.c:90!
> [   23.267609] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
> [   23.268503] Modules linked in:
> [   23.269012] CPU: 1 PID: 30 Comm: khugepaged Not tainted 5.18.0-rc3-00004-g60aa8e363a91 #2
> [   23.270383] Hardware name: linux,dummy-virt (DT)
> [   23.271210] pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> [   23.272445] pc : page_table_check_clear.isra.6+0x114/0x148
> [   23.273429] lr : page_table_check_clear.isra.6+0x64/0x148
> [   23.274395] sp : ffff80000afb3ca0
> [   23.274994] x29: ffff80000afb3ca0 x28: fffffc00022558e8 x27: ffff80000a27f628
> [   23.276260] x26: ffff800009f9f2b0 x25: ffff00008a8d5000 x24: ffff800009f09fa0
> [   23.277527] x23: 0000ffff89e00000 x22: ffff800009f09fb8 x21: ffff000089414cc0
> [   23.278798] x20: 0000000000000200 x19: fffffc00022a0000 x18: 0000000000000001
> [   23.280066] x17: 0000000000000001 x16: 0000000000000000 x15: 0000000000000003
> [   23.281331] x14: 0000000000000068 x13: 00000000000000c0 x12: 0000000000000010
> [   23.282602] x11: fffffc0002320008 x10: fffffc0002320000 x9 : ffff800009fa1000
> [   23.283868] x8 : 00000000ffffffff x7 : 0000000000000001 x6 : ffff800009fa1f08
> [   23.285135] x5 : 0000000000000000 x4 : 0000000000000000 x3 : 0000000000000000
> [   23.286406] x2 : 00000000ffffffff x1 : ffff000080f2800c x0 : ffff000080f28000
> [   23.287673] Call trace:
> [   23.288123]  page_table_check_clear.isra.6+0x114/0x148
> [   23.289043]  __page_table_check_pmd_clear+0x3c/0x50
> [   23.289918]  pmdp_collapse_flush+0x114/0x370
> [   23.290692]  khugepaged+0x1170/0x19e0
> [   23.291356]  kthread+0x110/0x120
> [   23.291945]  ret_from_fork+0x10/0x20
> [   23.292596] Code: 91001041 b8e80024 51000482 36fffd62 (d4210000)
> [   23.293678] ---[ end trace 0000000000000000 ]---
> [   23.294511] note: khugepaged[30] exited with preempt_count 2
> 
> Looking into file mm/page_table_check.c where this problem occured.
> 
> /*
>   * An enty is removed from the page table, decrement the counters for that page
>   * verify that it is of correct type and counters do not become negative.
>   */
> static void page_table_check_clear(struct mm_struct *mm, unsigned long addr,
>                                     unsigned long pfn, unsigned long pgcnt)
> {
>          struct page_ext *page_ext;
>          struct page *page;
>          unsigned long i;
>          bool anon;
> 
>          if (!pfn_valid(pfn))
>                  return;
> 
>          page = pfn_to_page(pfn);
>          page_ext = lookup_page_ext(page);
>          anon = PageAnon(page);
> 
>          for (i = 0; i < pgcnt; i++) {
>                  struct page_table_check *ptc = get_page_table_check(page_ext);
> 
>                  if (anon) {
>                          BUG_ON(atomic_read(&ptc->file_map_count));
>                          BUG_ON(atomic_dec_return(&ptc->anon_map_count) < 0);
>                  } else {
>                          BUG_ON(atomic_read(&ptc->anon_map_count));
>   Triggered here ====>>  BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0);
>                  }
>                  page_ext = page_ext_next(page_ext);
>          }
> }
> 
> Could you explain what was expected during pmdp_collapse_flush() which when
> failed, triggered this BUG_ON() ? This counter seems to be page table check
> specific, could it just go wrong ? I have not looked into the details about
> page table check mechanism.
> 
> - Anshuman
> .

Hi Anshuman:

Thanks for your work.

Let me briefly explain the principle of page table check(PTC).

PTC introduces the following struct for page mapping type count:
struct page_table_check {
         atomic_t anon_map_count;
         atomic_t file_map_count;
};
This structure can be obtained by "lookup_page_ext(page)"

When page table entries are set (pud/pmd/pte), page_table_check_set() is
called to increase the page mapping count and also to check for errors (e.g.
if a page is used for an anonymous mapping, then the page cannot be used for
a file mapping at the same time).

When page table entries are cleared (pud/pmd/pte), page_table_check_clear()
is called to decrease the page mapping count and also to check for errors.

The error check rules are described in the following documents: 
Documentation/vm/page_table_check.rst

The setting and clearing of page table entries are symmetrical.

Here __page_table_check_pmd_clear() triggers the BUG_ON(), which indicates that
the file mapping count of the pmd entry has become negative.

I wonder: if PTC had not detected this exception, would there have been any
problems?

Thanks,
Tong.
Pasha Tatashin April 18, 2022, 4:20 p.m. UTC | #3
On Mon, Apr 18, 2022 at 11:47 AM Tong Tiangen <tongtiangen@huawei.com> wrote:
>
>
>
> 在 2022/4/18 17:28, Anshuman Khandual 写道:
> > On 4/18/22 09:14, Tong Tiangen wrote:
> >> From: Kefeng Wang <wangkefeng.wang@huawei.com>
> >>
> [...]
> >>   #endif
> >
> > Ran this series on arm64 platform after enabling
> >
> > - CONFIG_PAGE_TABLE_CHECK
> > - CONFIG_PAGE_TABLE_CHECK_ENFORCED (avoiding kernel command line option)
> >
> > After some time, the following error came up
> >
> > [   23.266013] ------------[ cut here ]------------
> > [   23.266807] kernel BUG at mm/page_table_check.c:90!
> > [   23.267609] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
> > [   23.268503] Modules linked in:
> > [   23.269012] CPU: 1 PID: 30 Comm: khugepaged Not tainted 5.18.0-rc3-00004-g60aa8e363a91 #2
> > [   23.270383] Hardware name: linux,dummy-virt (DT)
> > [   23.271210] pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> > [   23.272445] pc : page_table_check_clear.isra.6+0x114/0x148
> > [   23.273429] lr : page_table_check_clear.isra.6+0x64/0x148
> > [   23.274395] sp : ffff80000afb3ca0
> > [   23.274994] x29: ffff80000afb3ca0 x28: fffffc00022558e8 x27: ffff80000a27f628
> > [   23.276260] x26: ffff800009f9f2b0 x25: ffff00008a8d5000 x24: ffff800009f09fa0
> > [   23.277527] x23: 0000ffff89e00000 x22: ffff800009f09fb8 x21: ffff000089414cc0
> > [   23.278798] x20: 0000000000000200 x19: fffffc00022a0000 x18: 0000000000000001
> > [   23.280066] x17: 0000000000000001 x16: 0000000000000000 x15: 0000000000000003
> > [   23.281331] x14: 0000000000000068 x13: 00000000000000c0 x12: 0000000000000010
> > [   23.282602] x11: fffffc0002320008 x10: fffffc0002320000 x9 : ffff800009fa1000
> > [   23.283868] x8 : 00000000ffffffff x7 : 0000000000000001 x6 : ffff800009fa1f08
> > [   23.285135] x5 : 0000000000000000 x4 : 0000000000000000 x3 : 0000000000000000
> > [   23.286406] x2 : 00000000ffffffff x1 : ffff000080f2800c x0 : ffff000080f28000
> > [   23.287673] Call trace:
> > [   23.288123]  page_table_check_clear.isra.6+0x114/0x148
> > [   23.289043]  __page_table_check_pmd_clear+0x3c/0x50
> > [   23.289918]  pmdp_collapse_flush+0x114/0x370
> > [   23.290692]  khugepaged+0x1170/0x19e0
> > [   23.291356]  kthread+0x110/0x120
> > [   23.291945]  ret_from_fork+0x10/0x20
> > [   23.292596] Code: 91001041 b8e80024 51000482 36fffd62 (d4210000)
> > [   23.293678] ---[ end trace 0000000000000000 ]---
> > [   23.294511] note: khugepaged[30] exited with preempt_count 2
> >
> > Looking into file mm/page_table_check.c where this problem occured.
> >
> > /*
> >   * An enty is removed from the page table, decrement the counters for that page
> >   * verify that it is of correct type and counters do not become negative.
> >   */
> > static void page_table_check_clear(struct mm_struct *mm, unsigned long addr,
> >                                     unsigned long pfn, unsigned long pgcnt)
> > {
> >          struct page_ext *page_ext;
> >          struct page *page;
> >          unsigned long i;
> >          bool anon;
> >
> >          if (!pfn_valid(pfn))
> >                  return;
> >
> >          page = pfn_to_page(pfn);
> >          page_ext = lookup_page_ext(page);
> >          anon = PageAnon(page);
> >
> >          for (i = 0; i < pgcnt; i++) {
> >                  struct page_table_check *ptc = get_page_table_check(page_ext);
> >
> >                  if (anon) {
> >                          BUG_ON(atomic_read(&ptc->file_map_count));
> >                          BUG_ON(atomic_dec_return(&ptc->anon_map_count) < 0);
> >                  } else {
> >                          BUG_ON(atomic_read(&ptc->anon_map_count));
> >   Triggered here ====>>  BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0);
> >                  }
> >                  page_ext = page_ext_next(page_ext);
> >          }
> > }
> >
> > Could you explain what was expected during pmdp_collapse_flush() which when
> > failed, triggered this BUG_ON() ? This counter seems to be page table check
> > specific, could it just go wrong ? I have not looked into the details about
> > page table check mechanism.
> >
> > - Anshuman
> > .
>
> Hi Anshuman:
>
> Thanks for your job.
>
> Let me briefly explain the principle of page table check(PTC).
>
> PTC introduces the following struct for page mapping type count:
> struct page_table_check {
>          atomic_t anon_map_count;
>          atomic_t file_map_count;
> };
> This structure can be obtained by "lookup_page_ext(page)"
>
> When page table entries are set(pud/pmd/pte), page_table_check_set()  is
> called to increase the page mapping count, Also check for errors (eg:if
> a page is used for anonymous mapping, then the page cannot be used for
> file mapping at the same time).
>
> When page table entries are clear(pud/pmd/pte), page_table_check_clear()
>   is called to decrease the page mapping count, Also check for errors.
>
> The error check rules are described in the following documents:
> Documentation/vm/page_table_check.rst
>
> The setting and clearing of page table entries are symmetrical.
>
> Here __page_table_check_pmd_clear() trigger BUGON which indicates that
> the pmd entry file mapping count has become negative.
>
> I guess if PTC didn't detect this exception, would there have been any
> problems?

It is hard to tell what sort of problem has been detected. More
debugging is needed in order to understand it. A huge file entry is
being removed from the page table. However, at least one sub page of
that entry does not have a record that it was added as a file entry to
the page table. At Google we found a few internal security bugs using
PTCs. However, this being new on ARM64, it is possible that the bug is
in PTC/khugepaged itself.

Anshuman, is it possible to reproduce your scenario in QEMU?

Thank you,
Pasha

>
> Thanks,
> Tong.
Anshuman Khandual April 19, 2022, 7:10 a.m. UTC | #4
On 4/18/22 21:17, Tong Tiangen wrote:
> 
> 
> 在 2022/4/18 17:28, Anshuman Khandual 写道:
>> On 4/18/22 09:14, Tong Tiangen wrote:
>>> From: Kefeng Wang <wangkefeng.wang@huawei.com>
>>>
> [...]
>>>   #endif
>>
>> Ran this series on arm64 platform after enabling
>>
>> - CONFIG_PAGE_TABLE_CHECK
>> - CONFIG_PAGE_TABLE_CHECK_ENFORCED (avoiding kernel command line option)
>>
>> After some time, the following error came up
>>
>> [   23.266013] ------------[ cut here ]------------
>> [   23.266807] kernel BUG at mm/page_table_check.c:90!
>> [   23.267609] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
>> [   23.268503] Modules linked in:
>> [   23.269012] CPU: 1 PID: 30 Comm: khugepaged Not tainted 5.18.0-rc3-00004-g60aa8e363a91 #2
>> [   23.270383] Hardware name: linux,dummy-virt (DT)
>> [   23.271210] pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
>> [   23.272445] pc : page_table_check_clear.isra.6+0x114/0x148
>> [   23.273429] lr : page_table_check_clear.isra.6+0x64/0x148
>> [   23.274395] sp : ffff80000afb3ca0
>> [   23.274994] x29: ffff80000afb3ca0 x28: fffffc00022558e8 x27: ffff80000a27f628
>> [   23.276260] x26: ffff800009f9f2b0 x25: ffff00008a8d5000 x24: ffff800009f09fa0
>> [   23.277527] x23: 0000ffff89e00000 x22: ffff800009f09fb8 x21: ffff000089414cc0
>> [   23.278798] x20: 0000000000000200 x19: fffffc00022a0000 x18: 0000000000000001
>> [   23.280066] x17: 0000000000000001 x16: 0000000000000000 x15: 0000000000000003
>> [   23.281331] x14: 0000000000000068 x13: 00000000000000c0 x12: 0000000000000010
>> [   23.282602] x11: fffffc0002320008 x10: fffffc0002320000 x9 : ffff800009fa1000
>> [   23.283868] x8 : 00000000ffffffff x7 : 0000000000000001 x6 : ffff800009fa1f08
>> [   23.285135] x5 : 0000000000000000 x4 : 0000000000000000 x3 : 0000000000000000
>> [   23.286406] x2 : 00000000ffffffff x1 : ffff000080f2800c x0 : ffff000080f28000
>> [   23.287673] Call trace:
>> [   23.288123]  page_table_check_clear.isra.6+0x114/0x148
>> [   23.289043]  __page_table_check_pmd_clear+0x3c/0x50
>> [   23.289918]  pmdp_collapse_flush+0x114/0x370
>> [   23.290692]  khugepaged+0x1170/0x19e0
>> [   23.291356]  kthread+0x110/0x120
>> [   23.291945]  ret_from_fork+0x10/0x20
>> [   23.292596] Code: 91001041 b8e80024 51000482 36fffd62 (d4210000)
>> [   23.293678] ---[ end trace 0000000000000000 ]---
>> [   23.294511] note: khugepaged[30] exited with preempt_count 2
>>
>> Looking into file mm/page_table_check.c where this problem occured.
>>
>> /*
>>   * An enty is removed from the page table, decrement the counters for that page
>>   * verify that it is of correct type and counters do not become negative.
>>   */
>> static void page_table_check_clear(struct mm_struct *mm, unsigned long addr,
>>                                     unsigned long pfn, unsigned long pgcnt)
>> {
>>          struct page_ext *page_ext;
>>          struct page *page;
>>          unsigned long i;
>>          bool anon;
>>
>>          if (!pfn_valid(pfn))
>>                  return;
>>
>>          page = pfn_to_page(pfn);
>>          page_ext = lookup_page_ext(page);
>>          anon = PageAnon(page);
>>
>>          for (i = 0; i < pgcnt; i++) {
>>                  struct page_table_check *ptc = get_page_table_check(page_ext);
>>
>>                  if (anon) {
>>                          BUG_ON(atomic_read(&ptc->file_map_count));
>>                          BUG_ON(atomic_dec_return(&ptc->anon_map_count) < 0);
>>                  } else {
>>                          BUG_ON(atomic_read(&ptc->anon_map_count));
>>   Triggered here ====>>  BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0);
>>                  }
>>                  page_ext = page_ext_next(page_ext);
>>          }
>> }
>>
>> Could you explain what was expected during pmdp_collapse_flush() which when
>> failed, triggered this BUG_ON() ? This counter seems to be page table check
>> specific, could it just go wrong ? I have not looked into the details about
>> page table check mechanism.
>>
>> - Anshuman
>> .
> 
> Hi Anshuman:
> 
> Thanks for your job.
> 
> Let me briefly explain the principle of page table check(PTC).
> 
> PTC introduces the following struct for page mapping type count:
> struct page_table_check {
>         atomic_t anon_map_count;
>         atomic_t file_map_count;
> };
> This structure can be obtained by "lookup_page_ext(page)"


Right.

> 
> When page table entries are set(pud/pmd/pte), page_table_check_set()  is called to increase the page mapping count, Also check for errors (eg:if a page is used for anonymous mapping, then the page cannot be used for file mapping at the same time).
> 
> When page table entries are clear(pud/pmd/pte), page_table_check_clear()  is called to decrease the page mapping count, Also check for errors.
> 
> The error check rules are described in the following documents: Documentation/vm/page_table_check.rst

Snippet from that document.

+-------------------+-------------------+-------------------+------------------+
| Current Mapping   | New mapping       | Permissions       | Rule             |
+===================+===================+===================+==================+
| Anonymous         | Anonymous         | Read              | Allow            |
+-------------------+-------------------+-------------------+------------------+
| Anonymous         | Anonymous         | Read / Write      | Prohibit         |
+-------------------+-------------------+-------------------+------------------+
| Anonymous         | Named             | Any               | Prohibit         |
+-------------------+-------------------+-------------------+------------------+
| Named             | Anonymous         | Any               | Prohibit         |
+-------------------+-------------------+-------------------+------------------+
| Named             | Named             | Any               | Allow            |
+-------------------+-------------------+-------------------+------------------+

Does 'Named' refer to a file mapping? Also, what does 'Prohibit' imply here?
Will the check trigger a BUG_ON() in such cases?

page_table_check_clear()
{

                if (anon) {
                        BUG_ON(atomic_read(&ptc->file_map_count));
                        BUG_ON(atomic_dec_return(&ptc->anon_map_count) < 0);
                } else {
                        BUG_ON(atomic_read(&ptc->anon_map_count));
                        BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0);
                }
}

So in the clear path, there are two checks

- If the current mapping is anon, file_map_count cannot be positive, and vice versa
- Decrement the applicable counter ensuring that it does not turn negative

page_table_check_set()
{
                if (anon) {
                        BUG_ON(atomic_read(&ptc->file_map_count));
                        BUG_ON(atomic_inc_return(&ptc->anon_map_count) > 1 && rw);
                } else {
                        BUG_ON(atomic_read(&ptc->anon_map_count));
                        BUG_ON(atomic_inc_return(&ptc->file_map_count) < 0);
                }
}

So in the set path, there are two checks

- If the current mapping is anon, file_map_count cannot be positive, and vice versa
- Anon mapping cannot be RW if the page has been mapped more than once
- But then why check for negative values for file_map_count after increment ?

Are there any other checks performed by this test that I might be missing?

> 
> The setting and clearing of page table entries are symmetrical.

Must this assumption hold for every user-accessible mapping in order for this test to work?

Also, why are PUD_PAGE_SIZE/PMD_PAGE_SIZE being used here instead of the
generic macros such as PUD_SIZE/PMD_SIZE? Is there a specific reason?

> 
> Here __page_table_check_pmd_clear() trigger BUGON which indicates that the pmd entry file mapping count has become negative.
> 
> I guess if PTC didn't detect this exception, would there have been any problems?

I am looking into this; I am not sure yet.
Anshuman Khandual April 19, 2022, 7:25 a.m. UTC | #5
On 4/18/22 21:50, Pasha Tatashin wrote:
> On Mon, Apr 18, 2022 at 11:47 AM Tong Tiangen <tongtiangen@huawei.com> wrote:
>>
>>
>>
>> 在 2022/4/18 17:28, Anshuman Khandual 写道:
>>> On 4/18/22 09:14, Tong Tiangen wrote:
>>>> From: Kefeng Wang <wangkefeng.wang@huawei.com>
>>>>
>> [...]
>>>>   #endif
>>>
>>> Ran this series on arm64 platform after enabling
>>>
>>> - CONFIG_PAGE_TABLE_CHECK
>>> - CONFIG_PAGE_TABLE_CHECK_ENFORCED (avoiding kernel command line option)
>>>
>>> After some time, the following error came up
>>>
>>> [   23.266013] ------------[ cut here ]------------
>>> [   23.266807] kernel BUG at mm/page_table_check.c:90!
>>> [   23.267609] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
>>> [   23.268503] Modules linked in:
>>> [   23.269012] CPU: 1 PID: 30 Comm: khugepaged Not tainted 5.18.0-rc3-00004-g60aa8e363a91 #2
>>> [   23.270383] Hardware name: linux,dummy-virt (DT)
>>> [   23.271210] pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
>>> [   23.272445] pc : page_table_check_clear.isra.6+0x114/0x148
>>> [   23.273429] lr : page_table_check_clear.isra.6+0x64/0x148
>>> [   23.274395] sp : ffff80000afb3ca0
>>> [   23.274994] x29: ffff80000afb3ca0 x28: fffffc00022558e8 x27: ffff80000a27f628
>>> [   23.276260] x26: ffff800009f9f2b0 x25: ffff00008a8d5000 x24: ffff800009f09fa0
>>> [   23.277527] x23: 0000ffff89e00000 x22: ffff800009f09fb8 x21: ffff000089414cc0
>>> [   23.278798] x20: 0000000000000200 x19: fffffc00022a0000 x18: 0000000000000001
>>> [   23.280066] x17: 0000000000000001 x16: 0000000000000000 x15: 0000000000000003
>>> [   23.281331] x14: 0000000000000068 x13: 00000000000000c0 x12: 0000000000000010
>>> [   23.282602] x11: fffffc0002320008 x10: fffffc0002320000 x9 : ffff800009fa1000
>>> [   23.283868] x8 : 00000000ffffffff x7 : 0000000000000001 x6 : ffff800009fa1f08
>>> [   23.285135] x5 : 0000000000000000 x4 : 0000000000000000 x3 : 0000000000000000
>>> [   23.286406] x2 : 00000000ffffffff x1 : ffff000080f2800c x0 : ffff000080f28000
>>> [   23.287673] Call trace:
>>> [   23.288123]  page_table_check_clear.isra.6+0x114/0x148
>>> [   23.289043]  __page_table_check_pmd_clear+0x3c/0x50
>>> [   23.289918]  pmdp_collapse_flush+0x114/0x370
>>> [   23.290692]  khugepaged+0x1170/0x19e0
>>> [   23.291356]  kthread+0x110/0x120
>>> [   23.291945]  ret_from_fork+0x10/0x20
>>> [   23.292596] Code: 91001041 b8e80024 51000482 36fffd62 (d4210000)
>>> [   23.293678] ---[ end trace 0000000000000000 ]---
>>> [   23.294511] note: khugepaged[30] exited with preempt_count 2
>>>
>>> Looking into file mm/page_table_check.c where this problem occured.
>>>
>>> /*
>>>   * An enty is removed from the page table, decrement the counters for that page
>>>   * verify that it is of correct type and counters do not become negative.
>>>   */
>>> static void page_table_check_clear(struct mm_struct *mm, unsigned long addr,
>>>                                     unsigned long pfn, unsigned long pgcnt)
>>> {
>>>          struct page_ext *page_ext;
>>>          struct page *page;
>>>          unsigned long i;
>>>          bool anon;
>>>
>>>          if (!pfn_valid(pfn))
>>>                  return;
>>>
>>>          page = pfn_to_page(pfn);
>>>          page_ext = lookup_page_ext(page);
>>>          anon = PageAnon(page);
>>>
>>>          for (i = 0; i < pgcnt; i++) {
>>>                  struct page_table_check *ptc = get_page_table_check(page_ext);
>>>
>>>                  if (anon) {
>>>                          BUG_ON(atomic_read(&ptc->file_map_count));
>>>                          BUG_ON(atomic_dec_return(&ptc->anon_map_count) < 0);
>>>                  } else {
>>>                          BUG_ON(atomic_read(&ptc->anon_map_count));
>>>   Triggered here ====>>  BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0);
>>>                  }
>>>                  page_ext = page_ext_next(page_ext);
>>>          }
>>> }
>>>
>>> Could you explain what was expected during pmdp_collapse_flush() which when
>>> failed, triggered this BUG_ON() ? This counter seems to be page table check
>>> specific, could it just go wrong ? I have not looked into the details about
>>> page table check mechanism.
>>>
>>> - Anshuman
>>> .
>>
>> Hi Anshuman:
>>
>> Thanks for your job.
>>
>> Let me briefly explain the principle of page table check(PTC).
>>
>> PTC introduces the following struct for page mapping type count:
>> struct page_table_check {
>>          atomic_t anon_map_count;
>>          atomic_t file_map_count;
>> };
>> This structure can be obtained by "lookup_page_ext(page)"
>>
>> When page table entries are set(pud/pmd/pte), page_table_check_set()  is
>> called to increase the page mapping count, Also check for errors (eg:if
>> a page is used for anonymous mapping, then the page cannot be used for
>> file mapping at the same time).
>>
>> When page table entries are clear(pud/pmd/pte), page_table_check_clear()
>>   is called to decrease the page mapping count, Also check for errors.
>>
>> The error check rules are described in the following documents:
>> Documentation/vm/page_table_check.rst
>>
>> The setting and clearing of page table entries are symmetrical.
>>
>> Here __page_table_check_pmd_clear() trigger BUGON which indicates that
>> the pmd entry file mapping count has become negative.
>>
>> I guess if PTC didn't detect this exception, would there have been any
>> problems?
> 
> It is hard to tell what sort of problem has been detected. More
> debugging is needed in order to understand it. A huge file entry is
> being removed from the page table. However, at least one sub page of
> that entry does not have a record that it was added as a file entry to

I guess PMD splitting scenarios should also be taken care of, as sub pages
will also go via the appropriate XXX_set_at() helpers ?

> the page table. At Google we found a few internal security bugs using
> PTCs. However, this being new on ARM64, it is possible that the bug is
> in PTC/khugepaged itself.
> 
> Anshuman is it possible to repro your scenario in QEMU?

I have been unable to reproduce this reported problem. Last time it just
happened after a fresh boot without anything in particular running. Will
continue experimenting.
Tong Tiangen April 19, 2022, 8:52 a.m. UTC | #6
在 2022/4/19 15:10, Anshuman Khandual 写道:
> 
> 
> On 4/18/22 21:17, Tong Tiangen wrote:
>>
>>
>> 在 2022/4/18 17:28, Anshuman Khandual 写道:
>>> On 4/18/22 09:14, Tong Tiangen wrote:
>>>> From: Kefeng Wang <wangkefeng.wang@huawei.com>
[...]
>>>
>>> Could you explain what was expected during pmdp_collapse_flush() which when
>>> failed, triggered this BUG_ON() ? This counter seems to be page table check
>>> specific, could it just go wrong ? I have not looked into the details about
>>> page table check mechanism.
>>>
>>> - Anshuman
>>> .
>>
>> Hi Anshuman:
>>
>> Thanks for your job.
>>
>> Let me briefly explain the principle of page table check(PTC).
>>
>> PTC introduces the following struct for page mapping type count:
>> struct page_table_check {
>>          atomic_t anon_map_count;
>>          atomic_t file_map_count;
>> };
>> This structure can be obtained by "lookup_page_ext(page)"
> 
> 
> Right.
> 
>>
>> When page table entries are set(pud/pmd/pte), page_table_check_set()  is called to increase the page mapping count, Also check for errors (eg:if a page is used for anonymous mapping, then the page cannot be used for file mapping at the same time).
>>
>> When page table entries are clear(pud/pmd/pte), page_table_check_clear()  is called to decrease the page mapping count, Also check for errors.
>>
>> The error check rules are described in the following documents: Documentation/vm/page_table_check.rst
> 
> Snippet from that document.
> 
> +-------------------+-------------------+-------------------+------------------+
> | Current Mapping   | New mapping       | Permissions       | Rule             |
> +===================+===================+===================+==================+
> | Anonymous         | Anonymous         | Read              | Allow            |
> +-------------------+-------------------+-------------------+------------------+
> | Anonymous         | Anonymous         | Read / Write      | Prohibit         |
> +-------------------+-------------------+-------------------+------------------+
> | Anonymous         | Named             | Any               | Prohibit         |
> +-------------------+-------------------+-------------------+------------------+
> | Named             | Anonymous         | Any               | Prohibit         |
> +-------------------+-------------------+-------------------+------------------+
> | Named             | Named             | Any               | Allow            |
> +-------------------+-------------------+-------------------+------------------+
> 
> Does 'Named' refer to file mapping ? Also what does 'Prohibit' imply here ? The
> check will call out a BUG_ON() in such cases ?

Right, Named means file mapping, and Prohibit here means triggering BUG_ON.

> 
> page_table_check_clear()
> {
> 
>                  if (anon) {
>                          BUG_ON(atomic_read(&ptc->file_map_count));
>                          BUG_ON(atomic_dec_return(&ptc->anon_map_count) < 0);
>                  } else {
>                          BUG_ON(atomic_read(&ptc->anon_map_count));
>                          BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0);
>                  }
> }
> 
> So in the clear path, there are two checks
> 
> - If the current mapping is Anon, file_map_count cannot be positive and other way
> - Decrement the applicable counter ensuring that it does not turn negative
> 
> page_table_check_set()
> {
>                  if (anon) {
>                          BUG_ON(atomic_read(&ptc->file_map_count));
>                          BUG_ON(atomic_inc_return(&ptc->anon_map_count) > 1 && rw);
>                  } else {
>                          BUG_ON(atomic_read(&ptc->anon_map_count));
>                          BUG_ON(atomic_inc_return(&ptc->file_map_count) < 0);
>                  }
> }
> 
> So in the set path, there are two checks
> 
> - If the current mapping is anon, file_map_count cannot be positive and other way
> - Anon mapping cannot be RW if the page has been mapped more than once
> - But then why check for negative values for file_map_count after increment ?

Checking for a negative value after the increment is logically OK, although
<= 0 would be more reasonable.

> 
> Is there any other checks, which this test ensures, that I might be missing ?

The following checks are performed when page table entries are
allocated/released:
__page_table_check_zero()
{
	BUG_ON(atomic_read(&ptc->anon_map_count));
	BUG_ON(atomic_read(&ptc->file_map_count));
}

> 
>>
>> The setting and clearing of page table entries are symmetrical.
> 
> This assumption should be true for any user accessible mapping, for this test to work ?

Right, if not, BUG_ON is triggered here.

However, as Pasha said:
"this being new on ARM64, it is possible that the bug is in 
PTC/khugepaged itself."

> 
> Also why PUD_PAGE_SIZE/PMD_PAGE_SIZE are being used here instead of directly using
> generic macros such as PUD_SIZE/PMD_SIZE ? Is there a specific reason ?

I did code optimization for this, in patch 1/4 of this patchset:

+#ifndef PMD_PAGE_SIZE
+#define PMD_PAGE_SIZE	PMD_SIZE
+#endif
+
+#ifndef PUD_PAGE_SIZE
+#define PUD_PAGE_SIZE	PUD_SIZE
+#endif


Thank you.
Tong.

> 
>>
>> Here __page_table_check_pmd_clear() trigger BUGON which indicates that the pmd entry file mapping count has become negative.
>>
>> I guess if PTC didn't detect this exception, would there have been any problems?
> 
> I am looking into this, not sure for now.
> .
Anshuman Khandual April 19, 2022, 10:22 a.m. UTC | #7
On 4/18/22 09:14, Tong Tiangen wrote:
> +#ifdef CONFIG_PAGE_TABLE_CHECK
> +static inline bool pte_user_accessible_page(pte_t pte)
> +{
> +	return pte_present(pte) && (pte_user(pte) || pte_user_exec(pte));
> +}
> +
> +static inline bool pmd_user_accessible_page(pmd_t pmd)
> +{
> +	return pmd_present(pmd) && (pmd_user(pmd) || pmd_user_exec(pmd));
> +}
> +
> +static inline bool pud_user_accessible_page(pud_t pud)
> +{
> +	return pud_present(pud) && pud_user(pud);
> +}
> +#endif
Wondering why check for these page table entry states when init_mm
has already been excluded ? Should not user page tables be checked
in their entirety for all updates ? What is the rationale for filtering
out only pxx_user_accessible_page entries ?
Pasha Tatashin April 19, 2022, 1:19 p.m. UTC | #8
On Tue, Apr 19, 2022 at 6:22 AM Anshuman Khandual
<anshuman.khandual@arm.com> wrote:
>
>
> On 4/18/22 09:14, Tong Tiangen wrote:
> > +#ifdef CONFIG_PAGE_TABLE_CHECK
> > +static inline bool pte_user_accessible_page(pte_t pte)
> > +{
> > +     return pte_present(pte) && (pte_user(pte) || pte_user_exec(pte));
> > +}
> > +
> > +static inline bool pmd_user_accessible_page(pmd_t pmd)
> > +{
> > +     return pmd_present(pmd) && (pmd_user(pmd) || pmd_user_exec(pmd));
> > +}
> > +
> > +static inline bool pud_user_accessible_page(pud_t pud)
> > +{
> > +     return pud_present(pud) && pud_user(pud);
> > +}
> > +#endif
> Wondering why check for these page table entry states when init_mm
> has already being excluded ? Should not user page tables be checked
> for in entirety for all updates ? what is the rationale for filtering
> out only pxx_user_access_page entries ?

The point is to prevent false sharing and memory corruption issues.
The idea is for PTC to be a simple checker, relatively independent from the
MM state machine, that catches invalid page sharing. I.e. if an R/W anon
page is accessible by user land, that page can never be mapped into
another process (internally shared anons are treated as named
mappings).

Therefore, we try not to rely on MM states, and ensure that when a
page-table entry is accessible by user it meets the required
assumptions: no false sharing, etc.

For example, one bug that was caught with PTC was where a driver on an
unload would put memory on a freelist but memory is still mapped in
user page table.

Pasha
Anshuman Khandual April 20, 2022, 5:05 a.m. UTC | #9
On 4/19/22 18:49, Pasha Tatashin wrote:
> On Tue, Apr 19, 2022 at 6:22 AM Anshuman Khandual
> <anshuman.khandual@arm.com> wrote:
>>
>>
>> On 4/18/22 09:14, Tong Tiangen wrote:
>>> +#ifdef CONFIG_PAGE_TABLE_CHECK
>>> +static inline bool pte_user_accessible_page(pte_t pte)
>>> +{
>>> +     return pte_present(pte) && (pte_user(pte) || pte_user_exec(pte));
>>> +}
>>> +
>>> +static inline bool pmd_user_accessible_page(pmd_t pmd)
>>> +{
>>> +     return pmd_present(pmd) && (pmd_user(pmd) || pmd_user_exec(pmd));
>>> +}
>>> +
>>> +static inline bool pud_user_accessible_page(pud_t pud)
>>> +{
>>> +     return pud_present(pud) && pud_user(pud);
>>> +}
>>> +#endif
>> Wondering why check for these page table entry states when init_mm
>> has already being excluded ? Should not user page tables be checked
>> for in entirety for all updates ? what is the rationale for filtering
>> out only pxx_user_access_page entries ?
> 
> The point is to prevent false sharing and memory corruption issues.
> The idea of PTC to be simple and relatively independent  from the MM
> state machine that catches invalid page sharing. I.e. if an R/W anon

Right, this mechanism here is truly independent validation, which is
orthogonal to other MM states. Although I was curious, if mm_struct is
not 'init_mm', what percentage of its total page table mapped entries
will be user accessible ? These new helpers only filter out entries that
could potentially create false sharing leading up to memory corruption ?

I am wondering if there is any other way such filtering could have been
applied without adding all these new page table helpers just for page
table check purpose.

> page is accessible by user land, that page can never be mapped into
> another process (internally shared anons are treated as named
> mappings).

Right.

> 
> Therefore, we try not to rely on MM states, and ensure that when a
> page-table entry is accessible by user it meets the required
> assumptions: no false sharing, etc.

Right, filtering reduces the page table entries that needs interception
during update (set/clear), but was just curious is there another way of
doing it, without adding page table check specific helpers on platforms
subscribing PAGE_TABLE_CHECK ?

> 
> For example, one bug that was caught with PTC was where a driver on an
> unload would put memory on a freelist but memory is still mapped in
> user page table.

Should not the page's refcount (showing that it is being used elsewhere)
have prevented its release into the free list ? But page table check here
might just detect such scenarios even before the page gets released.
Pasha Tatashin April 20, 2022, 5:08 p.m. UTC | #10
On Wed, Apr 20, 2022 at 1:05 AM Anshuman Khandual
<anshuman.khandual@arm.com> wrote:
>
>
>
> On 4/19/22 18:49, Pasha Tatashin wrote:
> > On Tue, Apr 19, 2022 at 6:22 AM Anshuman Khandual
> > <anshuman.khandual@arm.com> wrote:
> >>
> >>
> >> On 4/18/22 09:14, Tong Tiangen wrote:
> >>> +#ifdef CONFIG_PAGE_TABLE_CHECK
> >>> +static inline bool pte_user_accessible_page(pte_t pte)
> >>> +{
> >>> +     return pte_present(pte) && (pte_user(pte) || pte_user_exec(pte));
> >>> +}
> >>> +
> >>> +static inline bool pmd_user_accessible_page(pmd_t pmd)
> >>> +{
> >>> +     return pmd_present(pmd) && (pmd_user(pmd) || pmd_user_exec(pmd));
> >>> +}
> >>> +
> >>> +static inline bool pud_user_accessible_page(pud_t pud)
> >>> +{
> >>> +     return pud_present(pud) && pud_user(pud);
> >>> +}
> >>> +#endif
> >> Wondering why check for these page table entry states when init_mm
> >> has already being excluded ? Should not user page tables be checked
> >> for in entirety for all updates ? what is the rationale for filtering
> >> out only pxx_user_access_page entries ?
> >
> > The point is to prevent false sharing and memory corruption issues.
> > The idea of PTC to be simple and relatively independent  from the MM
> > state machine that catches invalid page sharing. I.e. if an R/W anon
>
> Right, this mechanism here is truly interdependent validation, which is
> orthogonal to other MM states. Although I was curious, if mm_struct is
> not 'init_mm', what percentage of its total page table mapped entries
> will be user accessible ? These new helpers only filter out entries that
> could potentially create false sharing leading upto memory corruption ?

Yes, the intention is to filter out the false sharing scenarios.
This allows crashing the system prior to memory corruption or memory
leaking.

>
> I am wondering if there is any other way such filtering could have been
> applied without adding all these new page table helpers just for page
> table check purpose.
>
> > page is accessible by user land, that page can never be mapped into
> > another process (internally shared anons are treated as named
> > mappings).
>
> Right.
>
> >
> > Therefore, we try not to rely on MM states, and ensure that when a
> > page-table entry is accessible by user it meets the required
> > assumptions: no false sharing, etc.
>
> Right, filtering reduces the page table entries that needs interception
> during update (set/clear), but was just curious is there another way of
> doing it, without adding page table check specific helpers on platforms
> subscribing PAGE_TABLE_CHECK ?
>

It makes sense to limit the scope of PTC only to user accessible
pages, and not try to catch other bugs. This keeps it reasonably
small, and also lowers runtime overhead so it can be used in
production as well. IMO the extra helpers are not very intrusive, and
generic enough that in the future might be used elsewhere as well.


> >
> > For example, one bug that was caught with PTC was where a driver on an
> > unload would put memory on a freelist but memory is still mapped in
> > user page table.
>
> Should not page's refcount (that it is being used else where) prevented
> releases into free list ? But page table check here might just detect
> such scenarios even before page gets released.

Usually yes. However, there are a number of recent bugs related to
refcount [1][2][3]. This is why we need a stronger checker.

The particular bug, however, did not rely on refcount. The driver
allocated a kernel page for a ringbuffer, upon request shared it with
userspace by mapping it into the user address space, and later, when
the driver was unloaded, it never removed the mapping from the user
address space. Thus, even though the page was freed when the driver
was unloaded, the mapping stayed in the user page table.

[1] https://lore.kernel.org/all/xr9335nxwc5y.fsf@gthelen2.svl.corp.google.com
[2] https://lore.kernel.org/all/1582661774-30925-2-git-send-email-akaher@vmware.com
[3] https://lore.kernel.org/all/20210622021423.154662-3-mike.kravetz@oracle.com
diff mbox series

Patch

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index e80fd2372f02..7114d2d5155e 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -92,6 +92,7 @@  config ARM64
 	select ARCH_SUPPORTS_ATOMIC_RMW
 	select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
 	select ARCH_SUPPORTS_NUMA_BALANCING
+	select ARCH_SUPPORTS_PAGE_TABLE_CHECK
 	select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
 	select ARCH_WANT_DEFAULT_BPF_JIT
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 930077f7b572..9f8f97a7cc7c 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -33,6 +33,7 @@ 
 #include <linux/mmdebug.h>
 #include <linux/mm_types.h>
 #include <linux/sched.h>
+#include <linux/page_table_check.h>
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
@@ -96,6 +97,7 @@  static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
 #define pte_young(pte)		(!!(pte_val(pte) & PTE_AF))
 #define pte_special(pte)	(!!(pte_val(pte) & PTE_SPECIAL))
 #define pte_write(pte)		(!!(pte_val(pte) & PTE_WRITE))
+#define pte_user(pte)		(!!(pte_val(pte) & PTE_USER))
 #define pte_user_exec(pte)	(!(pte_val(pte) & PTE_UXN))
 #define pte_cont(pte)		(!!(pte_val(pte) & PTE_CONT))
 #define pte_devmap(pte)		(!!(pte_val(pte) & PTE_DEVMAP))
@@ -312,7 +314,7 @@  static inline void __check_racy_pte_update(struct mm_struct *mm, pte_t *ptep,
 		     __func__, pte_val(old_pte), pte_val(pte));
 }
 
-static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
+static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
 			      pte_t *ptep, pte_t pte)
 {
 	if (pte_present(pte) && pte_user_exec(pte) && !pte_special(pte))
@@ -343,6 +345,13 @@  static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
 	set_pte(ptep, pte);
 }
 
+static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
+			      pte_t *ptep, pte_t pte)
+{
+	page_table_check_pte_set(mm, addr, ptep, pte);
+	return __set_pte_at(mm, addr, ptep, pte);
+}
+
 /*
  * Huge pte definitions.
  */
@@ -454,6 +463,8 @@  static inline int pmd_trans_huge(pmd_t pmd)
 #define pmd_dirty(pmd)		pte_dirty(pmd_pte(pmd))
 #define pmd_young(pmd)		pte_young(pmd_pte(pmd))
 #define pmd_valid(pmd)		pte_valid(pmd_pte(pmd))
+#define pmd_user(pmd)		pte_user(pmd_pte(pmd))
+#define pmd_user_exec(pmd)	pte_user_exec(pmd_pte(pmd))
 #define pmd_cont(pmd)		pte_cont(pmd_pte(pmd))
 #define pmd_wrprotect(pmd)	pte_pmd(pte_wrprotect(pmd_pte(pmd)))
 #define pmd_mkold(pmd)		pte_pmd(pte_mkold(pmd_pte(pmd)))
@@ -501,8 +512,19 @@  static inline pmd_t pmd_mkdevmap(pmd_t pmd)
 #define pud_pfn(pud)		((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT)
 #define pfn_pud(pfn,prot)	__pud(__phys_to_pud_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))
 
-#define set_pmd_at(mm, addr, pmdp, pmd)	set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd))
-#define set_pud_at(mm, addr, pudp, pud)	set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud))
+static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+			      pmd_t *pmdp, pmd_t pmd)
+{
+	page_table_check_pmd_set(mm, addr, pmdp, pmd);
+	return __set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd));
+}
+
+static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
+			      pud_t *pudp, pud_t pud)
+{
+	page_table_check_pud_set(mm, addr, pudp, pud);
+	return __set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud));
+}
 
 #define __p4d_to_phys(p4d)	__pte_to_phys(p4d_pte(p4d))
 #define __phys_to_p4d_val(phys)	__phys_to_pte_val(phys)
@@ -643,6 +665,24 @@  static inline unsigned long pmd_page_vaddr(pmd_t pmd)
 #define pud_present(pud)	pte_present(pud_pte(pud))
 #define pud_leaf(pud)		pud_sect(pud)
 #define pud_valid(pud)		pte_valid(pud_pte(pud))
+#define pud_user(pud)		pte_user(pud_pte(pud))
+
+#ifdef CONFIG_PAGE_TABLE_CHECK
+static inline bool pte_user_accessible_page(pte_t pte)
+{
+	return pte_present(pte) && (pte_user(pte) || pte_user_exec(pte));
+}
+
+static inline bool pmd_user_accessible_page(pmd_t pmd)
+{
+	return pmd_present(pmd) && (pmd_user(pmd) || pmd_user_exec(pmd));
+}
+
+static inline bool pud_user_accessible_page(pud_t pud)
+{
+	return pud_present(pud) && pud_user(pud);
+}
+#endif
 
 static inline void set_pud(pud_t *pudp, pud_t pud)
 {
@@ -872,11 +912,21 @@  static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+static inline pte_t __ptep_get_and_clear(struct mm_struct *mm,
+				       unsigned long address, pte_t *ptep)
+{
+	return __pte(xchg_relaxed(&pte_val(*ptep), 0));
+}
+
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 				       unsigned long address, pte_t *ptep)
 {
-	return __pte(xchg_relaxed(&pte_val(*ptep), 0));
+	pte_t pte = __ptep_get_and_clear(mm, address, ptep);
+
+	page_table_check_pte_clear(mm, address, pte);
+
+	return pte;
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -884,7 +934,11 @@  static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 					    unsigned long address, pmd_t *pmdp)
 {
-	return pte_pmd(ptep_get_and_clear(mm, address, (pte_t *)pmdp));
+	pmd_t pmd = pte_pmd(__ptep_get_and_clear(mm, address, (pte_t *)pmdp));
+
+	page_table_check_pmd_clear(mm, address, pmd);
+
+	return pmd;
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
@@ -918,6 +972,7 @@  static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
 		unsigned long address, pmd_t *pmdp, pmd_t pmd)
 {
+	page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd);
 	return __pmd(xchg_relaxed(&pmd_val(*pmdp), pmd_val(pmd)));
 }
 #endif