diff mbox series

[v2] fs/proc: task_mmu.c: don't read mapcount for migration entry

Message ID 20220120202805.3369-1-shy828301@gmail.com (mailing list archive)
State New
Headers show
Series [v2] fs/proc: task_mmu.c: don't read mapcount for migration entry | expand

Commit Message

Yang Shi Jan. 20, 2022, 8:28 p.m. UTC
The syzbot reported the below BUG:

kernel BUG at include/linux/page-flags.h:785!
invalid opcode: 0000 [#1] PREEMPT SMP KASAN
CPU: 1 PID: 4392 Comm: syz-executor560 Not tainted 5.16.0-rc6-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline]
RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744
Code: e8 d3 16 d1 ff 48 c7 c6 c0 00 b6 89 48 89 ef e8 94 4e 04 00 0f 0b e8 bd 16 d1 ff 48 c7 c6 60 01 b6 89 48 89 ef e8 7e 4e 04 00 <0f> 0b e8 a7 16 d1 ff 48 c7 c6 a0 01 b6 89 4c 89 f7 e8 68 4e 04 00
RSP: 0018:ffffc90002b6f7b8 EFLAGS: 00010293
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
RDX: ffff888019619d00 RSI: ffffffff81a68c12 RDI: 0000000000000003
RBP: ffffea0001bdc2c0 R08: 0000000000000029 R09: 00000000ffffffff
R10: ffffffff8903e29f R11: 00000000ffffffff R12: 00000000ffffffff
R13: 00000000ffffea00 R14: ffffc90002b6fb30 R15: ffffea0001bd8001
FS:  00007faa2aefd700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fff7e663318 CR3: 0000000018c6e000 CR4: 00000000003506e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 <TASK>
 page_mapcount include/linux/mm.h:837 [inline]
 smaps_account+0x470/0xb10 fs/proc/task_mmu.c:466
 smaps_pte_entry fs/proc/task_mmu.c:538 [inline]
 smaps_pte_range+0x611/0x1250 fs/proc/task_mmu.c:601
 walk_pmd_range mm/pagewalk.c:128 [inline]
 walk_pud_range mm/pagewalk.c:205 [inline]
 walk_p4d_range mm/pagewalk.c:240 [inline]
 walk_pgd_range mm/pagewalk.c:277 [inline]
 __walk_page_range+0xe23/0x1ea0 mm/pagewalk.c:379
 walk_page_vma+0x277/0x350 mm/pagewalk.c:530
 smap_gather_stats.part.0+0x148/0x260 fs/proc/task_mmu.c:768
 smap_gather_stats fs/proc/task_mmu.c:741 [inline]
 show_smap+0xc6/0x440 fs/proc/task_mmu.c:822
 seq_read_iter+0xbb0/0x1240 fs/seq_file.c:272
 seq_read+0x3e0/0x5b0 fs/seq_file.c:162
 vfs_read+0x1b5/0x600 fs/read_write.c:479
 ksys_read+0x12d/0x250 fs/read_write.c:619
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x7faa2af6c969
Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 11 15 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007faa2aefd288 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
RAX: ffffffffffffffda RBX: 00007faa2aff4418 RCX: 00007faa2af6c969
RDX: 0000000000002025 RSI: 0000000020000100 RDI: 0000000000000003
RBP: 00007faa2aff4410 R08: 00007faa2aefd700 R09: 0000000000000000
R10: 00007faa2aefd700 R11: 0000000000000246 R12: 00007faa2afc20ac
R13: 00007fff7e6632bf R14: 00007faa2aefd400 R15: 0000000000022000
 </TASK>
Modules linked in:
---[ end trace 24ec93ff95e4ac3d ]---
RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline]
RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744
Code: e8 d3 16 d1 ff 48 c7 c6 c0 00 b6 89 48 89 ef e8 94 4e 04 00 0f 0b e8 bd 16 d1 ff 48 c7 c6 60 01 b6 89 48 89 ef e8 7e 4e 04 00 <0f> 0b e8 a7 16 d1 ff 48 c7 c6 a0 01 b6 89 4c 89 f7 e8 68 4e 04 00
RSP: 0018:ffffc90002b6f7b8 EFLAGS: 00010293
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
RDX: ffff888019619d00 RSI: ffffffff81a68c12 RDI: 0000000000000003
RBP: ffffea0001bdc2c0 R08: 0000000000000029 R09: 00000000ffffffff
R10: ffffffff8903e29f R11: 00000000ffffffff R12: 00000000ffffffff
R13: 00000000ffffea00 R14: ffffc90002b6fb30 R15: ffffea0001bd8001
FS:  00007faa2aefd700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fff7e663318 CR3: 0000000018c6e000 CR4: 00000000003506e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400

The reproducer was trying to read /proc/$PID/smaps while calling
MADV_FREE at the same time.  MADV_FREE may split THPs if it is called
on a partial THP.  This may trigger the race below:

         CPU A                         CPU B
         -----                         -----
smaps walk:                      MADV_FREE:
page_mapcount()
  PageCompound()
                                 split_huge_page()
  page = compound_head(page)
  PageDoubleMap(page)

When calling PageDoubleMap() this page is not a tail page of THP anymore
so the BUG is triggered.

This could be fixed by elevating the refcount of the page before reading
the mapcount, but that prevents migration entries from being counted, and
it seems like overkill because the race can only happen when the PMD is
split, so all PTE entries of tail pages are actually migration entries, and
smaps_account() does treat migration entries as mapcount == 1, as Kirill
pointed out.

Add a new parameter to smaps_account() to indicate that the entry is a
migration entry, and skip calling page_mapcount() in that case.  Don't skip
getting the mapcount for device private entries, since they do track
references with mapcount.

Fixes: b1d4d9e0cbd0 ("proc/smaps: carefully handle migration entries")
Reported-by: syzbot+1f52b3a18d5633fa7f82@syzkaller.appspotmail.com
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Yang Shi <shy828301@gmail.com>
---
v2: * Added proper fix tag per Jann Horn
    * Rebased to the latest linus's tree

 fs/proc/task_mmu.c | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

Comments

Yang Shi Jan. 26, 2022, 1:59 a.m. UTC | #1
On Thu, Jan 20, 2022 at 12:28 PM Yang Shi <shy828301@gmail.com> wrote:
>
> The syzbot reported the below BUG:
>
> kernel BUG at include/linux/page-flags.h:785!
> invalid opcode: 0000 [#1] PREEMPT SMP KASAN
> CPU: 1 PID: 4392 Comm: syz-executor560 Not tainted 5.16.0-rc6-syzkaller #0
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
> RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline]
> RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744
> Code: e8 d3 16 d1 ff 48 c7 c6 c0 00 b6 89 48 89 ef e8 94 4e 04 00 0f 0b e8 bd 16 d1 ff 48 c7 c6 60 01 b6 89 48 89 ef e8 7e 4e 04 00 <0f> 0b e8 a7 16 d1 ff 48 c7 c6 a0 01 b6 89 4c 89 f7 e8 68 4e 04 00
> RSP: 0018:ffffc90002b6f7b8 EFLAGS: 00010293
> RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
> RDX: ffff888019619d00 RSI: ffffffff81a68c12 RDI: 0000000000000003
> RBP: ffffea0001bdc2c0 R08: 0000000000000029 R09: 00000000ffffffff
> R10: ffffffff8903e29f R11: 00000000ffffffff R12: 00000000ffffffff
> R13: 00000000ffffea00 R14: ffffc90002b6fb30 R15: ffffea0001bd8001
> FS:  00007faa2aefd700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00007fff7e663318 CR3: 0000000018c6e000 CR4: 00000000003506e0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> Call Trace:
>  <TASK>
>  page_mapcount include/linux/mm.h:837 [inline]
>  smaps_account+0x470/0xb10 fs/proc/task_mmu.c:466
>  smaps_pte_entry fs/proc/task_mmu.c:538 [inline]
>  smaps_pte_range+0x611/0x1250 fs/proc/task_mmu.c:601
>  walk_pmd_range mm/pagewalk.c:128 [inline]
>  walk_pud_range mm/pagewalk.c:205 [inline]
>  walk_p4d_range mm/pagewalk.c:240 [inline]
>  walk_pgd_range mm/pagewalk.c:277 [inline]
>  __walk_page_range+0xe23/0x1ea0 mm/pagewalk.c:379
>  walk_page_vma+0x277/0x350 mm/pagewalk.c:530
>  smap_gather_stats.part.0+0x148/0x260 fs/proc/task_mmu.c:768
>  smap_gather_stats fs/proc/task_mmu.c:741 [inline]
>  show_smap+0xc6/0x440 fs/proc/task_mmu.c:822
>  seq_read_iter+0xbb0/0x1240 fs/seq_file.c:272
>  seq_read+0x3e0/0x5b0 fs/seq_file.c:162
>  vfs_read+0x1b5/0x600 fs/read_write.c:479
>  ksys_read+0x12d/0x250 fs/read_write.c:619
>  do_syscall_x64 arch/x86/entry/common.c:50 [inline]
>  do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
>  entry_SYSCALL_64_after_hwframe+0x44/0xae
> RIP: 0033:0x7faa2af6c969
> Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 11 15 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48
> RSP: 002b:00007faa2aefd288 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
> RAX: ffffffffffffffda RBX: 00007faa2aff4418 RCX: 00007faa2af6c969
> RDX: 0000000000002025 RSI: 0000000020000100 RDI: 0000000000000003
> RBP: 00007faa2aff4410 R08: 00007faa2aefd700 R09: 0000000000000000
> R10: 00007faa2aefd700 R11: 0000000000000246 R12: 00007faa2afc20ac
> R13: 00007fff7e6632bf R14: 00007faa2aefd400 R15: 0000000000022000
>  </TASK>
> Modules linked in:
> ---[ end trace 24ec93ff95e4ac3d ]---
> RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline]
> RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744
> Code: e8 d3 16 d1 ff 48 c7 c6 c0 00 b6 89 48 89 ef e8 94 4e 04 00 0f 0b e8 bd 16 d1 ff 48 c7 c6 60 01 b6 89 48 89 ef e8 7e 4e 04 00 <0f> 0b e8 a7 16 d1 ff 48 c7 c6 a0 01 b6 89 4c 89 f7 e8 68 4e 04 00
> RSP: 0018:ffffc90002b6f7b8 EFLAGS: 00010293
> RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
> RDX: ffff888019619d00 RSI: ffffffff81a68c12 RDI: 0000000000000003
> RBP: ffffea0001bdc2c0 R08: 0000000000000029 R09: 00000000ffffffff
> R10: ffffffff8903e29f R11: 00000000ffffffff R12: 00000000ffffffff
> R13: 00000000ffffea00 R14: ffffc90002b6fb30 R15: ffffea0001bd8001
> FS:  00007faa2aefd700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00007fff7e663318 CR3: 0000000018c6e000 CR4: 00000000003506e0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>
> The reproducer was trying to reading /proc/$PID/smaps when calling
> MADV_FREE at the mean time.  MADV_FREE may split THPs if it is called
> for partial THP.  It may trigger the below race:
>
>          CPU A                         CPU B
>          -----                         -----
> smaps walk:                      MADV_FREE:
> page_mapcount()
>   PageCompound()
>                                  split_huge_page()
>   page = compound_head(page)
>   PageDoubleMap(page)
>
> When calling PageDoubleMap() this page is not a tail page of THP anymore
> so the BUG is triggered.
>
> This could be fixed by elevated refcount of the page before calling
> mapcount, but it prevents from counting migration entries, and it seems
> overkilling because the race just could happen when PMD is split so all
> PTE entries of tail pages are actually migration entries, and
> smaps_account() does treat migration entries as mapcount == 1 as Kirill
> pointed out.
>
> Add a new parameter for smaps_account() to tell this entry is migration
> entry then skip calling page_mapcount().  Don't skip getting mapcount for
> device private entries since they do track references with mapcount.
>
> Fixes: b1d4d9e0cbd0 ("proc/smaps: carefully handle migration entries")

It just occurred to me that this commit could not have introduced this
race condition, since THP split didn't use migration entries at all at
that time. Using migration entries for THP split was introduced during
the v4.5 release by commits ba98828088ad ("thp: add option to setup
migration entries during PMD split") and e9b61f19858a ("thp: reintroduce
split_huge_page()"). And PageDoubleMap was introduced by commit
53f9263baba6 ("mm: rework mapcount accounting to enable 4k mapping of
THPs") right before those two commits.
Commit e9b61f19858a ("thp: reintroduce split_huge_page()") is the one
that finally converted THP split to use migration entries. So this race
condition seems possible only after it.

The proper fixes tag should be:
Fixes: e9b61f19858a ("thp: reintroduce split_huge_page()").

Andrew, could you please update this in -mm tree? Thanks.

> Reported-by: syzbot+1f52b3a18d5633fa7f82@syzkaller.appspotmail.com
> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> Cc: Jann Horn <jannh@google.com>
> Cc: Matthew Wilcox <willy@infradead.org>
> Cc: <stable@vger.kernel.org>
> Signed-off-by: Yang Shi <shy828301@gmail.com>
> ---
> v2: * Added proper fix tag per Jann Horn
>     * Rebased to the latest linus's tree
>
>  fs/proc/task_mmu.c | 27 +++++++++++++++++++++------
>  1 file changed, 21 insertions(+), 6 deletions(-)
>
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 18f8c3acbb85..2bb567014d77 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -440,7 +440,8 @@ static void smaps_page_accumulate(struct mem_size_stats *mss,
>  }
>
>  static void smaps_account(struct mem_size_stats *mss, struct page *page,
> -               bool compound, bool young, bool dirty, bool locked)
> +               bool compound, bool young, bool dirty, bool locked,
> +               bool migration)
>  {
>         int i, nr = compound ? compound_nr(page) : 1;
>         unsigned long size = nr * PAGE_SIZE;
> @@ -467,8 +468,12 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
>          * page_count(page) == 1 guarantees the page is mapped exactly once.
>          * If any subpage of the compound page mapped with PTE it would elevate
>          * page_count().
> +        *
> +        * Treated regular migration entries as mapcount == 1 without reading
> +        * mapcount since calling page_mapcount() for migration entries is
> +        * racy against THP splitting.
>          */
> -       if (page_count(page) == 1) {
> +       if ((page_count(page) == 1) || migration) {
>                 smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty,
>                         locked, true);
>                 return;
> @@ -517,6 +522,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
>         struct vm_area_struct *vma = walk->vma;
>         bool locked = !!(vma->vm_flags & VM_LOCKED);
>         struct page *page = NULL;
> +       bool migration = false;
>
>         if (pte_present(*pte)) {
>                 page = vm_normal_page(vma, addr, *pte);
> @@ -536,8 +542,11 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
>                         } else {
>                                 mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
>                         }
> -               } else if (is_pfn_swap_entry(swpent))
> +               } else if (is_pfn_swap_entry(swpent)) {
> +                       if (is_migration_entry(swpent))
> +                               migration = true;
>                         page = pfn_swap_entry_to_page(swpent);
> +               }
>         } else {
>                 smaps_pte_hole_lookup(addr, walk);
>                 return;
> @@ -546,7 +555,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
>         if (!page)
>                 return;
>
> -       smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked);
> +       smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte),
> +                     locked, migration);
>  }
>
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> @@ -557,6 +567,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
>         struct vm_area_struct *vma = walk->vma;
>         bool locked = !!(vma->vm_flags & VM_LOCKED);
>         struct page *page = NULL;
> +       bool migration = false;
>
>         if (pmd_present(*pmd)) {
>                 /* FOLL_DUMP will return -EFAULT on huge zero page */
> @@ -564,8 +575,10 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
>         } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
>                 swp_entry_t entry = pmd_to_swp_entry(*pmd);
>
> -               if (is_migration_entry(entry))
> +               if (is_migration_entry(entry)) {
> +                       migration = true;
>                         page = pfn_swap_entry_to_page(entry);
> +               }
>         }
>         if (IS_ERR_OR_NULL(page))
>                 return;
> @@ -577,7 +590,9 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
>                 /* pass */;
>         else
>                 mss->file_thp += HPAGE_PMD_SIZE;
> -       smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked);
> +
> +       smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
> +                     locked, migration);
>  }
>  #else
>  static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
> --
> 2.26.3
>
Andrew Morton Jan. 26, 2022, 2:08 a.m. UTC | #2
On Tue, 25 Jan 2022 17:59:48 -0800 Yang Shi <shy828301@gmail.com> wrote:

> The proper fixes tag should be:
> Fixes: e9b61f19858a ("thp: reintroduce split_huge_page()").
> 
> Andrew, could you please update this in -mm tree? Thanks.

Done, thanks.
David Hildenbrand Jan. 26, 2022, 10:51 a.m. UTC | #3
On 20.01.22 21:28, Yang Shi wrote:
> The syzbot reported the below BUG:
> 
> kernel BUG at include/linux/page-flags.h:785!
> invalid opcode: 0000 [#1] PREEMPT SMP KASAN
> CPU: 1 PID: 4392 Comm: syz-executor560 Not tainted 5.16.0-rc6-syzkaller #0
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
> RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline]
> RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744
> Code: e8 d3 16 d1 ff 48 c7 c6 c0 00 b6 89 48 89 ef e8 94 4e 04 00 0f 0b e8 bd 16 d1 ff 48 c7 c6 60 01 b6 89 48 89 ef e8 7e 4e 04 00 <0f> 0b e8 a7 16 d1 ff 48 c7 c6 a0 01 b6 89 4c 89 f7 e8 68 4e 04 00
> RSP: 0018:ffffc90002b6f7b8 EFLAGS: 00010293
> RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
> RDX: ffff888019619d00 RSI: ffffffff81a68c12 RDI: 0000000000000003
> RBP: ffffea0001bdc2c0 R08: 0000000000000029 R09: 00000000ffffffff
> R10: ffffffff8903e29f R11: 00000000ffffffff R12: 00000000ffffffff
> R13: 00000000ffffea00 R14: ffffc90002b6fb30 R15: ffffea0001bd8001
> FS:  00007faa2aefd700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00007fff7e663318 CR3: 0000000018c6e000 CR4: 00000000003506e0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> Call Trace:
>  <TASK>
>  page_mapcount include/linux/mm.h:837 [inline]
>  smaps_account+0x470/0xb10 fs/proc/task_mmu.c:466
>  smaps_pte_entry fs/proc/task_mmu.c:538 [inline]
>  smaps_pte_range+0x611/0x1250 fs/proc/task_mmu.c:601
>  walk_pmd_range mm/pagewalk.c:128 [inline]
>  walk_pud_range mm/pagewalk.c:205 [inline]
>  walk_p4d_range mm/pagewalk.c:240 [inline]
>  walk_pgd_range mm/pagewalk.c:277 [inline]
>  __walk_page_range+0xe23/0x1ea0 mm/pagewalk.c:379
>  walk_page_vma+0x277/0x350 mm/pagewalk.c:530
>  smap_gather_stats.part.0+0x148/0x260 fs/proc/task_mmu.c:768
>  smap_gather_stats fs/proc/task_mmu.c:741 [inline]
>  show_smap+0xc6/0x440 fs/proc/task_mmu.c:822
>  seq_read_iter+0xbb0/0x1240 fs/seq_file.c:272
>  seq_read+0x3e0/0x5b0 fs/seq_file.c:162
>  vfs_read+0x1b5/0x600 fs/read_write.c:479
>  ksys_read+0x12d/0x250 fs/read_write.c:619
>  do_syscall_x64 arch/x86/entry/common.c:50 [inline]
>  do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
>  entry_SYSCALL_64_after_hwframe+0x44/0xae
> RIP: 0033:0x7faa2af6c969
> Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 11 15 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48
> RSP: 002b:00007faa2aefd288 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
> RAX: ffffffffffffffda RBX: 00007faa2aff4418 RCX: 00007faa2af6c969
> RDX: 0000000000002025 RSI: 0000000020000100 RDI: 0000000000000003
> RBP: 00007faa2aff4410 R08: 00007faa2aefd700 R09: 0000000000000000
> R10: 00007faa2aefd700 R11: 0000000000000246 R12: 00007faa2afc20ac
> R13: 00007fff7e6632bf R14: 00007faa2aefd400 R15: 0000000000022000
>  </TASK>
> Modules linked in:
> ---[ end trace 24ec93ff95e4ac3d ]---
> RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline]
> RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744
> Code: e8 d3 16 d1 ff 48 c7 c6 c0 00 b6 89 48 89 ef e8 94 4e 04 00 0f 0b e8 bd 16 d1 ff 48 c7 c6 60 01 b6 89 48 89 ef e8 7e 4e 04 00 <0f> 0b e8 a7 16 d1 ff 48 c7 c6 a0 01 b6 89 4c 89 f7 e8 68 4e 04 00
> RSP: 0018:ffffc90002b6f7b8 EFLAGS: 00010293
> RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
> RDX: ffff888019619d00 RSI: ffffffff81a68c12 RDI: 0000000000000003
> RBP: ffffea0001bdc2c0 R08: 0000000000000029 R09: 00000000ffffffff
> R10: ffffffff8903e29f R11: 00000000ffffffff R12: 00000000ffffffff
> R13: 00000000ffffea00 R14: ffffc90002b6fb30 R15: ffffea0001bd8001
> FS:  00007faa2aefd700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00007fff7e663318 CR3: 0000000018c6e000 CR4: 00000000003506e0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> 

Does this point at the bigger issue that reading the mapcount without
having the page locked is completely unstable?
Jann Horn Jan. 26, 2022, 11:29 a.m. UTC | #4
On Wed, Jan 26, 2022 at 11:51 AM David Hildenbrand <david@redhat.com> wrote:
> On 20.01.22 21:28, Yang Shi wrote:
> > The syzbot reported the below BUG:
> >
> > kernel BUG at include/linux/page-flags.h:785!
> > invalid opcode: 0000 [#1] PREEMPT SMP KASAN
> > CPU: 1 PID: 4392 Comm: syz-executor560 Not tainted 5.16.0-rc6-syzkaller #0
> > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
> > RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline]
> > RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744
> > Code: e8 d3 16 d1 ff 48 c7 c6 c0 00 b6 89 48 89 ef e8 94 4e 04 00 0f 0b e8 bd 16 d1 ff 48 c7 c6 60 01 b6 89 48 89 ef e8 7e 4e 04 00 <0f> 0b e8 a7 16 d1 ff 48 c7 c6 a0 01 b6 89 4c 89 f7 e8 68 4e 04 00
> > RSP: 0018:ffffc90002b6f7b8 EFLAGS: 00010293
> > RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
> > RDX: ffff888019619d00 RSI: ffffffff81a68c12 RDI: 0000000000000003
> > RBP: ffffea0001bdc2c0 R08: 0000000000000029 R09: 00000000ffffffff
> > R10: ffffffff8903e29f R11: 00000000ffffffff R12: 00000000ffffffff
> > R13: 00000000ffffea00 R14: ffffc90002b6fb30 R15: ffffea0001bd8001
> > FS:  00007faa2aefd700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000
> > CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > CR2: 00007fff7e663318 CR3: 0000000018c6e000 CR4: 00000000003506e0
> > DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> > DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> > Call Trace:
> >  <TASK>
> >  page_mapcount include/linux/mm.h:837 [inline]
> >  smaps_account+0x470/0xb10 fs/proc/task_mmu.c:466
> >  smaps_pte_entry fs/proc/task_mmu.c:538 [inline]
> >  smaps_pte_range+0x611/0x1250 fs/proc/task_mmu.c:601
> >  walk_pmd_range mm/pagewalk.c:128 [inline]
> >  walk_pud_range mm/pagewalk.c:205 [inline]
> >  walk_p4d_range mm/pagewalk.c:240 [inline]
> >  walk_pgd_range mm/pagewalk.c:277 [inline]
> >  __walk_page_range+0xe23/0x1ea0 mm/pagewalk.c:379
> >  walk_page_vma+0x277/0x350 mm/pagewalk.c:530
> >  smap_gather_stats.part.0+0x148/0x260 fs/proc/task_mmu.c:768
> >  smap_gather_stats fs/proc/task_mmu.c:741 [inline]
> >  show_smap+0xc6/0x440 fs/proc/task_mmu.c:822
> >  seq_read_iter+0xbb0/0x1240 fs/seq_file.c:272
> >  seq_read+0x3e0/0x5b0 fs/seq_file.c:162
> >  vfs_read+0x1b5/0x600 fs/read_write.c:479
> >  ksys_read+0x12d/0x250 fs/read_write.c:619
> >  do_syscall_x64 arch/x86/entry/common.c:50 [inline]
> >  do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
> >  entry_SYSCALL_64_after_hwframe+0x44/0xae
> > RIP: 0033:0x7faa2af6c969
> > Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 11 15 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48
> > RSP: 002b:00007faa2aefd288 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
> > RAX: ffffffffffffffda RBX: 00007faa2aff4418 RCX: 00007faa2af6c969
> > RDX: 0000000000002025 RSI: 0000000020000100 RDI: 0000000000000003
> > RBP: 00007faa2aff4410 R08: 00007faa2aefd700 R09: 0000000000000000
> > R10: 00007faa2aefd700 R11: 0000000000000246 R12: 00007faa2afc20ac
> > R13: 00007fff7e6632bf R14: 00007faa2aefd400 R15: 0000000000022000
> >  </TASK>
> > Modules linked in:
> > ---[ end trace 24ec93ff95e4ac3d ]---
> > RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline]
> > RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744
> > Code: e8 d3 16 d1 ff 48 c7 c6 c0 00 b6 89 48 89 ef e8 94 4e 04 00 0f 0b e8 bd 16 d1 ff 48 c7 c6 60 01 b6 89 48 89 ef e8 7e 4e 04 00 <0f> 0b e8 a7 16 d1 ff 48 c7 c6 a0 01 b6 89 4c 89 f7 e8 68 4e 04 00
> > RSP: 0018:ffffc90002b6f7b8 EFLAGS: 00010293
> > RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
> > RDX: ffff888019619d00 RSI: ffffffff81a68c12 RDI: 0000000000000003
> > RBP: ffffea0001bdc2c0 R08: 0000000000000029 R09: 00000000ffffffff
> > R10: ffffffff8903e29f R11: 00000000ffffffff R12: 00000000ffffffff
> > R13: 00000000ffffea00 R14: ffffc90002b6fb30 R15: ffffea0001bd8001
> > FS:  00007faa2aefd700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000
> > CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > CR2: 00007fff7e663318 CR3: 0000000018c6e000 CR4: 00000000003506e0
> > DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> > DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> >
>
> Does this point at the bigger issue that reading the mapcount without
> having the page locked is completely unstable?

(See also https://lore.kernel.org/all/CAG48ez0M=iwJu=Q8yUQHD-+eZDg6ZF8QCF86Sb=CN1petP=Y0Q@mail.gmail.com/
for context.)

I'm not sure what you mean by "unstable". Do you mean "the result is
not guaranteed to still be valid when the call returns", "the result
might not have ever been valid", or "the call might crash because the
page's state as a compound page is unstable"?

In case you mean "the result is not guaranteed to still be valid when
the call returns":
We're just collecting stats for userspace, and by the time we return
to userspace, the numbers will be outdated anyway, so that doesn't
matter.

In case you mean "the result might not have ever been valid":
Yes, even with this patch applied, in theory concurrent THP splits
could cause us to count some page mappings twice. Arguably that's not
entirely correct.

In case you mean "the call might crash because the page's state as a
compound page could concurrently change":
As long as we have our own mapping of the page, the page can't be
split, so this patch fixes that problem.
David Hildenbrand Jan. 26, 2022, 11:38 a.m. UTC | #5
On 26.01.22 12:29, Jann Horn wrote:
> On Wed, Jan 26, 2022 at 11:51 AM David Hildenbrand <david@redhat.com> wrote:
>> On 20.01.22 21:28, Yang Shi wrote:
>>> The syzbot reported the below BUG:
>>>
>>> kernel BUG at include/linux/page-flags.h:785!
>>> invalid opcode: 0000 [#1] PREEMPT SMP KASAN
>>> CPU: 1 PID: 4392 Comm: syz-executor560 Not tainted 5.16.0-rc6-syzkaller #0
>>> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
>>> RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline]
>>> RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744
>>> Code: e8 d3 16 d1 ff 48 c7 c6 c0 00 b6 89 48 89 ef e8 94 4e 04 00 0f 0b e8 bd 16 d1 ff 48 c7 c6 60 01 b6 89 48 89 ef e8 7e 4e 04 00 <0f> 0b e8 a7 16 d1 ff 48 c7 c6 a0 01 b6 89 4c 89 f7 e8 68 4e 04 00
>>> RSP: 0018:ffffc90002b6f7b8 EFLAGS: 00010293
>>> RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
>>> RDX: ffff888019619d00 RSI: ffffffff81a68c12 RDI: 0000000000000003
>>> RBP: ffffea0001bdc2c0 R08: 0000000000000029 R09: 00000000ffffffff
>>> R10: ffffffff8903e29f R11: 00000000ffffffff R12: 00000000ffffffff
>>> R13: 00000000ffffea00 R14: ffffc90002b6fb30 R15: ffffea0001bd8001
>>> FS:  00007faa2aefd700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000
>>> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>> CR2: 00007fff7e663318 CR3: 0000000018c6e000 CR4: 00000000003506e0
>>> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>>> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>>> Call Trace:
>>>  <TASK>
>>>  page_mapcount include/linux/mm.h:837 [inline]
>>>  smaps_account+0x470/0xb10 fs/proc/task_mmu.c:466
>>>  smaps_pte_entry fs/proc/task_mmu.c:538 [inline]
>>>  smaps_pte_range+0x611/0x1250 fs/proc/task_mmu.c:601
>>>  walk_pmd_range mm/pagewalk.c:128 [inline]
>>>  walk_pud_range mm/pagewalk.c:205 [inline]
>>>  walk_p4d_range mm/pagewalk.c:240 [inline]
>>>  walk_pgd_range mm/pagewalk.c:277 [inline]
>>>  __walk_page_range+0xe23/0x1ea0 mm/pagewalk.c:379
>>>  walk_page_vma+0x277/0x350 mm/pagewalk.c:530
>>>  smap_gather_stats.part.0+0x148/0x260 fs/proc/task_mmu.c:768
>>>  smap_gather_stats fs/proc/task_mmu.c:741 [inline]
>>>  show_smap+0xc6/0x440 fs/proc/task_mmu.c:822
>>>  seq_read_iter+0xbb0/0x1240 fs/seq_file.c:272
>>>  seq_read+0x3e0/0x5b0 fs/seq_file.c:162
>>>  vfs_read+0x1b5/0x600 fs/read_write.c:479
>>>  ksys_read+0x12d/0x250 fs/read_write.c:619
>>>  do_syscall_x64 arch/x86/entry/common.c:50 [inline]
>>>  do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
>>>  entry_SYSCALL_64_after_hwframe+0x44/0xae
>>> RIP: 0033:0x7faa2af6c969
>>> Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 11 15 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48
>>> RSP: 002b:00007faa2aefd288 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
>>> RAX: ffffffffffffffda RBX: 00007faa2aff4418 RCX: 00007faa2af6c969
>>> RDX: 0000000000002025 RSI: 0000000020000100 RDI: 0000000000000003
>>> RBP: 00007faa2aff4410 R08: 00007faa2aefd700 R09: 0000000000000000
>>> R10: 00007faa2aefd700 R11: 0000000000000246 R12: 00007faa2afc20ac
>>> R13: 00007fff7e6632bf R14: 00007faa2aefd400 R15: 0000000000022000
>>>  </TASK>
>>> Modules linked in:
>>> ---[ end trace 24ec93ff95e4ac3d ]---
>>> RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline]
>>> RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744
>>> Code: e8 d3 16 d1 ff 48 c7 c6 c0 00 b6 89 48 89 ef e8 94 4e 04 00 0f 0b e8 bd 16 d1 ff 48 c7 c6 60 01 b6 89 48 89 ef e8 7e 4e 04 00 <0f> 0b e8 a7 16 d1 ff 48 c7 c6 a0 01 b6 89 4c 89 f7 e8 68 4e 04 00
>>> RSP: 0018:ffffc90002b6f7b8 EFLAGS: 00010293
>>> RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
>>> RDX: ffff888019619d00 RSI: ffffffff81a68c12 RDI: 0000000000000003
>>> RBP: ffffea0001bdc2c0 R08: 0000000000000029 R09: 00000000ffffffff
>>> R10: ffffffff8903e29f R11: 00000000ffffffff R12: 00000000ffffffff
>>> R13: 00000000ffffea00 R14: ffffc90002b6fb30 R15: ffffea0001bd8001
>>> FS:  00007faa2aefd700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000
>>> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>> CR2: 00007fff7e663318 CR3: 0000000018c6e000 CR4: 00000000003506e0
>>> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>>> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>>>
>>
>> Does this point at the bigger issue that reading the mapcount without
>> having the page locked is completely unstable?
> 
> (See also https://lore.kernel.org/all/CAG48ez0M=iwJu=Q8yUQHD-+eZDg6ZF8QCF86Sb=CN1petP=Y0Q@mail.gmail.com/
> for context.)

Thanks for the pointer.

> 
> I'm not sure what you mean by "unstable". Do you mean "the result is
> not guaranteed to still be valid when the call returns", "the result
> might not have ever been valid", or "the call might crash because the
> page's state as a compound page is unstable"?

A little bit of everything :)

> 
> In case you mean "the result is not guaranteed to still be valid when
> the call returns":
> We're just collecting stats for userspace, and by the time we return
> to userspace, the numbers will be outdated anyway, so that doesn't
> matter.

Yes, it's a snapshot.

> 
> In case you mean "the result might not have ever been valid":
> Yes, even with this patch applied, in theory concurrent THP splits
> could cause us to count some page mappings twice. Arguably that's not
> entirely correct.

Yes, the snapshot is not atomic and, thereby, unreliable. That's what I
mostly meant by "unstable".

> 
> In case you mean "the call might crash because the page's state as a
> compound page could concurrently change":

I think that's just a side-product of the snapshot not being "correct",
right?

> As long as we have our own mapping of the page, the page can't be
> split, so this patch fixes that problem.
Jann Horn Jan. 26, 2022, 11:48 a.m. UTC | #6
On Wed, Jan 26, 2022 at 12:38 PM David Hildenbrand <david@redhat.com> wrote:
> On 26.01.22 12:29, Jann Horn wrote:
> > On Wed, Jan 26, 2022 at 11:51 AM David Hildenbrand <david@redhat.com> wrote:
> >> On 20.01.22 21:28, Yang Shi wrote:
> >>> The syzbot reported the below BUG:
> >>>
> >>> kernel BUG at include/linux/page-flags.h:785!
[...]
> >>> RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline]
> >>> RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744
[...]
> >> Does this point at the bigger issue that reading the mapcount without
> >> having the page locked is completely unstable?
> >
> > (See also https://lore.kernel.org/all/CAG48ez0M=iwJu=Q8yUQHD-+eZDg6ZF8QCF86Sb=CN1petP=Y0Q@mail.gmail.com/
> > for context.)
>
> Thanks for the pointer.
>
> >
> > I'm not sure what you mean by "unstable". Do you mean "the result is
> > not guaranteed to still be valid when the call returns", "the result
> > might not have ever been valid", or "the call might crash because the
> > page's state as a compound page is unstable"?
>
> A little bit of everything :)
[...]
> > In case you mean "the result might not have ever been valid":
> > Yes, even with this patch applied, in theory concurrent THP splits
> > could cause us to count some page mappings twice. Arguably that's not
> > entirely correct.
>
> Yes, the snapshot is not atomic and, thereby, unreliable. That what I
> mostly meant as "unstable".
>
> >
> > In case you mean "the call might crash because the page's state as a
> > compound page could concurrently change":
>
> I think that's just a side-product of the snapshot not being "correct",
> right?

I guess you could see it that way? The way I look at it is that
page_mapcount() is designed to return a number that's at least as high
as the number of mappings (rarely higher due to races), and using
page_mapcount() on an unlocked page is legitimate if you're fine with
the rare double-counting of references. In my view, the problem here
is:

There are different types of references to "struct page" - some of
them allow you to call page_mapcount(), some don't. And in particular,
get_page() doesn't give you a reference that can be used with
page_mapcount(), but locking a (real, non-migration) PTE pointing to
the page does give you such a reference.

This concept of "different types of references" is the same as you
e.g. get with mmgrab() vs mmget() - they both give references to the
same object, but those references have different usage restrictions.
David Hildenbrand Jan. 26, 2022, 11:57 a.m. UTC | #7
On 26.01.22 12:48, Jann Horn wrote:
> On Wed, Jan 26, 2022 at 12:38 PM David Hildenbrand <david@redhat.com> wrote:
>> On 26.01.22 12:29, Jann Horn wrote:
>>> On Wed, Jan 26, 2022 at 11:51 AM David Hildenbrand <david@redhat.com> wrote:
>>>> On 20.01.22 21:28, Yang Shi wrote:
>>>>> The syzbot reported the below BUG:
>>>>>
>>>>> kernel BUG at include/linux/page-flags.h:785!
> [...]
>>>>> RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline]
>>>>> RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744
> [...]
>>>> Does this point at the bigger issue that reading the mapcount without
>>>> having the page locked is completely unstable?
>>>
>>> (See also https://lore.kernel.org/all/CAG48ez0M=iwJu=Q8yUQHD-+eZDg6ZF8QCF86Sb=CN1petP=Y0Q@mail.gmail.com/
>>> for context.)
>>
>> Thanks for the pointer.
>>
>>>
>>> I'm not sure what you mean by "unstable". Do you mean "the result is
>>> not guaranteed to still be valid when the call returns", "the result
>>> might not have ever been valid", or "the call might crash because the
>>> page's state as a compound page is unstable"?
>>
>> A little bit of everything :)
> [...]
>>> In case you mean "the result might not have ever been valid":
>>> Yes, even with this patch applied, in theory concurrent THP splits
>>> could cause us to count some page mappings twice. Arguably that's not
>>> entirely correct.
>>
>> Yes, the snapshot is not atomic and, thereby, unreliable. That what I
>> mostly meant as "unstable".
>>
>>>
>>> In case you mean "the call might crash because the page's state as a
>>> compound page could concurrently change":
>>
>> I think that's just a side-product of the snapshot not being "correct",
>> right?
> 
> I guess you could see it that way? The way I look at it is that
> page_mapcount() is designed to return a number that's at least as high
> as the number of mappings (rarely higher due to races), and using
> page_mapcount() on an unlocked page is legitimate if you're fine with
> the rare double-counting of references. In my view, the problem here
> is:
> 
> There are different types of references to "struct page" - some of
> them allow you to call page_mapcount(), some don't. And in particular,
> get_page() doesn't give you a reference that can be used with
> page_mapcount(), but locking a (real, non-migration) PTE pointing to
> the page does give you such a reference.

I assume the point is that as long as the page cannot be unmapped
because you block it from getting unmapped (PT lock), the compound page
cannot get split. As long as the page cannot get unmapped from that page
table you should have at least a mapcount of 1.

But yeah, using the mapcount of a page that is not even mapped
(migration entry) is clearly wrong.

To summarize: reading the mapcount on an unlocked page will easily
return a wrong result and the result should not be relied upon. Reading
the mapcount of a migration entry is dangerous and certainly wrong.
Yang Shi Jan. 26, 2022, 4:53 p.m. UTC | #8
On Wed, Jan 26, 2022 at 3:57 AM David Hildenbrand <david@redhat.com> wrote:
>
> On 26.01.22 12:48, Jann Horn wrote:
> > On Wed, Jan 26, 2022 at 12:38 PM David Hildenbrand <david@redhat.com> wrote:
> >> On 26.01.22 12:29, Jann Horn wrote:
> >>> On Wed, Jan 26, 2022 at 11:51 AM David Hildenbrand <david@redhat.com> wrote:
> >>>> On 20.01.22 21:28, Yang Shi wrote:
> >>>>> The syzbot reported the below BUG:
> >>>>>
> >>>>> kernel BUG at include/linux/page-flags.h:785!
> > [...]
> >>>>> RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline]
> >>>>> RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744
> > [...]
> >>>> Does this point at the bigger issue that reading the mapcount without
> >>>> having the page locked is completely unstable?
> >>>
> >>> (See also https://lore.kernel.org/all/CAG48ez0M=iwJu=Q8yUQHD-+eZDg6ZF8QCF86Sb=CN1petP=Y0Q@mail.gmail.com/
> >>> for context.)
> >>
> >> Thanks for the pointer.
> >>
> >>>
> >>> I'm not sure what you mean by "unstable". Do you mean "the result is
> >>> not guaranteed to still be valid when the call returns", "the result
> >>> might not have ever been valid", or "the call might crash because the
> >>> page's state as a compound page is unstable"?
> >>
> >> A little bit of everything :)
> > [...]
> >>> In case you mean "the result might not have ever been valid":
> >>> Yes, even with this patch applied, in theory concurrent THP splits
> >>> could cause us to count some page mappings twice. Arguably that's not
> >>> entirely correct.
> >>
> >> Yes, the snapshot is not atomic and, thereby, unreliable. That what I
> >> mostly meant as "unstable".
> >>
> >>>
> >>> In case you mean "the call might crash because the page's state as a
> >>> compound page could concurrently change":
> >>
> >> I think that's just a side-product of the snapshot not being "correct",
> >> right?
> >
> > I guess you could see it that way? The way I look at it is that
> > page_mapcount() is designed to return a number that's at least as high
> > as the number of mappings (rarely higher due to races), and using
> > page_mapcount() on an unlocked page is legitimate if you're fine with
> > the rare double-counting of references. In my view, the problem here
> > is:
> >
> > There are different types of references to "struct page" - some of
> > them allow you to call page_mapcount(), some don't. And in particular,
> > get_page() doesn't give you a reference that can be used with
> > page_mapcount(), but locking a (real, non-migration) PTE pointing to
> > the page does give you such a reference.
>
> I assume the point is that as long as the page cannot be unmapped
> because you block it from getting unmapped (PT lock), the compound page
> cannot get split. As long as the page cannot get unmapped from that page
> table you should have at least a mapcount of 1.

If you mean holding ptl could prevent THP from splitting, then it is
not true since you may be in the middle of THP split just exactly like
the race condition solved by this patch.

Only the page lock or an elevated page refcount can serialize against
THP split, AFAIK.

>
> But yeah, using the mapcount of a page that is not even mapped
> (migration entry) is clearly wrong.
>
> To summarize: reading the mapcount on an unlocked page will easily
> return a wrong result and the result should not be relied upon. reading
> the mapcount of a migration entry is dangerous and certainly wrong.

Depends on your usecase. Some just want to get a snapshot, just like
smaps, they don't care.

>
> --
> Thanks,
>
> David / dhildenb
>
David Hildenbrand Jan. 26, 2022, 4:58 p.m. UTC | #9
On 26.01.22 17:53, Yang Shi wrote:
> On Wed, Jan 26, 2022 at 3:57 AM David Hildenbrand <david@redhat.com> wrote:
>>
>> On 26.01.22 12:48, Jann Horn wrote:
>>> On Wed, Jan 26, 2022 at 12:38 PM David Hildenbrand <david@redhat.com> wrote:
>>>> On 26.01.22 12:29, Jann Horn wrote:
>>>>> On Wed, Jan 26, 2022 at 11:51 AM David Hildenbrand <david@redhat.com> wrote:
>>>>>> On 20.01.22 21:28, Yang Shi wrote:
>>>>>>> The syzbot reported the below BUG:
>>>>>>>
>>>>>>> kernel BUG at include/linux/page-flags.h:785!
>>> [...]
>>>>>>> RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline]
>>>>>>> RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744
>>> [...]
>>>>>> Does this point at the bigger issue that reading the mapcount without
>>>>>> having the page locked is completely unstable?
>>>>>
>>>>> (See also https://lore.kernel.org/all/CAG48ez0M=iwJu=Q8yUQHD-+eZDg6ZF8QCF86Sb=CN1petP=Y0Q@mail.gmail.com/
>>>>> for context.)
>>>>
>>>> Thanks for the pointer.
>>>>
>>>>>
>>>>> I'm not sure what you mean by "unstable". Do you mean "the result is
>>>>> not guaranteed to still be valid when the call returns", "the result
>>>>> might not have ever been valid", or "the call might crash because the
>>>>> page's state as a compound page is unstable"?
>>>>
>>>> A little bit of everything :)
>>> [...]
>>>>> In case you mean "the result might not have ever been valid":
>>>>> Yes, even with this patch applied, in theory concurrent THP splits
>>>>> could cause us to count some page mappings twice. Arguably that's not
>>>>> entirely correct.
>>>>
>>>> Yes, the snapshot is not atomic and, thereby, unreliable. That what I
>>>> mostly meant as "unstable".
>>>>
>>>>>
>>>>> In case you mean "the call might crash because the page's state as a
>>>>> compound page could concurrently change":
>>>>
>>>> I think that's just a side-product of the snapshot not being "correct",
>>>> right?
>>>
>>> I guess you could see it that way? The way I look at it is that
>>> page_mapcount() is designed to return a number that's at least as high
>>> as the number of mappings (rarely higher due to races), and using
>>> page_mapcount() on an unlocked page is legitimate if you're fine with
>>> the rare double-counting of references. In my view, the problem here
>>> is:
>>>
>>> There are different types of references to "struct page" - some of
>>> them allow you to call page_mapcount(), some don't. And in particular,
>>> get_page() doesn't give you a reference that can be used with
>>> page_mapcount(), but locking a (real, non-migration) PTE pointing to
>>> the page does give you such a reference.
>>
>> I assume the point is that as long as the page cannot be unmapped
>> because you block it from getting unmapped (PT lock), the compound page
>> cannot get split. As long as the page cannot get unmapped from that page
>> table you should have at least a mapcount of 1.
> 
> If you mean holding ptl could prevent THP from splitting, then it is
> not true since you may be in the middle of THP split just exactly like
> the race condition solved by this patch.

While you hold the PT lock and discover a mapped page, unmap_page()
cannot continue and unmap the page. That's what I meant "as long as the
page cannot be unmapped".

What doesn't work is if you hold the PT lock and discover a migration
entry, because then you're already past unmap_page(). That's the issue
you're fixing.

> 
> Just page lock or elevated page refcount could serialize against THP
> split AFAIK.
> 
>>
>> But yeah, using the mapcount of a page that is not even mapped
>> (migration entry) is clearly wrong.
>>
>> To summarize: reading the mapcount on an unlocked page will easily
>> return a wrong result and the result should not be relied upon. reading
>> the mapcount of a migration entry is dangerous and certainly wrong.
> 
> Depends on your usecase. Some just want to get a snapshot, just like
> smaps, they don't care.

Right, but as discussed, even the snapshot might be slightly wrong. That
might be just fine for smaps (and I would have enjoyed a comment in the
code stating that :) ).
Yang Shi Jan. 26, 2022, 6:43 p.m. UTC | #10
On Wed, Jan 26, 2022 at 8:58 AM David Hildenbrand <david@redhat.com> wrote:
>
> On 26.01.22 17:53, Yang Shi wrote:
> > On Wed, Jan 26, 2022 at 3:57 AM David Hildenbrand <david@redhat.com> wrote:
> >>
> >> On 26.01.22 12:48, Jann Horn wrote:
> >>> On Wed, Jan 26, 2022 at 12:38 PM David Hildenbrand <david@redhat.com> wrote:
> >>>> On 26.01.22 12:29, Jann Horn wrote:
> >>>>> On Wed, Jan 26, 2022 at 11:51 AM David Hildenbrand <david@redhat.com> wrote:
> >>>>>> On 20.01.22 21:28, Yang Shi wrote:
> >>>>>>> The syzbot reported the below BUG:
> >>>>>>>
> >>>>>>> kernel BUG at include/linux/page-flags.h:785!
> >>> [...]
> >>>>>>> RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline]
> >>>>>>> RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744
> >>> [...]
> >>>>>> Does this point at the bigger issue that reading the mapcount without
> >>>>>> having the page locked is completely unstable?
> >>>>>
> >>>>> (See also https://lore.kernel.org/all/CAG48ez0M=iwJu=Q8yUQHD-+eZDg6ZF8QCF86Sb=CN1petP=Y0Q@mail.gmail.com/
> >>>>> for context.)
> >>>>
> >>>> Thanks for the pointer.
> >>>>
> >>>>>
> >>>>> I'm not sure what you mean by "unstable". Do you mean "the result is
> >>>>> not guaranteed to still be valid when the call returns", "the result
> >>>>> might not have ever been valid", or "the call might crash because the
> >>>>> page's state as a compound page is unstable"?
> >>>>
> >>>> A little bit of everything :)
> >>> [...]
> >>>>> In case you mean "the result might not have ever been valid":
> >>>>> Yes, even with this patch applied, in theory concurrent THP splits
> >>>>> could cause us to count some page mappings twice. Arguably that's not
> >>>>> entirely correct.
> >>>>
> >>>> Yes, the snapshot is not atomic and, thereby, unreliable. That what I
> >>>> mostly meant as "unstable".
> >>>>
> >>>>>
> >>>>> In case you mean "the call might crash because the page's state as a
> >>>>> compound page could concurrently change":
> >>>>
> >>>> I think that's just a side-product of the snapshot not being "correct",
> >>>> right?
> >>>
> >>> I guess you could see it that way? The way I look at it is that
> >>> page_mapcount() is designed to return a number that's at least as high
> >>> as the number of mappings (rarely higher due to races), and using
> >>> page_mapcount() on an unlocked page is legitimate if you're fine with
> >>> the rare double-counting of references. In my view, the problem here
> >>> is:
> >>>
> >>> There are different types of references to "struct page" - some of
> >>> them allow you to call page_mapcount(), some don't. And in particular,
> >>> get_page() doesn't give you a reference that can be used with
> >>> page_mapcount(), but locking a (real, non-migration) PTE pointing to
> >>> the page does give you such a reference.
> >>
> >> I assume the point is that as long as the page cannot be unmapped
> >> because you block it from getting unmapped (PT lock), the compound page
> >> cannot get split. As long as the page cannot get unmapped from that page
> >> table you should have at least a mapcount of 1.
> >
> > If you mean holding ptl could prevent THP from splitting, then it is
> > not true since you may be in the middle of THP split just exactly like
> > the race condition solved by this patch.
>
> While you hold the PT lock and discover a mapped page, unmap_page()
> cannot continue and unmap the page. That's what I meant "as long as the
> page cannot be unmapped".
>
> What doesn't work is if you hold the PT lock and discover a migration
> entry, because then you're already past unmap_page(). That's the issue
> you're fixing.

Yeah, it means you lose the race :-(

>
> >
> > Just page lock or elevated page refcount could serialize against THP
> > split AFAIK.
> >
> >>
> >> But yeah, using the mapcount of a page that is not even mapped
> >> (migration entry) is clearly wrong.
> >>
> >> To summarize: reading the mapcount on an unlocked page will easily
> >> return a wrong result and the result should not be relied upon. reading
> >> the mapcount of a migration entry is dangerous and certainly wrong.
> >
> > Depends on your usecase. Some just want to get a snapshot, just like
> > smaps, they don't care.
>
> Right, but as discussed, even the snapshot might be slightly wrong. That
> might be just fine for smaps (and I would have enjoyed a comment in the
> code stating that :) ).

I think that is documented already, see Documentation/filesystems/proc.rst:

Note: reading /proc/PID/maps or /proc/PID/smaps is inherently racy (consistent
output can be achieved only in the single read call).

Of course, if the extra note is preferred in the code, I could try to
add some in a separate patch.

>
>
> --
> Thanks,
>
> David / dhildenb
>
David Hildenbrand Jan. 26, 2022, 6:54 p.m. UTC | #11
>>> Just page lock or elevated page refcount could serialize against THP
>>> split AFAIK.
>>>
>>>>
>>>> But yeah, using the mapcount of a page that is not even mapped
>>>> (migration entry) is clearly wrong.
>>>>
>>>> To summarize: reading the mapcount on an unlocked page will easily
>>>> return a wrong result and the result should not be relied upon. reading
>>>> the mapcount of a migration entry is dangerous and certainly wrong.
>>>
>>> Depends on your usecase. Some just want to get a snapshot, just like
>>> smaps, they don't care.
>>
>> Right, but as discussed, even the snapshot might be slightly wrong. That
>> might be just fine for smaps (and I would have enjoyed a comment in the
>> code stating that :) ).
> 
> I think that is documented already, see Documentation/filesystems/proc.rst:
> 
> Note: reading /proc/PID/maps or /proc/PID/smaps is inherently racy (consistent
> output can be achieved only in the single read call).

Right, but I think there is a difference between

* Atomic values that change immediately afterwards ("this value used to
  be true at one point in time")
* Values that are unstable because we cannot read them atomically ("this
  value never used to be true")

I'd assume with the documented race we actually talk about the first
point, but I might be just wrong.

> 
> Of course, if the extra note is preferred in the code, I could try to
> add some in a separate patch.

When staring at the (original) code I would have hoped to find something
like:

/*
 * We use page_mapcount() to get a snapshot of the mapcount. Without
 * holding the page lock this snapshot can be slightly wrong as we
 * cannot always read the mapcount atomically. As long as we hold the PT
 * lock, the page cannot get unmapped and it's safe to call
 * page_mapcount().
 */

With the addition of

"... For unmapped pages (e.g., migration entries) we cannot guarantee
that, so treat the mapcount as being 1."

But this is just my personal preference ... :) I do think the patch does
the right thing in regard to migration entries.
Yang Shi Jan. 27, 2022, 7:42 p.m. UTC | #12
On Wed, Jan 26, 2022 at 10:54 AM David Hildenbrand <david@redhat.com> wrote:
>
> >>> Just page lock or elevated page refcount could serialize against THP
> >>> split AFAIK.
> >>>
> >>>>
> >>>> But yeah, using the mapcount of a page that is not even mapped
> >>>> (migration entry) is clearly wrong.
> >>>>
> >>>> To summarize: reading the mapcount on an unlocked page will easily
> >>>> return a wrong result and the result should not be relied upon. reading
> >>>> the mapcount of a migration entry is dangerous and certainly wrong.
> >>>
> >>> Depends on your usecase. Some just want to get a snapshot, just like
> >>> smaps, they don't care.
> >>
> >> Right, but as discussed, even the snapshot might be slightly wrong. That
> >> might be just fine for smaps (and I would have enjoyed a comment in the
> >> code stating that :) ).
> >
> > I think that is documented already, see Documentation/filesystems/proc.rst:
> >
> > Note: reading /proc/PID/maps or /proc/PID/smaps is inherently racy (consistent
> > output can be achieved only in the single read call).
>
> Right, but I think there is a difference between
>
> * Atomic values that change immediately afterwards ("this value used to
>   be true at one point in time")
> * Values that are unstable because we cannot read them atomically ("this
>   value never used to be true")
>
> I'd assume with the documented race we actually talk about the first
> point, but I might be just wrong.

I think so too.

>
> >
> > Of course, if the extra note is preferred in the code, I could try to
> > add some in a separate patch.
>
> When staring at the (original) code I would have hoped to find something
> like:
>
> /*
>  * We use page_mapcount() to get a snapshot of the mapcount. Without
>  * holding the page lock this snapshot can be slightly wrong as we
>  * cannot always read the mapcount atomically. As long we hold the PT
>  * lock, the page cannot get unmapped and it's at safe to call
>  * page_mapcount().
>  */
>
> With the addition of
>
> "... For unmapped pages (e.g., migration entries) we cannot guarantee
> that, so treat the mapcount as being 1."
>
> But this is just my personal preference ... :) I do think the patch does
> the right thing in regard to migration entries.

I will prepare a patch.

>
> --
> Thanks,
>
> David / dhildenb
>
Yang Shi Jan. 27, 2022, 8:51 p.m. UTC | #13
On Wed, Jan 26, 2022 at 3:29 AM Jann Horn <jannh@google.com> wrote:
>
> On Wed, Jan 26, 2022 at 11:51 AM David Hildenbrand <david@redhat.com> wrote:
> > On 20.01.22 21:28, Yang Shi wrote:
> > > The syzbot reported the below BUG:
> > >
> > > kernel BUG at include/linux/page-flags.h:785!
> > > invalid opcode: 0000 [#1] PREEMPT SMP KASAN
> > > CPU: 1 PID: 4392 Comm: syz-executor560 Not tainted 5.16.0-rc6-syzkaller #0
> > > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
> > > RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline]
> > > RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744
> > > Code: e8 d3 16 d1 ff 48 c7 c6 c0 00 b6 89 48 89 ef e8 94 4e 04 00 0f 0b e8 bd 16 d1 ff 48 c7 c6 60 01 b6 89 48 89 ef e8 7e 4e 04 00 <0f> 0b e8 a7 16 d1 ff 48 c7 c6 a0 01 b6 89 4c 89 f7 e8 68 4e 04 00
> > > RSP: 0018:ffffc90002b6f7b8 EFLAGS: 00010293
> > > RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
> > > RDX: ffff888019619d00 RSI: ffffffff81a68c12 RDI: 0000000000000003
> > > RBP: ffffea0001bdc2c0 R08: 0000000000000029 R09: 00000000ffffffff
> > > R10: ffffffff8903e29f R11: 00000000ffffffff R12: 00000000ffffffff
> > > R13: 00000000ffffea00 R14: ffffc90002b6fb30 R15: ffffea0001bd8001
> > > FS:  00007faa2aefd700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000
> > > CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > > CR2: 00007fff7e663318 CR3: 0000000018c6e000 CR4: 00000000003506e0
> > > DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> > > DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> > > Call Trace:
> > >  <TASK>
> > >  page_mapcount include/linux/mm.h:837 [inline]
> > >  smaps_account+0x470/0xb10 fs/proc/task_mmu.c:466
> > >  smaps_pte_entry fs/proc/task_mmu.c:538 [inline]
> > >  smaps_pte_range+0x611/0x1250 fs/proc/task_mmu.c:601
> > >  walk_pmd_range mm/pagewalk.c:128 [inline]
> > >  walk_pud_range mm/pagewalk.c:205 [inline]
> > >  walk_p4d_range mm/pagewalk.c:240 [inline]
> > >  walk_pgd_range mm/pagewalk.c:277 [inline]
> > >  __walk_page_range+0xe23/0x1ea0 mm/pagewalk.c:379
> > >  walk_page_vma+0x277/0x350 mm/pagewalk.c:530
> > >  smap_gather_stats.part.0+0x148/0x260 fs/proc/task_mmu.c:768
> > >  smap_gather_stats fs/proc/task_mmu.c:741 [inline]
> > >  show_smap+0xc6/0x440 fs/proc/task_mmu.c:822
> > >  seq_read_iter+0xbb0/0x1240 fs/seq_file.c:272
> > >  seq_read+0x3e0/0x5b0 fs/seq_file.c:162
> > >  vfs_read+0x1b5/0x600 fs/read_write.c:479
> > >  ksys_read+0x12d/0x250 fs/read_write.c:619
> > >  do_syscall_x64 arch/x86/entry/common.c:50 [inline]
> > >  do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
> > >  entry_SYSCALL_64_after_hwframe+0x44/0xae
> > > RIP: 0033:0x7faa2af6c969
> > > Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 11 15 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48
> > > RSP: 002b:00007faa2aefd288 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
> > > RAX: ffffffffffffffda RBX: 00007faa2aff4418 RCX: 00007faa2af6c969
> > > RDX: 0000000000002025 RSI: 0000000020000100 RDI: 0000000000000003
> > > RBP: 00007faa2aff4410 R08: 00007faa2aefd700 R09: 0000000000000000
> > > R10: 00007faa2aefd700 R11: 0000000000000246 R12: 00007faa2afc20ac
> > > R13: 00007fff7e6632bf R14: 00007faa2aefd400 R15: 0000000000022000
> > >  </TASK>
> > > Modules linked in:
> > > ---[ end trace 24ec93ff95e4ac3d ]---
> > > RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline]
> > > RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744
> > > Code: e8 d3 16 d1 ff 48 c7 c6 c0 00 b6 89 48 89 ef e8 94 4e 04 00 0f 0b e8 bd 16 d1 ff 48 c7 c6 60 01 b6 89 48 89 ef e8 7e 4e 04 00 <0f> 0b e8 a7 16 d1 ff 48 c7 c6 a0 01 b6 89 4c 89 f7 e8 68 4e 04 00
> > > RSP: 0018:ffffc90002b6f7b8 EFLAGS: 00010293
> > > RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
> > > RDX: ffff888019619d00 RSI: ffffffff81a68c12 RDI: 0000000000000003
> > > RBP: ffffea0001bdc2c0 R08: 0000000000000029 R09: 00000000ffffffff
> > > R10: ffffffff8903e29f R11: 00000000ffffffff R12: 00000000ffffffff
> > > R13: 00000000ffffea00 R14: ffffc90002b6fb30 R15: ffffea0001bd8001
> > > FS:  00007faa2aefd700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000
> > > CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > > CR2: 00007fff7e663318 CR3: 0000000018c6e000 CR4: 00000000003506e0
> > > DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> > > DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> > >
> >
> > Does this point at the bigger issue that reading the mapcount without
> > having the page locked is completely unstable?
>
> (See also https://lore.kernel.org/all/CAG48ez0M=iwJu=Q8yUQHD-+eZDg6ZF8QCF86Sb=CN1petP=Y0Q@mail.gmail.com/
> for context.)

Thanks for the link. I saw you mentioned that the pagemap code may have
the same issue. From inspecting the code I think so too, even though I
haven't seen any bug report. I think it could be fixed the same way, by
special-casing the migration entry instead of calling
page_mapcount().

>
> I'm not sure what you mean by "unstable". Do you mean "the result is
> not guaranteed to still be valid when the call returns", "the result
> might not have ever been valid", or "the call might crash because the
> page's state as a compound page is unstable"?
>
> In case you mean "the result is not guaranteed to still be valid when
> the call returns":
> We're just collecting stats for userspace, and by the time we return
> to userspace, the numbers will be outdated anyway, so that doesn't
> matter.
>
> In case you mean "the result might not have ever been valid":
> Yes, even with this patch applied, in theory concurrent THP splits
> could cause us to count some page mappings twice. Arguably that's not
> entirely correct.
>
> In case you mean "the call might crash because the page's state as a
> compound page could concurrently change":
> As long as we have our own mapping of the page, the page can't be
> split, so this patch fixes that problem.
Yang Shi Jan. 27, 2022, 9:16 p.m. UTC | #14
On Wed, Jan 26, 2022 at 10:54 AM David Hildenbrand <david@redhat.com> wrote:
>
> >>> Just page lock or elevated page refcount could serialize against THP
> >>> split AFAIK.
> >>>
> >>>>
> >>>> But yeah, using the mapcount of a page that is not even mapped
> >>>> (migration entry) is clearly wrong.
> >>>>
> >>>> To summarize: reading the mapcount on an unlocked page will easily
> >>>> return a wrong result and the result should not be relied upon. reading
> >>>> the mapcount of a migration entry is dangerous and certainly wrong.
> >>>
> >>> Depends on your usecase. Some just want to get a snapshot, just like
> >>> smaps, they don't care.
> >>
> >> Right, but as discussed, even the snapshot might be slightly wrong. That
> >> might be just fine for smaps (and I would have enjoyed a comment in the
> >> code stating that :) ).
> >
> > I think that is documented already, see Documentation/filesystems/proc.rst:
> >
> > Note: reading /proc/PID/maps or /proc/PID/smaps is inherently racy (consistent
> > output can be achieved only in the single read call).
>
> Right, but I think there is a difference between
>
> * Atomic values that change immediately afterwards ("this value used to
>   be true at one point in time")
> * Values that are unstable because we cannot read them atomically ("this
>   value never used to be true")
>
> I'd assume with the documented race we actually talk about the first
> point, but I might be just wrong.
>
> >
> > Of course, if the extra note is preferred in the code, I could try to
> > add some in a separate patch.
>
> When staring at the (original) code I would have hoped to find something
> like:
>
> /*
>  * We use page_mapcount() to get a snapshot of the mapcount. Without
>  * holding the page lock this snapshot can be slightly wrong as we
>  * cannot always read the mapcount atomically. As long we hold the PT
>  * lock, the page cannot get unmapped and it's at safe to call
>  * page_mapcount().
>  */
>
> With the addition of
>
> "... For unmapped pages (e.g., migration entries) we cannot guarantee
> that, so treat the mapcount as being 1."

It seems a little bit confusing to me, it is not safe to call with PTL
held either, right? I'd like to rephrase the note to:

/*
 * page_mapcount() is called to get a snapshot of the mapcount.
 * Without holding the page lock this snapshot can be slightly wrong as
 * we cannot always read the mapcount atomically.  Holding PTL doesn't
 * guarantee calling page_mapcount() is safe for all cases either, for
 * example, migration entries.
 */

>
> But this is just my personal preference ... :) I do think the patch does
> the right thing in regard to migration entries.
>
> --
> Thanks,
>
> David / dhildenb
>
David Hildenbrand Jan. 28, 2022, 8:02 a.m. UTC | #15
On 27.01.22 22:16, Yang Shi wrote:
> On Wed, Jan 26, 2022 at 10:54 AM David Hildenbrand <david@redhat.com> wrote:
>>
>>>>> Just page lock or elevated page refcount could serialize against THP
>>>>> split AFAIK.
>>>>>
>>>>>>
>>>>>> But yeah, using the mapcount of a page that is not even mapped
>>>>>> (migration entry) is clearly wrong.
>>>>>>
>>>>>> To summarize: reading the mapcount on an unlocked page will easily
>>>>>> return a wrong result and the result should not be relied upon. reading
>>>>>> the mapcount of a migration entry is dangerous and certainly wrong.
>>>>>
>>>>> Depends on your usecase. Some just want to get a snapshot, just like
>>>>> smaps, they don't care.
>>>>
>>>> Right, but as discussed, even the snapshot might be slightly wrong. That
>>>> might be just fine for smaps (and I would have enjoyed a comment in the
>>>> code stating that :) ).
>>>
>>> I think that is documented already, see Documentation/filesystems/proc.rst:
>>>
>>> Note: reading /proc/PID/maps or /proc/PID/smaps is inherently racy (consistent
>>> output can be achieved only in the single read call).
>>
>> Right, but I think there is a difference between
>>
>> * Atomic values that change immediately afterwards ("this value used to
>>   be true at one point in time")
>> * Values that are unstable because we cannot read them atomically ("this
>>   value never used to be true")
>>
>> I'd assume with the documented race we actually talk about the first
>> point, but I might be just wrong.
>>
>>>
>>> Of course, if the extra note is preferred in the code, I could try to
>>> add some in a separate patch.
>>
>> When staring at the (original) code I would have hoped to find something
>> like:
>>
>> /*
>>  * We use page_mapcount() to get a snapshot of the mapcount. Without
>>  * holding the page lock this snapshot can be slightly wrong as we
>>  * cannot always read the mapcount atomically. As long as we hold the PT
>>  * lock, the page cannot get unmapped and it's safe to call
>>  * page_mapcount().
>>  */
>>
>> With the addition of
>>
>> "... For unmapped pages (e.g., migration entries) we cannot guarantee
>> that, so treat the mapcount as being 1."
> 
> It seems a little bit confusing to me, it is not safe to call with PTL
> held either, right? I'd like to rephrase the note to:

The implication that could have been spelled out is that only a mapped
page can get unmapped. (I know, there are some weird migration entries
nowadays ...)

/*
 * We use page_mapcount() to get a snapshot of the mapcount. Without
 * holding the page lock this snapshot can be slightly wrong as we
 * cannot always read the mapcount atomically. As long as we hold the PT
 * lock, a mapped page cannot get unmapped and it's safe to call
 * page_mapcount(). Especially for migration entries, it's not safe to
 * call page_mapcount(), so we treat the mapcount as being 1.
 */
Yang Shi Jan. 28, 2022, 4:53 p.m. UTC | #16
On Fri, Jan 28, 2022 at 12:02 AM David Hildenbrand <david@redhat.com> wrote:
>
> On 27.01.22 22:16, Yang Shi wrote:
> > On Wed, Jan 26, 2022 at 10:54 AM David Hildenbrand <david@redhat.com> wrote:
> >>
> >>>>> Just page lock or elevated page refcount could serialize against THP
> >>>>> split AFAIK.
> >>>>>
> >>>>>>
> >>>>>> But yeah, using the mapcount of a page that is not even mapped
> >>>>>> (migration entry) is clearly wrong.
> >>>>>>
> >>>>>> To summarize: reading the mapcount on an unlocked page will easily
> >>>>>> return a wrong result and the result should not be relied upon. reading
> >>>>>> the mapcount of a migration entry is dangerous and certainly wrong.
> >>>>>
> >>>>> Depends on your usecase. Some just want to get a snapshot, just like
> >>>>> smaps, they don't care.
> >>>>
> >>>> Right, but as discussed, even the snapshot might be slightly wrong. That
> >>>> might be just fine for smaps (and I would have enjoyed a comment in the
> >>>> code stating that :) ).
> >>>
> >>> I think that is documented already, see Documentation/filesystems/proc.rst:
> >>>
> >>> Note: reading /proc/PID/maps or /proc/PID/smaps is inherently racy (consistent
> >>> output can be achieved only in the single read call).
> >>
> >> Right, but I think there is a difference between
> >>
> >> * Atomic values that change immediately afterwards ("this value used to
> >>   be true at one point in time")
> >> * Values that are unstable because we cannot read them atomically ("this
> >>   value never used to be true")
> >>
> >> I'd assume with the documented race we actually talk about the first
> >> point, but I might be just wrong.
> >>
> >>>
> >>> Of course, if the extra note is preferred in the code, I could try to
> >>> add some in a separate patch.
> >>
> >> When staring at the (original) code I would have hoped to find something
> >> like:
> >>
> >> /*
> >>  * We use page_mapcount() to get a snapshot of the mapcount. Without
> >>  * holding the page lock this snapshot can be slightly wrong as we
> >>  * cannot always read the mapcount atomically. As long as we hold the PT
> >>  * lock, the page cannot get unmapped and it's safe to call
> >>  * page_mapcount().
> >>  */
> >>
> >> With the addition of
> >>
> >> "... For unmapped pages (e.g., migration entries) we cannot guarantee
> >> that, so treat the mapcount as being 1."
> >
> > It seems a little bit confusing to me, it is not safe to call with PTL
> > held either, right? I'd like to rephrase the note to:
>
> The implication that could have been spelled out is that only a mapped
> page can get unmapped. (I know, there are some weird migration entries
> nowadays ...)

Yes, I see your point. I just felt that "only a mapped page can get
unmapped" doesn't sound that straightforward (just my personal feeling).
How about "It is not safe to call page_mapcount() even with PTL held if
the page is not mapped, especially for migration entries"?

>
> /*
>  * We use page_mapcount() to get a snapshot of the mapcount. Without
>  * holding the page lock this snapshot can be slightly wrong as we
>  * cannot always read the mapcount atomically. As long as we hold the PT
>  * lock, a mapped page cannot get unmapped and it's safe to call
>  * page_mapcount(). Especially for migration entries, it's not safe to
>  * call page_mapcount(), so we treat the mapcount as being 1.
>  */
>
>
>
> --
> Thanks,
>
> David / dhildenb
>
diff mbox series

Patch

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 18f8c3acbb85..2bb567014d77 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -440,7 +440,8 @@  static void smaps_page_accumulate(struct mem_size_stats *mss,
 }
 
 static void smaps_account(struct mem_size_stats *mss, struct page *page,
-		bool compound, bool young, bool dirty, bool locked)
+		bool compound, bool young, bool dirty, bool locked,
+		bool migration)
 {
 	int i, nr = compound ? compound_nr(page) : 1;
 	unsigned long size = nr * PAGE_SIZE;
@@ -467,8 +468,12 @@  static void smaps_account(struct mem_size_stats *mss, struct page *page,
 	 * page_count(page) == 1 guarantees the page is mapped exactly once.
 	 * If any subpage of the compound page mapped with PTE it would elevate
 	 * page_count().
+	 *
+	 * Treat regular migration entries as mapcount == 1 without reading
+	 * mapcount since calling page_mapcount() for migration entries is
+	 * racy against THP splitting.
 	 */
-	if (page_count(page) == 1) {
+	if ((page_count(page) == 1) || migration) {
 		smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty,
 			locked, true);
 		return;
@@ -517,6 +522,7 @@  static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 	struct vm_area_struct *vma = walk->vma;
 	bool locked = !!(vma->vm_flags & VM_LOCKED);
 	struct page *page = NULL;
+	bool migration = false;
 
 	if (pte_present(*pte)) {
 		page = vm_normal_page(vma, addr, *pte);
@@ -536,8 +542,11 @@  static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 			} else {
 				mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
 			}
-		} else if (is_pfn_swap_entry(swpent))
+		} else if (is_pfn_swap_entry(swpent)) {
+			if (is_migration_entry(swpent))
+				migration = true;
 			page = pfn_swap_entry_to_page(swpent);
+		}
 	} else {
 		smaps_pte_hole_lookup(addr, walk);
 		return;
@@ -546,7 +555,8 @@  static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 	if (!page)
 		return;
 
-	smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked);
+	smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte),
+		      locked, migration);
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -557,6 +567,7 @@  static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
 	struct vm_area_struct *vma = walk->vma;
 	bool locked = !!(vma->vm_flags & VM_LOCKED);
 	struct page *page = NULL;
+	bool migration = false;
 
 	if (pmd_present(*pmd)) {
 		/* FOLL_DUMP will return -EFAULT on huge zero page */
@@ -564,8 +575,10 @@  static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
 	} else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
 		swp_entry_t entry = pmd_to_swp_entry(*pmd);
 
-		if (is_migration_entry(entry))
+		if (is_migration_entry(entry)) {
+			migration = true;
 			page = pfn_swap_entry_to_page(entry);
+		}
 	}
 	if (IS_ERR_OR_NULL(page))
 		return;
@@ -577,7 +590,9 @@  static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
 		/* pass */;
 	else
 		mss->file_thp += HPAGE_PMD_SIZE;
-	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked);
+
+	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
+		      locked, migration);
 }
 #else
 static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,