
[v3,1/6] mm: free zapped tail pages when splitting isolated thp

Message ID 20240813120328.1275952-2-usamaarif642@gmail.com (mailing list archive)
State New
Series mm: split underutilized THPs

Commit Message

Usama Arif Aug. 13, 2024, 12:02 p.m. UTC
From: Yu Zhao <yuzhao@google.com>

If a tail page has only two references left, one inherited from the
isolation of its head and the other from lru_add_page_tail() which we
are about to drop, it means this tail page was concurrently zapped.
Then we can safely free it and save page reclaim or migration the
trouble of trying it.

Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Shuang Zhai <zhais@google.com>
Signed-off-by: Usama Arif <usamaarif642@gmail.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
---
 mm/huge_memory.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

Comments

Kairui Song Aug. 15, 2024, 6:47 p.m. UTC | #1
On Tue, Aug 13, 2024 at 8:03 PM Usama Arif <usamaarif642@gmail.com> wrote:
>
> From: Yu Zhao <yuzhao@google.com>
>
> If a tail page has only two references left, one inherited from the
> isolation of its head and the other from lru_add_page_tail() which we
> are about to drop, it means this tail page was concurrently zapped.
> Then we can safely free it and save page reclaim or migration the
> trouble of trying it.
>
> Signed-off-by: Yu Zhao <yuzhao@google.com>
> Tested-by: Shuang Zhai <zhais@google.com>
> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> Acked-by: Johannes Weiner <hannes@cmpxchg.org>
> ---
>  mm/huge_memory.c | 27 +++++++++++++++++++++++++++
>  1 file changed, 27 insertions(+)

Hi, Usama, Yu

This commit causes the kernel to panic very quickly in a kernel build
test on top of tmpfs with all mTHP sizes enabled; the panic output follows:

[  207.147705] BUG: Bad page state in process tar  pfn:14ae70
[  207.149376] page: refcount:3 mapcount:2 mapping:0000000000000000
index:0x562d23b70 pfn:0x14ae70
[  207.151750] flags:
0x17ffffc0020019(locked|uptodate|dirty|swapbacked|node=0|zone=2|lastcpupid=0x1fffff)
[  207.154325] raw: 0017ffffc0020019 dead000000000100 dead000000000122
0000000000000000
[  207.156442] raw: 0000000562d23b70 0000000000000000 0000000300000001
0000000000000000
[  207.158561] page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set
[  207.160325] Modules linked in:
[  207.161194] CPU: 22 UID: 0 PID: 2650 Comm: tar Not tainted
6.11.0-rc3.ptch+ #136
[  207.163198] Hardware name: Red Hat KVM/RHEL-AV, BIOS 0.0.0 02/06/2015
[  207.164946] Call Trace:
[  207.165636]  <TASK>
[  207.166226]  dump_stack_lvl+0x53/0x70
[  207.167241]  bad_page+0x70/0x120
[  207.168131]  free_page_is_bad+0x5f/0x70
[  207.169193]  free_unref_folios+0x3a5/0x620
[  207.170320]  ? __mem_cgroup_uncharge_folios+0x7e/0xa0
[  207.171705]  __split_huge_page+0xb02/0xcf0
[  207.172839]  ? smp_call_function_many_cond+0x105/0x4b0
[  207.174250]  ? __pfx_flush_tlb_func+0x10/0x10
[  207.175410]  ? on_each_cpu_cond_mask+0x29/0x50
[  207.176603]  split_huge_page_to_list_to_order+0x857/0x9b0
[  207.178052]  shrink_folio_list+0x4e1/0x1200
[  207.179198]  evict_folios+0x468/0xab0
[  207.180202]  try_to_shrink_lruvec+0x1f3/0x280
[  207.181394]  shrink_lruvec+0x89/0x780
[  207.182398]  ? mem_cgroup_iter+0x66/0x290
[  207.183488]  shrink_node+0x243/0xb00
[  207.184474]  do_try_to_free_pages+0xbd/0x4e0
[  207.185621]  try_to_free_mem_cgroup_pages+0x107/0x230
[  207.186994]  try_charge_memcg+0x184/0x5d0
[  207.188092]  charge_memcg+0x3a/0x60
[  207.189046]  __mem_cgroup_charge+0x2c/0x80
[  207.190162]  shmem_alloc_and_add_folio+0x1a3/0x470
[  207.191469]  shmem_get_folio_gfp+0x24a/0x670
[  207.192635]  shmem_write_begin+0x56/0xd0
[  207.193703]  generic_perform_write+0x140/0x330
[  207.194919]  shmem_file_write_iter+0x89/0x90
[  207.196082]  vfs_write+0x2f3/0x420
[  207.197019]  ksys_write+0x5d/0xd0
[  207.197914]  do_syscall_64+0x47/0x110
[  207.198915]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  207.200293] RIP: 0033:0x7f2e6099c784
[  207.201278] Code: c7 00 16 00 00 00 b8 ff ff ff ff c3 66 2e 0f 1f
84 00 00 00 00 00 f3 0f 1e fa 80 3d c5 08 0e 00 00 74 13 b8 01 00 00
00 0f 05 <48> 3d 00 f0 ff ff 77 54 c3 0f 1f 00 55 48 89 e5 48 83 ec 20 48 89
[  207.206280] RSP: 002b:00007ffdb1a0e7d8 EFLAGS: 00000202 ORIG_RAX:
0000000000000001
[  207.208312] RAX: ffffffffffffffda RBX: 00000000000005e7 RCX: 00007f2e6099c784
[  207.210225] RDX: 00000000000005e7 RSI: 0000562d23b77000 RDI: 0000000000000004
[  207.212145] RBP: 00007ffdb1a0e820 R08: 00000000000005e7 R09: 0000000000000007
[  207.214064] R10: 0000000000000180 R11: 0000000000000202 R12: 0000562d23b77000
[  207.215974] R13: 0000000000000004 R14: 00000000000005e7 R15: 0000000000000000
[  207.217888]  </TASK>

The test is done using ZRAM as swap and a 1G memcg, running:
cd /mnt/tmpfs
time tar zxf "$linux_src"
make -j64 clean
make defconfig
/usr/bin/time make -j64

>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 04ee8abd6475..85a424e954be 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3059,7 +3059,9 @@ static void __split_huge_page(struct page *page, struct list_head *list,
>         unsigned int new_nr = 1 << new_order;
>         int order = folio_order(folio);
>         unsigned int nr = 1 << order;
> +       struct folio_batch free_folios;
>
> +       folio_batch_init(&free_folios);
>         /* complete memcg works before add pages to LRU */
>         split_page_memcg(head, order, new_order);
>
> @@ -3143,6 +3145,26 @@ static void __split_huge_page(struct page *page, struct list_head *list,
>                 if (subpage == page)
>                         continue;
>                 folio_unlock(new_folio);
> +               /*
> +                * If a folio has only two references left, one inherited
> +                * from the isolation of its head and the other from
> +                * lru_add_page_tail() which we are about to drop, it means this
> +                * folio was concurrently zapped. Then we can safely free it
> +                * and save page reclaim or migration the trouble of trying it.
> +                */
> +               if (list && folio_ref_freeze(new_folio, 2)) {
> +                       VM_WARN_ON_ONCE_FOLIO(folio_test_lru(new_folio), new_folio);
> +                       VM_WARN_ON_ONCE_FOLIO(folio_test_large(new_folio), new_folio);
> +                       VM_WARN_ON_ONCE_FOLIO(folio_mapped(new_folio), new_folio);
> +
> +                       folio_clear_active(new_folio);
> +                       folio_clear_unevictable(new_folio);
> +                       if (!folio_batch_add(&free_folios, folio)) {
> +                               mem_cgroup_uncharge_folios(&free_folios);
> +                               free_unref_folios(&free_folios);
> +                       }
> +                       continue;
> +               }
>
>                 /*
>                  * Subpages may be freed if there wasn't any mapping
> @@ -3153,6 +3175,11 @@ static void __split_huge_page(struct page *page, struct list_head *list,
>                  */
>                 free_page_and_swap_cache(subpage);
>         }
> +
> +       if (free_folios.nr) {
> +               mem_cgroup_uncharge_folios(&free_folios);
> +               free_unref_folios(&free_folios);
> +       }
>  }
>
>  /* Racy check whether the huge page can be split */
> --
> 2.43.5
>
>
Usama Arif Aug. 15, 2024, 7:16 p.m. UTC | #2
On 15/08/2024 19:47, Kairui Song wrote:
> On Tue, Aug 13, 2024 at 8:03 PM Usama Arif <usamaarif642@gmail.com> wrote:
>> [...]
> 
> Hi, Usama, Yu
> 
> This commit causes the kernel to panic very quickly in a kernel build
> test on top of tmpfs with all mTHP sizes enabled; the panic output follows:
> 

Hi,

Thanks for pointing this out. It is a very silly bug I introduced when converting the patch from the page version in v1 to the folio version in v3.

Applying the change below on top of this patch fixes it:

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 907813102430..a6ca454e1168 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3183,7 +3183,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 
                        folio_clear_active(new_folio);
                        folio_clear_unevictable(new_folio);
-                       if (!folio_batch_add(&free_folios, folio)) {
+                       if (!folio_batch_add(&free_folios, new_folio)) {
                                mem_cgroup_uncharge_folios(&free_folios);
                                free_unref_folios(&free_folios);
                        }


I will include it in the next revision.
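
For what it's worth, my reading of the splat above (an inference from the
report, not verified separately): batching the head `folio` meant the
still-locked, still-mapped head got handed to free_unref_folios() whenever a
zapped tail was found, which is exactly the kind of page
PAGE_FLAGS_CHECK_AT_FREE complains about. With the fix, the branch frees the
tail it just froze:

			folio_clear_active(new_folio);
			folio_clear_unevictable(new_folio);
			/* batch the zapped tail itself, not the head folio */
			if (!folio_batch_add(&free_folios, new_folio)) {
				mem_cgroup_uncharge_folios(&free_folios);
				free_unref_folios(&free_folios);
			}
			continue;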

> [...]
Kairui Song Aug. 16, 2024, 4:55 p.m. UTC | #3
On Fri, Aug 16, 2024 at 3:16 AM Usama Arif <usamaarif642@gmail.com> wrote:
> On 15/08/2024 19:47, Kairui Song wrote:
> > [...]
>
> Hi,
>
> Thanks for pointing this out. It is a very silly bug I introduced when converting the patch from the page version in v1 to the folio version in v3.
>
> Applying the change below on top of this patch fixes it:
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 907813102430..a6ca454e1168 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3183,7 +3183,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
>
>                         folio_clear_active(new_folio);
>                         folio_clear_unevictable(new_folio);
> -                       if (!folio_batch_add(&free_folios, folio)) {
> +                       if (!folio_batch_add(&free_folios, new_folio)) {
>                                 mem_cgroup_uncharge_folios(&free_folios);
>                                 free_unref_folios(&free_folios);
>                         }
>
>
> I will include it in the next revision.
>

Hi,

After the fix, I'm still seeing the panic below:
[   24.926629] list_del corruption. prev->next should be
ffffea000491cf88, but was ffffea0006207708. (prev=ffffea000491cfc8)
[   24.930783] ------------[ cut here ]------------
[   24.932519] kernel BUG at lib/list_debug.c:64!
[   24.934325] Oops: invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
[   24.936339] CPU: 32 UID: 0 PID: 2112 Comm: gzip Not tainted
6.11.0-rc3.ptch+ #147
[   24.938575] Hardware name: Red Hat KVM/RHEL-AV, BIOS 0.0.0 02/06/2015
[   24.940680] RIP: 0010:__list_del_entry_valid_or_report+0xaa/0xc0
[   24.942536] Code: 8c ff 0f 0b 48 89 fe 48 c7 c7 f8 9d 51 82 e8 9d
36 8c ff 0f 0b 48 89 d1 48 89 f2 48 89 fe 48 c7 c7 30 9e 51 82 e8 86
36 8c ff <0f> 0b 48 c7 c7 80 9e 51 82 e8 78 36 8c ff 0f 0b 66 0f 1f 44
00 00
[   24.948418] RSP: 0018:ffffc90005c2b770 EFLAGS: 00010246
[   24.949996] RAX: 000000000000006d RBX: ffffea000491cf88 RCX: 0000000000000000
[   24.952293] RDX: 0000000000000000 RSI: ffff889ffee1c180 RDI: ffff889ffee1c180
[   24.954616] RBP: ffffea000491cf80 R08: 0000000000000000 R09: c0000000ffff7fff
[   24.956908] R10: 0000000000000001 R11: ffffc90005c2b5a8 R12: ffffc90005c2b954
[   24.959253] R13: ffffc90005c2bbc0 R14: ffffc90005c2b7c0 R15: ffffc90005c2b940
[   24.961410] FS:  00007fe5a235e740(0000) GS:ffff889ffee00000(0000)
knlGS:0000000000000000
[   24.963587] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   24.965112] CR2: 00007fe5a24ddcd0 CR3: 000000010cb40001 CR4: 0000000000770eb0
[   24.967037] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   24.968933] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   24.970802] PKRU: 55555554
[   24.971559] Call Trace:
[   24.972241]  <TASK>
[   24.972805]  ? __die_body+0x1e/0x60
[   24.973756]  ? die+0x3c/0x60
[   24.974450]  ? do_trap+0xe8/0x110
[   24.975235]  ? __list_del_entry_valid_or_report+0xaa/0xc0
[   24.976543]  ? do_error_trap+0x65/0x80
[   24.977542]  ? __list_del_entry_valid_or_report+0xaa/0xc0
[   24.978891]  ? exc_invalid_op+0x50/0x70
[   24.979870]  ? __list_del_entry_valid_or_report+0xaa/0xc0
[   24.981295]  ? asm_exc_invalid_op+0x1a/0x20
[   24.982389]  ? __list_del_entry_valid_or_report+0xaa/0xc0
[   24.983781]  shrink_folio_list+0x39a/0x1200
[   24.984898]  shrink_inactive_list+0x1c0/0x420
[   24.986082]  shrink_lruvec+0x5db/0x780
[   24.987078]  shrink_node+0x243/0xb00
[   24.988063]  ? get_pfnblock_flags_mask.constprop.117+0x1d/0x50
[   24.989622]  do_try_to_free_pages+0xbd/0x4e0
[   24.990732]  try_to_free_mem_cgroup_pages+0x107/0x230
[   24.992034]  try_charge_memcg+0x184/0x5d0
[   24.993145]  obj_cgroup_charge_pages+0x38/0x110
[   24.994326]  __memcg_kmem_charge_page+0x8d/0xf0
[   24.995531]  __alloc_pages_noprof+0x278/0x360
[   24.996712]  alloc_pages_mpol_noprof+0xf0/0x230
[   24.997896]  pipe_write+0x2ad/0x5f0
[   24.998837]  ? __pfx_tick_nohz_handler+0x10/0x10
[   25.000234]  ? update_process_times+0x8c/0xa0
[   25.001377]  ? timerqueue_add+0x77/0x90
[   25.002257]  vfs_write+0x39b/0x420
[   25.003083]  ksys_write+0xbd/0xd0
[   25.003950]  do_syscall_64+0x47/0x110
[   25.004917]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[   25.006210] RIP: 0033:0x7fe5a246f784
[   25.007149] Code: c7 00 16 00 00 00 b8 ff ff ff ff c3 66 2e 0f 1f
84 00 00 00 00 00 f3 0f 1e fa 80 3d c5 08 0e 00 00 74 13 b8 01 00 00
00 0f 05 <48> 3d 00 f0 ff ff 77 54 c3 0f 1f 00 55 48 89 e5 48 83 ec 20
48 89
[   25.011961] RSP: 002b:00007ffdb0057b38 EFLAGS: 00000202 ORIG_RAX:
0000000000000001
[   25.013946] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007fe5a246f784
[   25.015817] RDX: 0000000000008000 RSI: 0000558c0d311420 RDI: 0000000000000001
[   25.017717] RBP: 00007ffdb0057b60 R08: 0000558c0d258c40 R09: 0000558c0d311420
[   25.019618] R10: 00007ffdb0057600 R11: 0000000000000202 R12: 0000000000008000
[   25.021519] R13: 0000558c0d311420 R14: 0000000000000029 R15: 0000000000001f8d
[   25.023412]  </TASK>
[   25.023998] Modules linked in:
[   25.024900] ---[ end trace 0000000000000000 ]---
[   25.026329] RIP: 0010:__list_del_entry_valid_or_report+0xaa/0xc0
[   25.027885] Code: 8c ff 0f 0b 48 89 fe 48 c7 c7 f8 9d 51 82 e8 9d
36 8c ff 0f 0b 48 89 d1 48 89 f2 48 89 fe 48 c7 c7 30 9e 51 82 e8 86
36 8c ff <0f> 0b 48 c7 c7 80 9e 51 82 e8 78 36 8c ff 0f 0b 66 0f 1f 44
00 00
[   25.032525] RSP: 0018:ffffc90005c2b770 EFLAGS: 00010246
[   25.033892] RAX: 000000000000006d RBX: ffffea000491cf88 RCX: 0000000000000000
[   25.035758] RDX: 0000000000000000 RSI: ffff889ffee1c180 RDI: ffff889ffee1c180
[   25.037661] RBP: ffffea000491cf80 R08: 0000000000000000 R09: c0000000ffff7fff
[   25.039543] R10: 0000000000000001 R11: ffffc90005c2b5a8 R12: ffffc90005c2b954
[   25.041426] R13: ffffc90005c2bbc0 R14: ffffc90005c2b7c0 R15: ffffc90005c2b940
[   25.043323] FS:  00007fe5a235e740(0000) GS:ffff889ffee00000(0000)
knlGS:0000000000000000
[   25.045478] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   25.047013] CR2: 00007fe5a24ddcd0 CR3: 000000010cb40001 CR4: 0000000000770eb0
[   25.048935] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   25.050858] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   25.052881] PKRU: 55555554
[   25.053634] Kernel panic - not syncing: Fatal exception
[   25.056902] Kernel Offset: disabled
[   25.057827] ---[ end Kernel panic - not syncing: Fatal exception ]---

If I revert the fix and this patch, the panic is gone. Let me know if
I can help debug it.
Usama Arif Aug. 16, 2024, 5:02 p.m. UTC | #4
On 16/08/2024 17:55, Kairui Song wrote:
> On Fri, Aug 16, 2024 at 3:16 AM Usama Arif <usamaarif642@gmail.com> wrote:
>> [...]
> 
> Hi,
> 
> After the fix, I'm still seeing the panic below:
> [...]
> 
> If I revert the fix and this patch, the panic is gone. Let me know if
> I can help debug it.

Yes, this is also needed to prevent the list corruption hit in
shrink_folio_list(): the zapped tail folios were already added to the
caller's reclaim list by lru_add_page_tail(), so they must be deleted
from that list before being freed:

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a6ca454e1168..75f5b059e804 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3183,6 +3183,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 
                        folio_clear_active(new_folio);
                        folio_clear_unevictable(new_folio);
+                       list_del(&new_folio->lru);
                        if (!folio_batch_add(&free_folios, new_folio)) {
                                mem_cgroup_uncharge_folios(&free_folios);
                                free_unref_folios(&free_folios);


I have tested this, so it should be OK, but let me know otherwise.

I will include this in the next revision I will send soon.
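
Putting the two deltas together, the freeing branch in __split_huge_page()
would then read as below (just the v3 hunk with both fixes folded in, for
reference; not the final v4):

		if (list && folio_ref_freeze(new_folio, 2)) {
			VM_WARN_ON_ONCE_FOLIO(folio_test_lru(new_folio), new_folio);
			VM_WARN_ON_ONCE_FOLIO(folio_test_large(new_folio), new_folio);
			VM_WARN_ON_ONCE_FOLIO(folio_mapped(new_folio), new_folio);

			folio_clear_active(new_folio);
			folio_clear_unevictable(new_folio);
			/* unlink from the caller's reclaim list before freeing */
			list_del(&new_folio->lru);
			/* batch the zapped tail itself, not the head folio */
			if (!folio_batch_add(&free_folios, new_folio)) {
				mem_cgroup_uncharge_folios(&free_folios);
				free_unref_folios(&free_folios);
			}
			continue;
		}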

Thanks.
Kairui Song Aug. 16, 2024, 6:11 p.m. UTC | #5
On Sat, Aug 17, 2024 at 1:03 AM Usama Arif <usamaarif642@gmail.com> wrote:
> On 16/08/2024 17:55, Kairui Song wrote:
> > [...]
> >
> > If I revert the fix and this patch, the panic is gone. Let me know if
> > I can help debug it.
>
> Yes, this is also needed to prevent the list corruption hit in
> shrink_folio_list(): the zapped tail folios were already added to the
> caller's reclaim list by lru_add_page_tail(), so they must be deleted
> from that list before being freed:
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index a6ca454e1168..75f5b059e804 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3183,6 +3183,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
>
>                         folio_clear_active(new_folio);
>                         folio_clear_unevictable(new_folio);
> +                       list_del(&new_folio->lru);
>                         if (!folio_batch_add(&free_folios, new_folio)) {
>                                 mem_cgroup_uncharge_folios(&free_folios);
>                                 free_unref_folios(&free_folios);
>
>
> I have tested this, so it should be OK, but let me know otherwise.
>
> I will include this in the next revision I will send soon.
>
> Thanks.

Thanks for the update; the panic problem is gone.

Patch

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 04ee8abd6475..85a424e954be 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3059,7 +3059,9 @@  static void __split_huge_page(struct page *page, struct list_head *list,
 	unsigned int new_nr = 1 << new_order;
 	int order = folio_order(folio);
 	unsigned int nr = 1 << order;
+	struct folio_batch free_folios;
 
+	folio_batch_init(&free_folios);
 	/* complete memcg works before add pages to LRU */
 	split_page_memcg(head, order, new_order);
 
@@ -3143,6 +3145,26 @@  static void __split_huge_page(struct page *page, struct list_head *list,
 		if (subpage == page)
 			continue;
 		folio_unlock(new_folio);
+		/*
+		 * If a folio has only two references left, one inherited
+		 * from the isolation of its head and the other from
+		 * lru_add_page_tail() which we are about to drop, it means this
+		 * folio was concurrently zapped. Then we can safely free it
+		 * and save page reclaim or migration the trouble of trying it.
+		 */
+		if (list && folio_ref_freeze(new_folio, 2)) {
+			VM_WARN_ON_ONCE_FOLIO(folio_test_lru(new_folio), new_folio);
+			VM_WARN_ON_ONCE_FOLIO(folio_test_large(new_folio), new_folio);
+			VM_WARN_ON_ONCE_FOLIO(folio_mapped(new_folio), new_folio);
+
+			folio_clear_active(new_folio);
+			folio_clear_unevictable(new_folio);
+			if (!folio_batch_add(&free_folios, folio)) {
+				mem_cgroup_uncharge_folios(&free_folios);
+				free_unref_folios(&free_folios);
+			}
+			continue;
+		}
 
 		/*
 		 * Subpages may be freed if there wasn't any mapping
@@ -3153,6 +3175,11 @@  static void __split_huge_page(struct page *page, struct list_head *list,
 		 */
 		free_page_and_swap_cache(subpage);
 	}
+
+	if (free_folios.nr) {
+		mem_cgroup_uncharge_folios(&free_folios);
+		free_unref_folios(&free_folios);
+	}
 }
 
 /* Racy check whether the huge page can be split */