diff mbox series

[v3,3/3] mm: Batch-zap large anonymous folio PTE mappings

Message ID 20230720112955.643283-4-ryan.roberts@arm.com (mailing list archive)
State New
Headers show
Series Optimize large folio interaction with deferred split | expand

Commit Message

Ryan Roberts July 20, 2023, 11:29 a.m. UTC
This allows batching the rmap removal with folio_remove_rmap_range(),
which means we avoid spuriously adding a partially unmapped folio to the
deferred split queue in the common case, which reduces split queue lock
contention.

Previously each page was removed from the rmap individually with
page_remove_rmap(). If the first page belonged to a large folio, this
would cause page_remove_rmap() to conclude that the folio was now
partially mapped and add the folio to the deferred split queue. But
subsequent calls would cause the folio to become fully unmapped, meaning
there is no value to adding it to the split queue.

Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
---
 mm/memory.c | 120 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 120 insertions(+)

Comments

Nathan Chancellor July 26, 2023, 4:19 p.m. UTC | #1
Hi Ryan,

On Thu, Jul 20, 2023 at 12:29:55PM +0100, Ryan Roberts wrote:
> This allows batching the rmap removal with folio_remove_rmap_range(),
> which means we avoid spuriously adding a partially unmapped folio to the
> deferred split queue in the common case, which reduces split queue lock
> contention.
> 
> Previously each page was removed from the rmap individually with
> page_remove_rmap(). If the first page belonged to a large folio, this
> would cause page_remove_rmap() to conclude that the folio was now
> partially mapped and add the folio to the deferred split queue. But
> subsequent calls would cause the folio to become fully unmapped, meaning
> there is no value to adding it to the split queue.
> 
> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
> ---
>  mm/memory.c | 120 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 120 insertions(+)
> 
> diff --git a/mm/memory.c b/mm/memory.c
> index 01f39e8144ef..189b1cfd823d 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1391,6 +1391,94 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
>  	pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
>  }
>  
> +static inline unsigned long page_cont_mapped_vaddr(struct page *page,
> +				struct page *anchor, unsigned long anchor_vaddr)
> +{
> +	unsigned long offset;
> +	unsigned long vaddr;
> +
> +	offset = (page_to_pfn(page) - page_to_pfn(anchor)) << PAGE_SHIFT;
> +	vaddr = anchor_vaddr + offset;
> +
> +	if (anchor > page) {
> +		if (vaddr > anchor_vaddr)
> +			return 0;
> +	} else {
> +		if (vaddr < anchor_vaddr)
> +			return ULONG_MAX;
> +	}
> +
> +	return vaddr;
> +}
> +
> +static int folio_nr_pages_cont_mapped(struct folio *folio,
> +				      struct page *page, pte_t *pte,
> +				      unsigned long addr, unsigned long end)
> +{
> +	pte_t ptent;
> +	int floops;
> +	int i;
> +	unsigned long pfn;
> +	struct page *folio_end;
> +
> +	if (!folio_test_large(folio))
> +		return 1;
> +
> +	folio_end = &folio->page + folio_nr_pages(folio);
> +	end = min(page_cont_mapped_vaddr(folio_end, page, addr), end);
> +	floops = (end - addr) >> PAGE_SHIFT;
> +	pfn = page_to_pfn(page);
> +	pfn++;
> +	pte++;
> +
> +	for (i = 1; i < floops; i++) {
> +		ptent = ptep_get(pte);
> +
> +		if (!pte_present(ptent) || pte_pfn(ptent) != pfn)
> +			break;
> +
> +		pfn++;
> +		pte++;
> +	}
> +
> +	return i;
> +}
> +
> +static unsigned long try_zap_anon_pte_range(struct mmu_gather *tlb,
> +					    struct vm_area_struct *vma,
> +					    struct folio *folio,
> +					    struct page *page, pte_t *pte,
> +					    unsigned long addr, int nr_pages,
> +					    struct zap_details *details)
> +{
> +	struct mm_struct *mm = tlb->mm;
> +	pte_t ptent;
> +	bool full;
> +	int i;
> +
> +	for (i = 0; i < nr_pages;) {
> +		ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
> +		tlb_remove_tlb_entry(tlb, pte, addr);
> +		zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
> +		full = __tlb_remove_page(tlb, page, 0);
> +
> +		if (unlikely(page_mapcount(page) < 1))
> +			print_bad_pte(vma, addr, ptent, page);
> +
> +		i++;
> +		page++;
> +		pte++;
> +		addr += PAGE_SIZE;
> +
> +		if (unlikely(full))
> +			break;
> +	}
> +
> +	folio_remove_rmap_range(folio, page - i, i, vma);
> +
> +	return i;
> +}
> +
>  static unsigned long zap_pte_range(struct mmu_gather *tlb,
>  				struct vm_area_struct *vma, pmd_t *pmd,
>  				unsigned long addr, unsigned long end,
> @@ -1428,6 +1516,38 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
>  			page = vm_normal_page(vma, addr, ptent);
>  			if (unlikely(!should_zap_page(details, page)))
>  				continue;
> +
> +			/*
> +			 * Batch zap large anonymous folio mappings. This allows
> +			 * batching the rmap removal, which means we avoid
> +			 * spuriously adding a partially unmapped folio to the
> +			 * deferred split queue in the common case, which
> +			 * reduces split queue lock contention.
> +			 */
> +			if (page && PageAnon(page)) {
> +				struct folio *folio = page_folio(page);
> +				int nr_pages_req, nr_pages;
> +
> +				nr_pages_req = folio_nr_pages_cont_mapped(
> +						folio, page, pte, addr, end);
> +
> +				nr_pages = try_zap_anon_pte_range(tlb, vma,
> +						folio, page, pte, addr,
> +						nr_pages_req, details);
> +
> +				rss[mm_counter(page)] -= nr_pages;
> +				nr_pages--;
> +				pte += nr_pages;
> +				addr += nr_pages << PAGE_SHIFT;
> +
> +				if (unlikely(nr_pages < nr_pages_req)) {
> +					force_flush = 1;
> +					addr += PAGE_SIZE;
> +					break;
> +				}
> +				continue;
> +			}
> +
>  			ptent = ptep_get_and_clear_full(mm, addr, pte,
>  							tlb->fullmm);
>  			tlb_remove_tlb_entry(tlb, pte, addr);
> -- 
> 2.25.1
> 

After this change in -next as commit 904d9713b3b0 ("mm: batch-zap large
anonymous folio PTE mappings"), I see the following splats several times
when booting Debian's s390x configuration (which I have mirrored at [1])
in QEMU (bisect log below):

$ qemu-system-s390x \
    -display none \
    -nodefaults \
    -M s390-ccw-virtio \
    -kernel arch/s390/boot/bzImage \
    -initrd rootfs.cpio \
    -m 512m \
    -serial mon:stdio
KASLR disabled: CPU has no PRNG
KASLR disabled: CPU has no PRNG
[    2.502282] Linux version 6.5.0-rc3+ (nathan@dev-arch.thelio-3990X) (s390-linux-gcc (GCC) 13.1.0, GNU ld (GNU Binutils) 2.40) #1 SMP Wed Jul 26 09:14:20 MST 2023
...
[    3.406011] Freeing initrd memory: 7004K
[    3.492739] BUG: Bad page state in process modprobe  pfn:01b18
[    3.492909] page:00000000233d9f2f refcount:0 mapcount:1 mapping:0000000000000000 index:0xdb pfn:0x1b18
[    3.492998] flags: 0xa0004(uptodate|mappedtodisk|swapbacked|zone=0)
[    3.493195] page_type: 0x0()
[    3.493457] raw: 00000000000a0004 0000000000000100 0000000000000122 0000000000000000
[    3.493492] raw: 00000000000000db 0000000000000000 0000000000000000 0000000000000000
[    3.493525] page dumped because: nonzero mapcount
[    3.493549] Modules linked in:
[    3.493719] CPU: 0 PID: 38 Comm: modprobe Not tainted 6.5.0-rc3+ #1
[    3.493814] Hardware name: QEMU 8561 QEMU (KVM/Linux)
[    3.493892] Call Trace:
[    3.494117]  [<0000000000add35a>] dump_stack_lvl+0x62/0x88
[    3.494333]  [<00000000003d565a>] bad_page+0x8a/0x130
[    3.494355]  [<00000000003d6728>] free_unref_page_prepare+0x268/0x3d8
[    3.494375]  [<00000000003d9408>] free_unref_page+0x48/0x140
[    3.494394]  [<00000000003ad99c>] unmap_page_range+0x924/0x1388
[    3.494412]  [<00000000003ae54c>] unmap_vmas+0x14c/0x200
[    3.494429]  [<00000000003be2f2>] exit_mmap+0xba/0x3a0
[    3.494447]  [<0000000000147000>] __mmput+0x50/0x180
[    3.494466]  [<0000000000152a00>] do_exit+0x320/0xb40
[    3.494484]  [<0000000000153450>] do_group_exit+0x40/0xb8
[    3.494502]  [<00000000001534f6>] __s390x_sys_exit_group+0x2e/0x30
[    3.494520]  [<0000000000b05080>] __do_syscall+0x1e8/0x210
[    3.494539]  [<0000000000b15970>] system_call+0x70/0x98
[    3.494663] Disabling lock debugging due to kernel taint
[    3.494809] BUG: Bad page map in process modprobe  pte:01b1831f pmd:1fff9000
[    3.494833] page:00000000233d9f2f refcount:0 mapcount:0 mapping:0000000000000000 index:0xdb pfn:0x1b18
[    3.494852] flags: 0xa0004(uptodate|mappedtodisk|swapbacked|zone=0)
[    3.494866] page_type: 0xffffffff()
[    3.494882] raw: 00000000000a0004 0000000000000100 0000000000000122 0000000000000000
[    3.494898] raw: 00000000000000db 0000000000000000 ffffffff00000000 0000000000000000
[    3.494908] page dumped because: bad pte
[    3.494923] addr:000002aa1d75c000 vm_flags:08100071 anon_vma:000000001fffc340 mapping:000000000286d6b8 index:db
[    3.494983] file:busybox fault:shmem_fault mmap:shmem_mmap read_folio:0x0
[    3.495247] CPU: 0 PID: 38 Comm: modprobe Tainted: G    B              6.5.0-rc3+ #1
[    3.495267] Hardware name: QEMU 8561 QEMU (KVM/Linux)
[    3.495277] Call Trace:
[    3.495285]  [<0000000000add35a>] dump_stack_lvl+0x62/0x88
[    3.495307]  [<00000000003ab30e>] print_bad_pte+0x176/0x2c8
[    3.495324]  [<00000000003ae098>] unmap_page_range+0x1020/0x1388
[    3.495341]  [<00000000003ae54c>] unmap_vmas+0x14c/0x200
[    3.495357]  [<00000000003be2f2>] exit_mmap+0xba/0x3a0
[    3.495375]  [<0000000000147000>] __mmput+0x50/0x180
[    3.495394]  [<0000000000152a00>] do_exit+0x320/0xb40
[    3.495411]  [<0000000000153450>] do_group_exit+0x40/0xb8
[    3.495429]  [<00000000001534f6>] __s390x_sys_exit_group+0x2e/0x30
[    3.495447]  [<0000000000b05080>] __do_syscall+0x1e8/0x210
[    3.495465]  [<0000000000b15970>] system_call+0x70/0x98
...

The rootfs is available at [2] if it is relevant. I am happy to provide
any additional information or test patches as necessary.

Cheers,
Nathan

[1]: https://github.com/nathanchance/llvm-kernel-testing/blob/79aa4ab2edc595979366c427cb49f477c7f31c68/configs/debian/s390x.config
[2]: https://github.com/ClangBuiltLinux/boot-utils/releases/download/20230707-182910/s390-rootfs.cpio.zst

# bad: [0ba5d07205771c50789fd9063950aa75e7f1183f] Add linux-next specific files for 20230726
# good: [18b44bc5a67275641fb26f2c54ba7eef80ac5950] ovl: Always reevaluate the file signature for IMA
git bisect start '0ba5d07205771c50789fd9063950aa75e7f1183f' '18b44bc5a67275641fb26f2c54ba7eef80ac5950'
# bad: [8fe1b33ece8f8fe1377082e839817886cb8c0f81] Merge branch 'main' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git
git bisect bad 8fe1b33ece8f8fe1377082e839817886cb8c0f81
# bad: [932bd67958459da3dc755b5bea7758e9d951dee5] Merge branch 'ti-next' of git://git.kernel.org/pub/scm/linux/kernel/git/ti/linux.git
git bisect bad 932bd67958459da3dc755b5bea7758e9d951dee5
# bad: [a4abec0a3653fb9dfb3ea6cea2ad1d36f507ca97] Merge branch 'perf-tools-next' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git
git bisect bad a4abec0a3653fb9dfb3ea6cea2ad1d36f507ca97
# bad: [5a52022bde252d090e051077af297dcfeff9fd0d] powerpc/book3s64/radix: add debug message to give more details of vmemmap allocation
git bisect bad 5a52022bde252d090e051077af297dcfeff9fd0d
# good: [671115657ee2403d18cb849061d7245687d9fdc5] mm/pgtable: notes on pte_offset_map[_lock]()
git bisect good 671115657ee2403d18cb849061d7245687d9fdc5
# good: [26c3a4fe0eb027ff00ad42168c8732db0c0b40d7] arm64/smmu: use TLBI ASID when invalidating entire range
git bisect good 26c3a4fe0eb027ff00ad42168c8732db0c0b40d7
# bad: [8585d0b53780f11cad8dad37997369949e3d5043] mm: memcg: use rstat for non-hierarchical stats
git bisect bad 8585d0b53780f11cad8dad37997369949e3d5043
# bad: [9abfe35eb187c3f79af5bb07c2f9815a480c4965] mm/compaction: correct comment of candidate pfn in fast_isolate_freepages
git bisect bad 9abfe35eb187c3f79af5bb07c2f9815a480c4965
# bad: [208f64c37a4e22b25b8037776c5713545eaf54fa] selftests: line buffer test program's stdout
git bisect bad 208f64c37a4e22b25b8037776c5713545eaf54fa
# good: [08356142587c28b86817646ff318317b5237fdeb] mmu_notifiers: rename invalidate_range notifier
git bisect good 08356142587c28b86817646ff318317b5237fdeb
# good: [652555287069f2c0bbbfaf262eb41638f5c87550] mm: allow deferred splitting of arbitrary large anon folios
git bisect good 652555287069f2c0bbbfaf262eb41638f5c87550
# bad: [904d9713b3b0e64329b2f6d159966b5c737444ff] mm: batch-zap large anonymous folio PTE mappings
git bisect bad 904d9713b3b0e64329b2f6d159966b5c737444ff
# good: [9a7c14665a566fbc1adc2c35982898abc1546525] mm: implement folio_remove_rmap_range()
git bisect good 9a7c14665a566fbc1adc2c35982898abc1546525
# first bad commit: [904d9713b3b0e64329b2f6d159966b5c737444ff] mm: batch-zap large anonymous folio PTE mappings
Ryan Roberts July 26, 2023, 4:32 p.m. UTC | #2
On 26/07/2023 17:19, Nathan Chancellor wrote:
> Hi Ryan,
> 
> On Thu, Jul 20, 2023 at 12:29:55PM +0100, Ryan Roberts wrote:
>> This allows batching the rmap removal with folio_remove_rmap_range(),
>> which means we avoid spuriously adding a partially unmapped folio to the
>> deferred split queue in the common case, which reduces split queue lock
>> contention.
>>
>> Previously each page was removed from the rmap individually with
>> page_remove_rmap(). If the first page belonged to a large folio, this
>> would cause page_remove_rmap() to conclude that the folio was now
>> partially mapped and add the folio to the deferred split queue. But
>> subsequent calls would cause the folio to become fully unmapped, meaning
>> there is no value to adding it to the split queue.
>>
>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>> ---
>>  mm/memory.c | 120 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 120 insertions(+)
>>
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 01f39e8144ef..189b1cfd823d 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -1391,6 +1391,94 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
>>  	pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
>>  }
>>  
>> +static inline unsigned long page_cont_mapped_vaddr(struct page *page,
>> +				struct page *anchor, unsigned long anchor_vaddr)
>> +{
>> +	unsigned long offset;
>> +	unsigned long vaddr;
>> +
>> +	offset = (page_to_pfn(page) - page_to_pfn(anchor)) << PAGE_SHIFT;
>> +	vaddr = anchor_vaddr + offset;
>> +
>> +	if (anchor > page) {
>> +		if (vaddr > anchor_vaddr)
>> +			return 0;
>> +	} else {
>> +		if (vaddr < anchor_vaddr)
>> +			return ULONG_MAX;
>> +	}
>> +
>> +	return vaddr;
>> +}
>> +
>> +static int folio_nr_pages_cont_mapped(struct folio *folio,
>> +				      struct page *page, pte_t *pte,
>> +				      unsigned long addr, unsigned long end)
>> +{
>> +	pte_t ptent;
>> +	int floops;
>> +	int i;
>> +	unsigned long pfn;
>> +	struct page *folio_end;
>> +
>> +	if (!folio_test_large(folio))
>> +		return 1;
>> +
>> +	folio_end = &folio->page + folio_nr_pages(folio);
>> +	end = min(page_cont_mapped_vaddr(folio_end, page, addr), end);
>> +	floops = (end - addr) >> PAGE_SHIFT;
>> +	pfn = page_to_pfn(page);
>> +	pfn++;
>> +	pte++;
>> +
>> +	for (i = 1; i < floops; i++) {
>> +		ptent = ptep_get(pte);
>> +
>> +		if (!pte_present(ptent) || pte_pfn(ptent) != pfn)
>> +			break;
>> +
>> +		pfn++;
>> +		pte++;
>> +	}
>> +
>> +	return i;
>> +}
>> +
>> +static unsigned long try_zap_anon_pte_range(struct mmu_gather *tlb,
>> +					    struct vm_area_struct *vma,
>> +					    struct folio *folio,
>> +					    struct page *page, pte_t *pte,
>> +					    unsigned long addr, int nr_pages,
>> +					    struct zap_details *details)
>> +{
>> +	struct mm_struct *mm = tlb->mm;
>> +	pte_t ptent;
>> +	bool full;
>> +	int i;
>> +
>> +	for (i = 0; i < nr_pages;) {
>> +		ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
>> +		tlb_remove_tlb_entry(tlb, pte, addr);
>> +		zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
>> +		full = __tlb_remove_page(tlb, page, 0);
>> +
>> +		if (unlikely(page_mapcount(page) < 1))
>> +			print_bad_pte(vma, addr, ptent, page);
>> +
>> +		i++;
>> +		page++;
>> +		pte++;
>> +		addr += PAGE_SIZE;
>> +
>> +		if (unlikely(full))
>> +			break;
>> +	}
>> +
>> +	folio_remove_rmap_range(folio, page - i, i, vma);
>> +
>> +	return i;
>> +}
>> +
>>  static unsigned long zap_pte_range(struct mmu_gather *tlb,
>>  				struct vm_area_struct *vma, pmd_t *pmd,
>>  				unsigned long addr, unsigned long end,
>> @@ -1428,6 +1516,38 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
>>  			page = vm_normal_page(vma, addr, ptent);
>>  			if (unlikely(!should_zap_page(details, page)))
>>  				continue;
>> +
>> +			/*
>> +			 * Batch zap large anonymous folio mappings. This allows
>> +			 * batching the rmap removal, which means we avoid
>> +			 * spuriously adding a partially unmapped folio to the
>> +			 * deferred split queue in the common case, which
>> +			 * reduces split queue lock contention.
>> +			 */
>> +			if (page && PageAnon(page)) {
>> +				struct folio *folio = page_folio(page);
>> +				int nr_pages_req, nr_pages;
>> +
>> +				nr_pages_req = folio_nr_pages_cont_mapped(
>> +						folio, page, pte, addr, end);
>> +
>> +				nr_pages = try_zap_anon_pte_range(tlb, vma,
>> +						folio, page, pte, addr,
>> +						nr_pages_req, details);
>> +
>> +				rss[mm_counter(page)] -= nr_pages;
>> +				nr_pages--;
>> +				pte += nr_pages;
>> +				addr += nr_pages << PAGE_SHIFT;
>> +
>> +				if (unlikely(nr_pages < nr_pages_req)) {
>> +					force_flush = 1;
>> +					addr += PAGE_SIZE;
>> +					break;
>> +				}
>> +				continue;
>> +			}
>> +
>>  			ptent = ptep_get_and_clear_full(mm, addr, pte,
>>  							tlb->fullmm);
>>  			tlb_remove_tlb_entry(tlb, pte, addr);
>> -- 
>> 2.25.1
>>
> 
> After this change in -next as commit 904d9713b3b0 ("mm: batch-zap large
> anonymous folio PTE mappings"), I see the following splats several times
> when booting Debian's s390x configuration (which I have mirrored at [1])
> in QEMU (bisect log below):

Thanks for the bug report and sorry for the inconvenience. I'm going to need a
little time to figure out a build environment for s390x and get it reproducing.
Hopefully I can come back to you tomorrow with a fix.

Thanks,
Ryan
Nathan Chancellor July 26, 2023, 4:35 p.m. UTC | #3
On Wed, Jul 26, 2023 at 05:32:07PM +0100, Ryan Roberts wrote:
> On 26/07/2023 17:19, Nathan Chancellor wrote:
> > Hi Ryan,
> > 
> > On Thu, Jul 20, 2023 at 12:29:55PM +0100, Ryan Roberts wrote:
> >> This allows batching the rmap removal with folio_remove_rmap_range(),
> >> which means we avoid spuriously adding a partially unmapped folio to the
> >> deferred split queue in the common case, which reduces split queue lock
> >> contention.
> >>
> >> Previously each page was removed from the rmap individually with
> >> page_remove_rmap(). If the first page belonged to a large folio, this
> >> would cause page_remove_rmap() to conclude that the folio was now
> >> partially mapped and add the folio to the deferred split queue. But
> >> subsequent calls would cause the folio to become fully unmapped, meaning
> >> there is no value to adding it to the split queue.
> >>
> >> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
> >> ---
> >>  mm/memory.c | 120 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> >>  1 file changed, 120 insertions(+)
> >>
> >> diff --git a/mm/memory.c b/mm/memory.c
> >> index 01f39e8144ef..189b1cfd823d 100644
> >> --- a/mm/memory.c
> >> +++ b/mm/memory.c
> >> @@ -1391,6 +1391,94 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
> >>  	pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
> >>  }
> >>  
> >> +static inline unsigned long page_cont_mapped_vaddr(struct page *page,
> >> +				struct page *anchor, unsigned long anchor_vaddr)
> >> +{
> >> +	unsigned long offset;
> >> +	unsigned long vaddr;
> >> +
> >> +	offset = (page_to_pfn(page) - page_to_pfn(anchor)) << PAGE_SHIFT;
> >> +	vaddr = anchor_vaddr + offset;
> >> +
> >> +	if (anchor > page) {
> >> +		if (vaddr > anchor_vaddr)
> >> +			return 0;
> >> +	} else {
> >> +		if (vaddr < anchor_vaddr)
> >> +			return ULONG_MAX;
> >> +	}
> >> +
> >> +	return vaddr;
> >> +}
> >> +
> >> +static int folio_nr_pages_cont_mapped(struct folio *folio,
> >> +				      struct page *page, pte_t *pte,
> >> +				      unsigned long addr, unsigned long end)
> >> +{
> >> +	pte_t ptent;
> >> +	int floops;
> >> +	int i;
> >> +	unsigned long pfn;
> >> +	struct page *folio_end;
> >> +
> >> +	if (!folio_test_large(folio))
> >> +		return 1;
> >> +
> >> +	folio_end = &folio->page + folio_nr_pages(folio);
> >> +	end = min(page_cont_mapped_vaddr(folio_end, page, addr), end);
> >> +	floops = (end - addr) >> PAGE_SHIFT;
> >> +	pfn = page_to_pfn(page);
> >> +	pfn++;
> >> +	pte++;
> >> +
> >> +	for (i = 1; i < floops; i++) {
> >> +		ptent = ptep_get(pte);
> >> +
> >> +		if (!pte_present(ptent) || pte_pfn(ptent) != pfn)
> >> +			break;
> >> +
> >> +		pfn++;
> >> +		pte++;
> >> +	}
> >> +
> >> +	return i;
> >> +}
> >> +
> >> +static unsigned long try_zap_anon_pte_range(struct mmu_gather *tlb,
> >> +					    struct vm_area_struct *vma,
> >> +					    struct folio *folio,
> >> +					    struct page *page, pte_t *pte,
> >> +					    unsigned long addr, int nr_pages,
> >> +					    struct zap_details *details)
> >> +{
> >> +	struct mm_struct *mm = tlb->mm;
> >> +	pte_t ptent;
> >> +	bool full;
> >> +	int i;
> >> +
> >> +	for (i = 0; i < nr_pages;) {
> >> +		ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
> >> +		tlb_remove_tlb_entry(tlb, pte, addr);
> >> +		zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
> >> +		full = __tlb_remove_page(tlb, page, 0);
> >> +
> >> +		if (unlikely(page_mapcount(page) < 1))
> >> +			print_bad_pte(vma, addr, ptent, page);
> >> +
> >> +		i++;
> >> +		page++;
> >> +		pte++;
> >> +		addr += PAGE_SIZE;
> >> +
> >> +		if (unlikely(full))
> >> +			break;
> >> +	}
> >> +
> >> +	folio_remove_rmap_range(folio, page - i, i, vma);
> >> +
> >> +	return i;
> >> +}
> >> +
> >>  static unsigned long zap_pte_range(struct mmu_gather *tlb,
> >>  				struct vm_area_struct *vma, pmd_t *pmd,
> >>  				unsigned long addr, unsigned long end,
> >> @@ -1428,6 +1516,38 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
> >>  			page = vm_normal_page(vma, addr, ptent);
> >>  			if (unlikely(!should_zap_page(details, page)))
> >>  				continue;
> >> +
> >> +			/*
> >> +			 * Batch zap large anonymous folio mappings. This allows
> >> +			 * batching the rmap removal, which means we avoid
> >> +			 * spuriously adding a partially unmapped folio to the
> >> +			 * deferred split queue in the common case, which
> >> +			 * reduces split queue lock contention.
> >> +			 */
> >> +			if (page && PageAnon(page)) {
> >> +				struct folio *folio = page_folio(page);
> >> +				int nr_pages_req, nr_pages;
> >> +
> >> +				nr_pages_req = folio_nr_pages_cont_mapped(
> >> +						folio, page, pte, addr, end);
> >> +
> >> +				nr_pages = try_zap_anon_pte_range(tlb, vma,
> >> +						folio, page, pte, addr,
> >> +						nr_pages_req, details);
> >> +
> >> +				rss[mm_counter(page)] -= nr_pages;
> >> +				nr_pages--;
> >> +				pte += nr_pages;
> >> +				addr += nr_pages << PAGE_SHIFT;
> >> +
> >> +				if (unlikely(nr_pages < nr_pages_req)) {
> >> +					force_flush = 1;
> >> +					addr += PAGE_SIZE;
> >> +					break;
> >> +				}
> >> +				continue;
> >> +			}
> >> +
> >>  			ptent = ptep_get_and_clear_full(mm, addr, pte,
> >>  							tlb->fullmm);
> >>  			tlb_remove_tlb_entry(tlb, pte, addr);
> >> -- 
> >> 2.25.1
> >>
> > 
> > After this change in -next as commit 904d9713b3b0 ("mm: batch-zap large
> > anonymous folio PTE mappings"), I see the following splats several times
> > when booting Debian's s390x configuration (which I have mirrored at [1])
> > in QEMU (bisect log below):
> 
> Thanks for the bug report and sorry for the inconvenience. I'm going to need a
> little time to figure out a build environment for s390x and get it reproducing.
> Hopefully I can come back to you tomorrow with a fix.

No worries! For what it's worth, if you are not already aware of it,
there are GCC toolchains on kernel.org, which is what I use in general
and in this particular case:

https://kernel.org/pub/tools/crosstool/

You can just download them to somewhere on your drive then use
CROSS_COMPILE=.../bin/s390-linux-gnu-, rather than downloading a bunch
of distribution packages.

Cheers,
Nathan
Ryan Roberts July 26, 2023, 7:38 p.m. UTC | #4
On 26/07/2023 17:19, Nathan Chancellor wrote:
> Hi Ryan,
> 
> On Thu, Jul 20, 2023 at 12:29:55PM +0100, Ryan Roberts wrote:

...

>>
> 
> After this change in -next as commit 904d9713b3b0 ("mm: batch-zap large
> anonymous folio PTE mappings"), I see the following splats several times
> when booting Debian's s390x configuration (which I have mirrored at [1])
> in QEMU (bisect log below):
> 
> $ qemu-system-s390x \
>     -display none \
>     -nodefaults \
>     -M s390-ccw-virtio \
>     -kernel arch/s390/boot/bzImage \
>     -initrd rootfs.cpio \
>     -m 512m \
>     -serial mon:stdio

I'm compiling the kernel for next-20230726 using the s390 cross compiler from kernel.org and the config you linked. Then booting with qemu-system-s390x (tried both distro's 4.2.1 and locally built 8.0.3) and the initrd you provided (tried passing it compressed and uncompressed), but I'm always getting a kernel panic due to not finding a rootfs:

$ qemu-system-s390x \
	-display none \
	-nodefaults \
	-M s390-ccw-virtio \
	-kernel arch/s390/boot/bzImage
	-initrd ../s390-rootfs.cpio.zst \
	-m 512m \
	-serial mon:stdio
KASLR disabled: CPU has no PRNG
KASLR disabled: CPU has no PRNG
Linux version 6.5.0-rc3-next-20230726 (ryarob01@e125769) (s390-linux-gcc (GCC) 13.1.0, GNU ld (GNU Binutils) 2.40) #1 SMP Wed Jul 26 19:56:26 BST 2023
setup: Linux is running under KVM in 64-bit mode
setup: The maximum memory size is 512MB
setup: Relocating AMODE31 section of size 0x00003000
cpu: 1 configured CPUs, 0 standby CPUs
Write protected kernel read-only data: 4036k
Zone ranges:
  DMA      [mem 0x0000000000000000-0x000000007fffffff]
  Normal   empty
Movable zone start for each node
Early memory node ranges
  node   0: [mem 0x0000000000000000-0x000000001fffffff]
Initmem setup node 0 [mem 0x0000000000000000-0x000000001fffffff]
percpu: Embedded 14 pages/cpu s26368 r0 d30976 u57344
Kernel command line: 
random: crng init done
Dentry cache hash table entries: 65536 (order: 7, 524288 bytes, linear)
Inode-cache hash table entries: 32768 (order: 6, 262144 bytes, linear)
Built 1 zonelists, mobility grouping on.  Total pages: 129024
mem auto-init: stack:all(zero), heap alloc:off, heap free:off
Memory: 507720K/524288K available (3464K kernel code, 788K rwdata, 572K rodata, 796K init, 400K bss, 16568K reserved, 0K cma-reserved)
SLUB: HWalign=256, Order=0-3, MinObjects=0, CPUs=1, Nodes=1
rcu: Hierarchical RCU implementation.
rcu:    RCU restricting CPUs from NR_CPUS=64 to nr_cpu_ids=1.
rcu: RCU calculated value of scheduler-enlistment delay is 25 jiffies.
rcu: Adjusting geometry for rcu_fanout_leaf=16, nr_cpu_ids=1
NR_IRQS: 3, nr_irqs: 3, preallocated irqs: 3
rcu: srcu_init: Setting srcu_struct sizes based on contention.
clocksource: tod: mask: 0xffffffffffffffff max_cycles: 0x3b0a9be803b0a9, max_idle_ns: 1805497147909793 ns
Console: colour dummy device 80x25
printk: console [ttysclp0] enabled
pid_max: default: 32768 minimum: 301
Mount-cache hash table entries: 1024 (order: 1, 8192 bytes, linear)
Mountpoint-cache hash table entries: 1024 (order: 1, 8192 bytes, linear)
rcu: Hierarchical SRCU implementation.
rcu:    Max phase no-delay instances is 1000.
smp: Bringing up secondary CPUs ...
smp: Brought up 1 node, 1 CPU
clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 7645041785100000 ns
futex hash table entries: 256 (order: 4, 65536 bytes, linear)
kvm-s390: SIE is not available
hypfs: The hardware system does not support hypfs
workingset: timestamp_bits=62 max_order=17 bucket_order=0
io scheduler mq-deadline registered
io scheduler kyber registered
cio: Channel measurement facility initialized using format extended (mode autodetected)
Discipline DIAG cannot be used without z/VM
vmur: The z/VM virtual unit record device driver cannot be loaded without z/VM
sclp_sd: Store Data request failed (eq=2, di=3, response=0x40f0, flags=0x00, status=0, rc=-5)
List of all partitions:
No filesystem could mount root, tried: 

Kernel panic - not syncing: VFS: Unable to mount root fs on unknown-block(1,0)
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.5.0-rc3-next-20230726 #1
Hardware name: QEMU 8561 QEMU (KVM/Linux)
Call Trace:
 [<0000000000432b22>] dump_stack_lvl+0x62/0x80 
 [<0000000000158898>] panic+0x2f8/0x310 
 [<00000000005b7a56>] mount_root_generic+0x276/0x3b8 
 [<00000000005b7e46>] prepare_namespace+0x56/0x220 
 [<00000000004593e8>] kernel_init+0x28/0x1c8 
 [<000000000010217e>] __ret_from_fork+0x36/0x50 
 [<000000000046143a>] ret_from_fork+0xa/0x30


Any idea what I'm doing wrong here? Do I need to specify something on the kernel command line? (I've tried root=/dev/ram0, but get the same result).

Thanks,
Ryan
Nathan Chancellor July 26, 2023, 7:50 p.m. UTC | #5
On Wed, Jul 26, 2023 at 08:38:25PM +0100, Ryan Roberts wrote:
> On 26/07/2023 17:19, Nathan Chancellor wrote:
> > Hi Ryan,
> > 
> > On Thu, Jul 20, 2023 at 12:29:55PM +0100, Ryan Roberts wrote:
> 
> ...
> 
> >>
> > 
> > After this change in -next as commit 904d9713b3b0 ("mm: batch-zap large
> > anonymous folio PTE mappings"), I see the following splats several times
> > when booting Debian's s390x configuration (which I have mirrored at [1])
> > in QEMU (bisect log below):
> > 
> > $ qemu-system-s390x \
> >     -display none \
> >     -nodefaults \
> >     -M s390-ccw-virtio \
> >     -kernel arch/s390/boot/bzImage \
> >     -initrd rootfs.cpio \
> >     -m 512m \
> >     -serial mon:stdio
> 
> I'm compiling the kernel for next-20230726 using the s390 cross compiler from kernel.org and the config you linked. Then booting with qemu-system-s390x (tried both distro's 4.2.1 and locally built 8.0.3) and the initrd you provided (tried passing it compressed and uncompressed), but I'm always getting a kernel panic due to not finding a rootfs:
> 
> $ qemu-system-s390x \
> 	-display none \
> 	-nodefaults \
> 	-M s390-ccw-virtio \
> 	-kernel arch/s390/boot/bzImage
> 	-initrd ../s390-rootfs.cpio.zst \
> 	-m 512m \
> 	-serial mon:stdio
> KASLR disabled: CPU has no PRNG
> KASLR disabled: CPU has no PRNG
> Linux version 6.5.0-rc3-next-20230726 (ryarob01@e125769) (s390-linux-gcc (GCC) 13.1.0, GNU ld (GNU Binutils) 2.40) #1 SMP Wed Jul 26 19:56:26 BST 2023
> setup: Linux is running under KVM in 64-bit mode
> setup: The maximum memory size is 512MB
> setup: Relocating AMODE31 section of size 0x00003000
> cpu: 1 configured CPUs, 0 standby CPUs
> Write protected kernel read-only data: 4036k
> Zone ranges:
>   DMA      [mem 0x0000000000000000-0x000000007fffffff]
>   Normal   empty
> Movable zone start for each node
> Early memory node ranges
>   node   0: [mem 0x0000000000000000-0x000000001fffffff]
> Initmem setup node 0 [mem 0x0000000000000000-0x000000001fffffff]
> percpu: Embedded 14 pages/cpu s26368 r0 d30976 u57344
> Kernel command line: 
> random: crng init done
> Dentry cache hash table entries: 65536 (order: 7, 524288 bytes, linear)
> Inode-cache hash table entries: 32768 (order: 6, 262144 bytes, linear)
> Built 1 zonelists, mobility grouping on.  Total pages: 129024
> mem auto-init: stack:all(zero), heap alloc:off, heap free:off
> Memory: 507720K/524288K available (3464K kernel code, 788K rwdata, 572K rodata, 796K init, 400K bss, 16568K reserved, 0K cma-reserved)
> SLUB: HWalign=256, Order=0-3, MinObjects=0, CPUs=1, Nodes=1
> rcu: Hierarchical RCU implementation.
> rcu:    RCU restricting CPUs from NR_CPUS=64 to nr_cpu_ids=1.
> rcu: RCU calculated value of scheduler-enlistment delay is 25 jiffies.
> rcu: Adjusting geometry for rcu_fanout_leaf=16, nr_cpu_ids=1
> NR_IRQS: 3, nr_irqs: 3, preallocated irqs: 3
> rcu: srcu_init: Setting srcu_struct sizes based on contention.
> clocksource: tod: mask: 0xffffffffffffffff max_cycles: 0x3b0a9be803b0a9, max_idle_ns: 1805497147909793 ns
> Console: colour dummy device 80x25
> printk: console [ttysclp0] enabled
> pid_max: default: 32768 minimum: 301
> Mount-cache hash table entries: 1024 (order: 1, 8192 bytes, linear)
> Mountpoint-cache hash table entries: 1024 (order: 1, 8192 bytes, linear)
> rcu: Hierarchical SRCU implementation.
> rcu:    Max phase no-delay instances is 1000.
> smp: Bringing up secondary CPUs ...
> smp: Brought up 1 node, 1 CPU
> clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 7645041785100000 ns
> futex hash table entries: 256 (order: 4, 65536 bytes, linear)
> kvm-s390: SIE is not available
> hypfs: The hardware system does not support hypfs
> workingset: timestamp_bits=62 max_order=17 bucket_order=0
> io scheduler mq-deadline registered
> io scheduler kyber registered
> cio: Channel measurement facility initialized using format extended (mode autodetected)
> Discipline DIAG cannot be used without z/VM
> vmur: The z/VM virtual unit record device driver cannot be loaded without z/VM
> sclp_sd: Store Data request failed (eq=2, di=3, response=0x40f0, flags=0x00, status=0, rc=-5)
> List of all partitions:
> No filesystem could mount root, tried: 
> 
> Kernel panic - not syncing: VFS: Unable to mount root fs on unknown-block(1,0)
> CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.5.0-rc3-next-20230726 #1
> Hardware name: QEMU 8561 QEMU (KVM/Linux)
> Call Trace:
>  [<0000000000432b22>] dump_stack_lvl+0x62/0x80 
>  [<0000000000158898>] panic+0x2f8/0x310 
>  [<00000000005b7a56>] mount_root_generic+0x276/0x3b8 
>  [<00000000005b7e46>] prepare_namespace+0x56/0x220 
>  [<00000000004593e8>] kernel_init+0x28/0x1c8 
>  [<000000000010217e>] __ret_from_fork+0x36/0x50 
>  [<000000000046143a>] ret_from_fork+0xa/0x30
> 
> 
> Any idea what I'm doing wrong here? Do I need to specify something on the kernel command line? (I've tried root=/dev/ram0, but get the same result).

Hmmm, interesting. The rootfs does need to be used decompressed (I think
the kernel does support zstd compressed initrds but we only compress
them to save space, not for running). Does the sha256sum sum match the
one I just tested?

$ curl -LSsO https://github.com/ClangBuiltLinux/boot-utils/releases/download/20230707-182910/s390-rootfs.cpio.zst
$ zstd -d s390-rootfs.cpio.zst
$ sha256sum s390-rootfs.cpio
948fb3c2ad65e26aee8eb0a069f5c9e1ab2c59e4b4f62b63ead271e12a8479b4  s390-rootfs.cpio
$ qemu-system-s390x -display none -nodefaults -M s390-ccw-virtio -kernel arch/s390/boot/bzImage -initrd s390-rootfs.cpio -m 512m -serial mon:stdio
...
[    7.890385] Trying to unpack rootfs image as initramfs...
...

I suppose it could be something with Kconfig too, here is the actual one
that olddefconfig produces for me:

https://gist.github.com/nathanchance/3e4c1721ac204bbb969e2f288e1695c9

Cheers,
Nathan
Ryan Roberts July 26, 2023, 9:17 p.m. UTC | #6
On 26/07/2023 20:50, Nathan Chancellor wrote:
> On Wed, Jul 26, 2023 at 08:38:25PM +0100, Ryan Roberts wrote:
>> On 26/07/2023 17:19, Nathan Chancellor wrote:
>>> Hi Ryan,
>>>
>>> On Thu, Jul 20, 2023 at 12:29:55PM +0100, Ryan Roberts wrote:
>>
>> ...
>>
>>>>
>>>
>>> After this change in -next as commit 904d9713b3b0 ("mm: batch-zap large
>>> anonymous folio PTE mappings"), I see the following splats several times
>>> when booting Debian's s390x configuration (which I have mirrored at [1])
>>> in QEMU (bisect log below):
>>>
>>> $ qemu-system-s390x \
>>>     -display none \
>>>     -nodefaults \
>>>     -M s390-ccw-virtio \
>>>     -kernel arch/s390/boot/bzImage \
>>>     -initrd rootfs.cpio \
>>>     -m 512m \
>>>     -serial mon:stdio
>>
>> I'm compiling the kernel for next-20230726 using the s390 cross compiler from kernel.org and the config you linked. Then booting with qemu-system-s390x (tried both distro's 4.2.1 and locally built 8.0.3) and the initrd you provided (tried passing it compressed and uncompressed), but I'm always getting a kernel panic due to not finding a rootfs:
>>
>> $ qemu-system-s390x \
>> 	-display none \
>> 	-nodefaults \
>> 	-M s390-ccw-virtio \
>> 	-kernel arch/s390/boot/bzImage \
>> 	-initrd ../s390-rootfs.cpio.zst \
>> 	-m 512m \
>> 	-serial mon:stdio
>> KASLR disabled: CPU has no PRNG
>> KASLR disabled: CPU has no PRNG
>> Linux version 6.5.0-rc3-next-20230726 (ryarob01@e125769) (s390-linux-gcc (GCC) 13.1.0, GNU ld (GNU Binutils) 2.40) #1 SMP Wed Jul 26 19:56:26 BST 2023
>> setup: Linux is running under KVM in 64-bit mode
>> setup: The maximum memory size is 512MB
>> setup: Relocating AMODE31 section of size 0x00003000
>> cpu: 1 configured CPUs, 0 standby CPUs
>> Write protected kernel read-only data: 4036k
>> Zone ranges:
>>   DMA      [mem 0x0000000000000000-0x000000007fffffff]
>>   Normal   empty
>> Movable zone start for each node
>> Early memory node ranges
>>   node   0: [mem 0x0000000000000000-0x000000001fffffff]
>> Initmem setup node 0 [mem 0x0000000000000000-0x000000001fffffff]
>> percpu: Embedded 14 pages/cpu s26368 r0 d30976 u57344
>> Kernel command line: 
>> random: crng init done
>> Dentry cache hash table entries: 65536 (order: 7, 524288 bytes, linear)
>> Inode-cache hash table entries: 32768 (order: 6, 262144 bytes, linear)
>> Built 1 zonelists, mobility grouping on.  Total pages: 129024
>> mem auto-init: stack:all(zero), heap alloc:off, heap free:off
>> Memory: 507720K/524288K available (3464K kernel code, 788K rwdata, 572K rodata, 796K init, 400K bss, 16568K reserved, 0K cma-reserved)
>> SLUB: HWalign=256, Order=0-3, MinObjects=0, CPUs=1, Nodes=1
>> rcu: Hierarchical RCU implementation.
>> rcu:    RCU restricting CPUs from NR_CPUS=64 to nr_cpu_ids=1.
>> rcu: RCU calculated value of scheduler-enlistment delay is 25 jiffies.
>> rcu: Adjusting geometry for rcu_fanout_leaf=16, nr_cpu_ids=1
>> NR_IRQS: 3, nr_irqs: 3, preallocated irqs: 3
>> rcu: srcu_init: Setting srcu_struct sizes based on contention.
>> clocksource: tod: mask: 0xffffffffffffffff max_cycles: 0x3b0a9be803b0a9, max_idle_ns: 1805497147909793 ns
>> Console: colour dummy device 80x25
>> printk: console [ttysclp0] enabled
>> pid_max: default: 32768 minimum: 301
>> Mount-cache hash table entries: 1024 (order: 1, 8192 bytes, linear)
>> Mountpoint-cache hash table entries: 1024 (order: 1, 8192 bytes, linear)
>> rcu: Hierarchical SRCU implementation.
>> rcu:    Max phase no-delay instances is 1000.
>> smp: Bringing up secondary CPUs ...
>> smp: Brought up 1 node, 1 CPU
>> clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 7645041785100000 ns
>> futex hash table entries: 256 (order: 4, 65536 bytes, linear)
>> kvm-s390: SIE is not available
>> hypfs: The hardware system does not support hypfs
>> workingset: timestamp_bits=62 max_order=17 bucket_order=0
>> io scheduler mq-deadline registered
>> io scheduler kyber registered
>> cio: Channel measurement facility initialized using format extended (mode autodetected)
>> Discipline DIAG cannot be used without z/VM
>> vmur: The z/VM virtual unit record device driver cannot be loaded without z/VM
>> sclp_sd: Store Data request failed (eq=2, di=3, response=0x40f0, flags=0x00, status=0, rc=-5)
>> List of all partitions:
>> No filesystem could mount root, tried: 
>>
>> Kernel panic - not syncing: VFS: Unable to mount root fs on unknown-block(1,0)
>> CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.5.0-rc3-next-20230726 #1
>> Hardware name: QEMU 8561 QEMU (KVM/Linux)
>> Call Trace:
>>  [<0000000000432b22>] dump_stack_lvl+0x62/0x80 
>>  [<0000000000158898>] panic+0x2f8/0x310 
>>  [<00000000005b7a56>] mount_root_generic+0x276/0x3b8 
>>  [<00000000005b7e46>] prepare_namespace+0x56/0x220 
>>  [<00000000004593e8>] kernel_init+0x28/0x1c8 
>>  [<000000000010217e>] __ret_from_fork+0x36/0x50 
>>  [<000000000046143a>] ret_from_fork+0xa/0x30
>>
>>
>> Any idea what I'm doing wrong here? Do I need to specify something on the kernel command line? (I've tried root=/dev/ram0, but get the same result).
> 
> Hmmm, interesting. The rootfs does need to be used decompressed (I think
> the kernel does support zstd compressed initrds but we only compress
> them to save space, not for running). Does the sha256sum sum match the
> one I just tested?
> 
> $ curl -LSsO https://github.com/ClangBuiltLinux/boot-utils/releases/download/20230707-182910/s390-rootfs.cpio.zst
> $ zstd -d s390-rootfs.cpio.zst
> $ sha256sum s390-rootfs.cpio
> 948fb3c2ad65e26aee8eb0a069f5c9e1ab2c59e4b4f62b63ead271e12a8479b4  s390-rootfs.cpio

Yes, this matches.

> $ qemu-system-s390x -display none -nodefaults -M s390-ccw-virtio -kernel arch/s390/boot/bzImage -initrd s390-rootfs.cpio -m 512m -serial mon:stdio
> ...
> [    7.890385] Trying to unpack rootfs image as initramfs...
> ...
> 
> I suppose it could be something with Kconfig too, here is the actual one
> that olddefconfig produces for me:
> 
> https://gist.github.com/nathanchance/3e4c1721ac204bbb969e2f288e1695c9

Ahh, I think this was the problem, after downloading this, the kernel is taking
much longer to compile and eventually giving me an assembler error:

arch/s390/kernel/mcount.S: Assembler messages:
arch/s390/kernel/mcount.S:140: Error: Unrecognized opcode: `aghik'
make[4]: *** [scripts/Makefile.build:360: arch/s390/kernel/mcount.o] Error 1
make[4]: *** Waiting for unfinished jobs....
make[3]: *** [scripts/Makefile.build:480: arch/s390/kernel] Error 2
make[2]: *** [scripts/Makefile.build:480: arch/s390] Error 2
make[2]: *** Waiting for unfinished jobs....
make[1]: *** [/data_nvme0n1/ryarob01/granule_perf/linux/Makefile:2035: .] Error 2
make: *** [Makefile:234: __sub-make] Error 2

The assembler that comes with the kernel.org toolchain is from binutils 2.40.
Perhaps that's too old for this instruction? It's inside an "#ifdef
CONFIG_FUNCTION_GRAPH_TRACER" block, so I'm guessing whatever config I was
previously using didn't have that enabled. I'll try to disable some configs to
workaround. What assembler are you using?



> 
> Cheers,
> Nathan
Nathan Chancellor July 26, 2023, 9:23 p.m. UTC | #7
On Wed, Jul 26, 2023 at 10:17:18PM +0100, Ryan Roberts wrote:
> On 26/07/2023 20:50, Nathan Chancellor wrote:
> > On Wed, Jul 26, 2023 at 08:38:25PM +0100, Ryan Roberts wrote:
> >> On 26/07/2023 17:19, Nathan Chancellor wrote:
> >>> Hi Ryan,
> >>>
> >>> On Thu, Jul 20, 2023 at 12:29:55PM +0100, Ryan Roberts wrote:
> >>
> >> ...
> >>
> >>>>
> >>>
> >>> After this change in -next as commit 904d9713b3b0 ("mm: batch-zap large
> >>> anonymous folio PTE mappings"), I see the following splats several times
> >>> when booting Debian's s390x configuration (which I have mirrored at [1])
> >>> in QEMU (bisect log below):
> >>>
> >>> $ qemu-system-s390x \
> >>>     -display none \
> >>>     -nodefaults \
> >>>     -M s390-ccw-virtio \
> >>>     -kernel arch/s390/boot/bzImage \
> >>>     -initrd rootfs.cpio \
> >>>     -m 512m \
> >>>     -serial mon:stdio
> >>
> >> I'm compiling the kernel for next-20230726 using the s390 cross compiler from kernel.org and the config you linked. Then booting with qemu-system-s390x (tried both distro's 4.2.1 and locally built 8.0.3) and the initrd you provided (tried passing it compressed and uncompressed), but I'm always getting a kernel panic due to not finding a rootfs:
> >>
> >> $ qemu-system-s390x \
> >> 	-display none \
> >> 	-nodefaults \
> >> 	-M s390-ccw-virtio \
> >> 	-kernel arch/s390/boot/bzImage \
> >> 	-initrd ../s390-rootfs.cpio.zst \
> >> 	-m 512m \
> >> 	-serial mon:stdio
> >> KASLR disabled: CPU has no PRNG
> >> KASLR disabled: CPU has no PRNG
> >> Linux version 6.5.0-rc3-next-20230726 (ryarob01@e125769) (s390-linux-gcc (GCC) 13.1.0, GNU ld (GNU Binutils) 2.40) #1 SMP Wed Jul 26 19:56:26 BST 2023
> >> setup: Linux is running under KVM in 64-bit mode
> >> setup: The maximum memory size is 512MB
> >> setup: Relocating AMODE31 section of size 0x00003000
> >> cpu: 1 configured CPUs, 0 standby CPUs
> >> Write protected kernel read-only data: 4036k
> >> Zone ranges:
> >>   DMA      [mem 0x0000000000000000-0x000000007fffffff]
> >>   Normal   empty
> >> Movable zone start for each node
> >> Early memory node ranges
> >>   node   0: [mem 0x0000000000000000-0x000000001fffffff]
> >> Initmem setup node 0 [mem 0x0000000000000000-0x000000001fffffff]
> >> percpu: Embedded 14 pages/cpu s26368 r0 d30976 u57344
> >> Kernel command line: 
> >> random: crng init done
> >> Dentry cache hash table entries: 65536 (order: 7, 524288 bytes, linear)
> >> Inode-cache hash table entries: 32768 (order: 6, 262144 bytes, linear)
> >> Built 1 zonelists, mobility grouping on.  Total pages: 129024
> >> mem auto-init: stack:all(zero), heap alloc:off, heap free:off
> >> Memory: 507720K/524288K available (3464K kernel code, 788K rwdata, 572K rodata, 796K init, 400K bss, 16568K reserved, 0K cma-reserved)
> >> SLUB: HWalign=256, Order=0-3, MinObjects=0, CPUs=1, Nodes=1
> >> rcu: Hierarchical RCU implementation.
> >> rcu:    RCU restricting CPUs from NR_CPUS=64 to nr_cpu_ids=1.
> >> rcu: RCU calculated value of scheduler-enlistment delay is 25 jiffies.
> >> rcu: Adjusting geometry for rcu_fanout_leaf=16, nr_cpu_ids=1
> >> NR_IRQS: 3, nr_irqs: 3, preallocated irqs: 3
> >> rcu: srcu_init: Setting srcu_struct sizes based on contention.
> >> clocksource: tod: mask: 0xffffffffffffffff max_cycles: 0x3b0a9be803b0a9, max_idle_ns: 1805497147909793 ns
> >> Console: colour dummy device 80x25
> >> printk: console [ttysclp0] enabled
> >> pid_max: default: 32768 minimum: 301
> >> Mount-cache hash table entries: 1024 (order: 1, 8192 bytes, linear)
> >> Mountpoint-cache hash table entries: 1024 (order: 1, 8192 bytes, linear)
> >> rcu: Hierarchical SRCU implementation.
> >> rcu:    Max phase no-delay instances is 1000.
> >> smp: Bringing up secondary CPUs ...
> >> smp: Brought up 1 node, 1 CPU
> >> clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 7645041785100000 ns
> >> futex hash table entries: 256 (order: 4, 65536 bytes, linear)
> >> kvm-s390: SIE is not available
> >> hypfs: The hardware system does not support hypfs
> >> workingset: timestamp_bits=62 max_order=17 bucket_order=0
> >> io scheduler mq-deadline registered
> >> io scheduler kyber registered
> >> cio: Channel measurement facility initialized using format extended (mode autodetected)
> >> Discipline DIAG cannot be used without z/VM
> >> vmur: The z/VM virtual unit record device driver cannot be loaded without z/VM
> >> sclp_sd: Store Data request failed (eq=2, di=3, response=0x40f0, flags=0x00, status=0, rc=-5)
> >> List of all partitions:
> >> No filesystem could mount root, tried: 
> >>
> >> Kernel panic - not syncing: VFS: Unable to mount root fs on unknown-block(1,0)
> >> CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.5.0-rc3-next-20230726 #1
> >> Hardware name: QEMU 8561 QEMU (KVM/Linux)
> >> Call Trace:
> >>  [<0000000000432b22>] dump_stack_lvl+0x62/0x80 
> >>  [<0000000000158898>] panic+0x2f8/0x310 
> >>  [<00000000005b7a56>] mount_root_generic+0x276/0x3b8 
> >>  [<00000000005b7e46>] prepare_namespace+0x56/0x220 
> >>  [<00000000004593e8>] kernel_init+0x28/0x1c8 
> >>  [<000000000010217e>] __ret_from_fork+0x36/0x50 
> >>  [<000000000046143a>] ret_from_fork+0xa/0x30
> >>
> >>
> >> Any idea what I'm doing wrong here? Do I need to specify something on the kernel command line? (I've tried root=/dev/ram0, but get the same result).
> > 
> > Hmmm, interesting. The rootfs does need to be used decompressed (I think
> > the kernel does support zstd compressed initrds but we only compress
> > them to save space, not for running). Does the sha256sum sum match the
> > one I just tested?
> > 
> > $ curl -LSsO https://github.com/ClangBuiltLinux/boot-utils/releases/download/20230707-182910/s390-rootfs.cpio.zst
> > $ zstd -d s390-rootfs.cpio.zst
> > $ sha256sum s390-rootfs.cpio
> > 948fb3c2ad65e26aee8eb0a069f5c9e1ab2c59e4b4f62b63ead271e12a8479b4  s390-rootfs.cpio
> 
> Yes, this matches.
> 
> > $ qemu-system-s390x -display none -nodefaults -M s390-ccw-virtio -kernel arch/s390/boot/bzImage -initrd s390-rootfs.cpio -m 512m -serial mon:stdio
> > ...
> > [    7.890385] Trying to unpack rootfs image as initramfs...
> > ...
> > 
> > I suppose it could be something with Kconfig too, here is the actual one
> > that olddefconfig produces for me:
> > 
> > https://gist.github.com/nathanchance/3e4c1721ac204bbb969e2f288e1695c9
> 
> Ahh, I think this was the problem, after downloading this, the kernel is taking
> much longer to compile and eventually giving me an assembler error:
> 
> arch/s390/kernel/mcount.S: Assembler messages:
> arch/s390/kernel/mcount.S:140: Error: Unrecognized opcode: `aghik'
> make[4]: *** [scripts/Makefile.build:360: arch/s390/kernel/mcount.o] Error 1
> make[4]: *** Waiting for unfinished jobs....
> make[3]: *** [scripts/Makefile.build:480: arch/s390/kernel] Error 2
> make[2]: *** [scripts/Makefile.build:480: arch/s390] Error 2
> make[2]: *** Waiting for unfinished jobs....
> make[1]: *** [/data_nvme0n1/ryarob01/granule_perf/linux/Makefile:2035: .] Error 2
> make: *** [Makefile:234: __sub-make] Error 2
> 
> The assembler that comes with the kernel.org toolchain is from binutils 2.40.
> Perhaps that's too old for this instruction? It's inside an "#ifdef
> CONFIG_FUNCTION_GRAPH_TRACER" block, so I'm guessing whatever config I was
> previously using didn't have that enabled. I'll try to disable some configs to
> workaround. What assembler are you using?

Ah sorry about that, I forgot about that issue because I handled it
during my bisect and all the manual reproduction/verification I have
been doing in this thread has been against Andrew's -mm tree, which does
not have that problem. Apply this patch on top of next-20230726 and
everything should work...

https://lore.kernel.org/20230726061834.1300984-1-hca@linux.ibm.com/

Cheers,
Nathan
Ryan Roberts July 26, 2023, 9:28 p.m. UTC | #8
On 26/07/2023 22:23, Nathan Chancellor wrote:
> On Wed, Jul 26, 2023 at 10:17:18PM +0100, Ryan Roberts wrote:
>> On 26/07/2023 20:50, Nathan Chancellor wrote:
>>> On Wed, Jul 26, 2023 at 08:38:25PM +0100, Ryan Roberts wrote:
>>>> On 26/07/2023 17:19, Nathan Chancellor wrote:
>>>>> Hi Ryan,
>>>>>
>>>>> On Thu, Jul 20, 2023 at 12:29:55PM +0100, Ryan Roberts wrote:
>>>>
>>>> ...
>>>>
>>>>>>
>>>>>
>>>>> After this change in -next as commit 904d9713b3b0 ("mm: batch-zap large
>>>>> anonymous folio PTE mappings"), I see the following splats several times
>>>>> when booting Debian's s390x configuration (which I have mirrored at [1])
>>>>> in QEMU (bisect log below):
>>>>>
>>>>> $ qemu-system-s390x \
>>>>>     -display none \
>>>>>     -nodefaults \
>>>>>     -M s390-ccw-virtio \
>>>>>     -kernel arch/s390/boot/bzImage \
>>>>>     -initrd rootfs.cpio \
>>>>>     -m 512m \
>>>>>     -serial mon:stdio
>>>>
>>>> I'm compiling the kernel for next-20230726 using the s390 cross compiler from kernel.org and the config you linked. Then booting with qemu-system-s390x (tried both distro's 4.2.1 and locally built 8.0.3) and the initrd you provided (tried passing it compressed and uncompressed), but I'm always getting a kernel panic due to not finding a rootfs:
>>>>
>>>> $ qemu-system-s390x \
>>>> 	-display none \
>>>> 	-nodefaults \
>>>> 	-M s390-ccw-virtio \
>>>> 	-kernel arch/s390/boot/bzImage \
>>>> 	-initrd ../s390-rootfs.cpio.zst \
>>>> 	-m 512m \
>>>> 	-serial mon:stdio
>>>> KASLR disabled: CPU has no PRNG
>>>> KASLR disabled: CPU has no PRNG
>>>> Linux version 6.5.0-rc3-next-20230726 (ryarob01@e125769) (s390-linux-gcc (GCC) 13.1.0, GNU ld (GNU Binutils) 2.40) #1 SMP Wed Jul 26 19:56:26 BST 2023
>>>> setup: Linux is running under KVM in 64-bit mode
>>>> setup: The maximum memory size is 512MB
>>>> setup: Relocating AMODE31 section of size 0x00003000
>>>> cpu: 1 configured CPUs, 0 standby CPUs
>>>> Write protected kernel read-only data: 4036k
>>>> Zone ranges:
>>>>   DMA      [mem 0x0000000000000000-0x000000007fffffff]
>>>>   Normal   empty
>>>> Movable zone start for each node
>>>> Early memory node ranges
>>>>   node   0: [mem 0x0000000000000000-0x000000001fffffff]
>>>> Initmem setup node 0 [mem 0x0000000000000000-0x000000001fffffff]
>>>> percpu: Embedded 14 pages/cpu s26368 r0 d30976 u57344
>>>> Kernel command line: 
>>>> random: crng init done
>>>> Dentry cache hash table entries: 65536 (order: 7, 524288 bytes, linear)
>>>> Inode-cache hash table entries: 32768 (order: 6, 262144 bytes, linear)
>>>> Built 1 zonelists, mobility grouping on.  Total pages: 129024
>>>> mem auto-init: stack:all(zero), heap alloc:off, heap free:off
>>>> Memory: 507720K/524288K available (3464K kernel code, 788K rwdata, 572K rodata, 796K init, 400K bss, 16568K reserved, 0K cma-reserved)
>>>> SLUB: HWalign=256, Order=0-3, MinObjects=0, CPUs=1, Nodes=1
>>>> rcu: Hierarchical RCU implementation.
>>>> rcu:    RCU restricting CPUs from NR_CPUS=64 to nr_cpu_ids=1.
>>>> rcu: RCU calculated value of scheduler-enlistment delay is 25 jiffies.
>>>> rcu: Adjusting geometry for rcu_fanout_leaf=16, nr_cpu_ids=1
>>>> NR_IRQS: 3, nr_irqs: 3, preallocated irqs: 3
>>>> rcu: srcu_init: Setting srcu_struct sizes based on contention.
>>>> clocksource: tod: mask: 0xffffffffffffffff max_cycles: 0x3b0a9be803b0a9, max_idle_ns: 1805497147909793 ns
>>>> Console: colour dummy device 80x25
>>>> printk: console [ttysclp0] enabled
>>>> pid_max: default: 32768 minimum: 301
>>>> Mount-cache hash table entries: 1024 (order: 1, 8192 bytes, linear)
>>>> Mountpoint-cache hash table entries: 1024 (order: 1, 8192 bytes, linear)
>>>> rcu: Hierarchical SRCU implementation.
>>>> rcu:    Max phase no-delay instances is 1000.
>>>> smp: Bringing up secondary CPUs ...
>>>> smp: Brought up 1 node, 1 CPU
>>>> clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 7645041785100000 ns
>>>> futex hash table entries: 256 (order: 4, 65536 bytes, linear)
>>>> kvm-s390: SIE is not available
>>>> hypfs: The hardware system does not support hypfs
>>>> workingset: timestamp_bits=62 max_order=17 bucket_order=0
>>>> io scheduler mq-deadline registered
>>>> io scheduler kyber registered
>>>> cio: Channel measurement facility initialized using format extended (mode autodetected)
>>>> Discipline DIAG cannot be used without z/VM
>>>> vmur: The z/VM virtual unit record device driver cannot be loaded without z/VM
>>>> sclp_sd: Store Data request failed (eq=2, di=3, response=0x40f0, flags=0x00, status=0, rc=-5)
>>>> List of all partitions:
>>>> No filesystem could mount root, tried: 
>>>>
>>>> Kernel panic - not syncing: VFS: Unable to mount root fs on unknown-block(1,0)
>>>> CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.5.0-rc3-next-20230726 #1
>>>> Hardware name: QEMU 8561 QEMU (KVM/Linux)
>>>> Call Trace:
>>>>  [<0000000000432b22>] dump_stack_lvl+0x62/0x80 
>>>>  [<0000000000158898>] panic+0x2f8/0x310 
>>>>  [<00000000005b7a56>] mount_root_generic+0x276/0x3b8 
>>>>  [<00000000005b7e46>] prepare_namespace+0x56/0x220 
>>>>  [<00000000004593e8>] kernel_init+0x28/0x1c8 
>>>>  [<000000000010217e>] __ret_from_fork+0x36/0x50 
>>>>  [<000000000046143a>] ret_from_fork+0xa/0x30
>>>>
>>>>
>>>> Any idea what I'm doing wrong here? Do I need to specify something on the kernel command line? (I've tried root=/dev/ram0, but get the same result).
>>>
>>> Hmmm, interesting. The rootfs does need to be used decompressed (I think
>>> the kernel does support zstd compressed initrds but we only compress
>>> them to save space, not for running). Does the sha256sum sum match the
>>> one I just tested?
>>>
>>> $ curl -LSsO https://github.com/ClangBuiltLinux/boot-utils/releases/download/20230707-182910/s390-rootfs.cpio.zst
>>> $ zstd -d s390-rootfs.cpio.zst
>>> $ sha256sum s390-rootfs.cpio
>>> 948fb3c2ad65e26aee8eb0a069f5c9e1ab2c59e4b4f62b63ead271e12a8479b4  s390-rootfs.cpio
>>
>> Yes, this matches.
>>
>>> $ qemu-system-s390x -display none -nodefaults -M s390-ccw-virtio -kernel arch/s390/boot/bzImage -initrd s390-rootfs.cpio -m 512m -serial mon:stdio
>>> ...
>>> [    7.890385] Trying to unpack rootfs image as initramfs...
>>> ...
>>>
>>> I suppose it could be something with Kconfig too, here is the actual one
>>> that olddefconfig produces for me:
>>>
>>> https://gist.github.com/nathanchance/3e4c1721ac204bbb969e2f288e1695c9
>>
>> Ahh, I think this was the problem, after downloading this, the kernel is taking
>> much longer to compile and eventually giving me an assembler error:
>>
>> arch/s390/kernel/mcount.S: Assembler messages:
>> arch/s390/kernel/mcount.S:140: Error: Unrecognized opcode: `aghik'
>> make[4]: *** [scripts/Makefile.build:360: arch/s390/kernel/mcount.o] Error 1
>> make[4]: *** Waiting for unfinished jobs....
>> make[3]: *** [scripts/Makefile.build:480: arch/s390/kernel] Error 2
>> make[2]: *** [scripts/Makefile.build:480: arch/s390] Error 2
>> make[2]: *** Waiting for unfinished jobs....
>> make[1]: *** [/data_nvme0n1/ryarob01/granule_perf/linux/Makefile:2035: .] Error 2
>> make: *** [Makefile:234: __sub-make] Error 2
>>
>> The assembler that comes with the kernel.org toolchain is from binutils 2.40.
>> Perhaps that's too old for this instruction? It's inside an "#ifdef
>> CONFIG_FUNCTION_GRAPH_TRACER" block, so I'm guessing whatever config I was
>> previously using didn't have that enabled. I'll try to disable some configs to
>> workaround. What assembler are you using?
> 
> Ah sorry about that, I forgot about that issue because I handled it
> during my bisect and all the manual reproduction/verification I have
> been doing in this thread has been against Andrew's -mm tree, which does
> not have that problem. Apply this patch on top of next-20230726 and
> everything should work...
> 
> https://lore.kernel.org/20230726061834.1300984-1-hca@linux.ibm.com/

No problem - I just got it working by disabling FUNCTION_TRACER, which avoids
aghik, and then disabling generation of BTF (which was complaining that pahole
wasn't available).

Anyway, I'm up and running now - can repro what you are seeing. Although it's
late in the UK now, so will have to look at this tomorrow. Hopefully I can get
to the bottom of it in reasonable time.

> 
> Cheers,
> Nathan
diff mbox series

Patch

diff --git a/mm/memory.c b/mm/memory.c
index 01f39e8144ef..189b1cfd823d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1391,6 +1391,94 @@  zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
 	pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
 }
 
+static inline unsigned long page_cont_mapped_vaddr(struct page *page,
+				struct page *anchor, unsigned long anchor_vaddr)
+{
+	unsigned long offset;
+	unsigned long vaddr;
+
+	offset = (page_to_pfn(page) - page_to_pfn(anchor)) << PAGE_SHIFT;
+	vaddr = anchor_vaddr + offset;
+
+	if (anchor > page) {
+		if (vaddr > anchor_vaddr)
+			return 0;
+	} else {
+		if (vaddr < anchor_vaddr)
+			return ULONG_MAX;
+	}
+
+	return vaddr;
+}
+
+static int folio_nr_pages_cont_mapped(struct folio *folio,
+				      struct page *page, pte_t *pte,
+				      unsigned long addr, unsigned long end)
+{
+	pte_t ptent;
+	int floops;
+	int i;
+	unsigned long pfn;
+	struct page *folio_end;
+
+	if (!folio_test_large(folio))
+		return 1;
+
+	folio_end = &folio->page + folio_nr_pages(folio);
+	end = min(page_cont_mapped_vaddr(folio_end, page, addr), end);
+	floops = (end - addr) >> PAGE_SHIFT;
+	pfn = page_to_pfn(page);
+	pfn++;
+	pte++;
+
+	for (i = 1; i < floops; i++) {
+		ptent = ptep_get(pte);
+
+		if (!pte_present(ptent) || pte_pfn(ptent) != pfn)
+			break;
+
+		pfn++;
+		pte++;
+	}
+
+	return i;
+}
+
+static unsigned long try_zap_anon_pte_range(struct mmu_gather *tlb,
+					    struct vm_area_struct *vma,
+					    struct folio *folio,
+					    struct page *page, pte_t *pte,
+					    unsigned long addr, int nr_pages,
+					    struct zap_details *details)
+{
+	struct mm_struct *mm = tlb->mm;
+	pte_t ptent;
+	bool full;
+	int i;
+
+	for (i = 0; i < nr_pages;) {
+		ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
+		tlb_remove_tlb_entry(tlb, pte, addr);
+		zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
+		full = __tlb_remove_page(tlb, page, 0);
+
+		if (unlikely(page_mapcount(page) < 1))
+			print_bad_pte(vma, addr, ptent, page);
+
+		i++;
+		page++;
+		pte++;
+		addr += PAGE_SIZE;
+
+		if (unlikely(full))
+			break;
+	}
+
+	folio_remove_rmap_range(folio, page - i, i, vma);
+
+	return i;
+}
+
 static unsigned long zap_pte_range(struct mmu_gather *tlb,
 				struct vm_area_struct *vma, pmd_t *pmd,
 				unsigned long addr, unsigned long end,
@@ -1428,6 +1516,38 @@  static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			page = vm_normal_page(vma, addr, ptent);
 			if (unlikely(!should_zap_page(details, page)))
 				continue;
+
+			/*
+			 * Batch zap large anonymous folio mappings. This allows
+			 * batching the rmap removal, which means we avoid
+			 * spuriously adding a partially unmapped folio to the
+			 * deferred split queue in the common case, which
+			 * reduces split queue lock contention.
+			 */
+			if (page && PageAnon(page)) {
+				struct folio *folio = page_folio(page);
+				int nr_pages_req, nr_pages;
+
+				nr_pages_req = folio_nr_pages_cont_mapped(
+						folio, page, pte, addr, end);
+
+				nr_pages = try_zap_anon_pte_range(tlb, vma,
+						folio, page, pte, addr,
+						nr_pages_req, details);
+
+				rss[mm_counter(page)] -= nr_pages;
+				nr_pages--;
+				pte += nr_pages;
+				addr += nr_pages << PAGE_SHIFT;
+
+				if (unlikely(nr_pages < nr_pages_req)) {
+					force_flush = 1;
+					addr += PAGE_SIZE;
+					break;
+				}
+				continue;
+			}
+
 			ptent = ptep_get_and_clear_full(mm, addr, pte,
 							tlb->fullmm);
 			tlb_remove_tlb_entry(tlb, pte, addr);