diff mbox series

[v2,07/13] mm: Remove HUGETLB_PAGE_DTOR

Message ID 20230816151201.3655946-8-willy@infradead.org (mailing list archive)
State New
Headers show
Series Remove _folio_dtor and _folio_order | expand

Commit Message

Matthew Wilcox Aug. 16, 2023, 3:11 p.m. UTC
We can use a bit in page[1].flags to indicate that this folio belongs
to hugetlb instead of using a value in page[1].dtors.  That lets
folio_test_hugetlb() become an inline function like it should be.
We can also get rid of NULL_COMPOUND_DTOR.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 .../admin-guide/kdump/vmcoreinfo.rst          | 10 +---
 include/linux/mm.h                            |  4 --
 include/linux/page-flags.h                    | 43 ++++++++++++----
 kernel/crash_core.c                           |  2 +-
 mm/hugetlb.c                                  | 49 +++----------------
 mm/page_alloc.c                               |  2 +-
 6 files changed, 43 insertions(+), 67 deletions(-)

Comments

Sidhartha Kumar Aug. 16, 2023, 9:45 p.m. UTC | #1
On 8/16/23 8:11 AM, Matthew Wilcox (Oracle) wrote:
> We can use a bit in page[1].flags to indicate that this folio belongs
> to hugetlb instead of using a value in page[1].dtors.  That lets
> folio_test_hugetlb() become an inline function like it should be.
> We can also get rid of NULL_COMPOUND_DTOR.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>   .../admin-guide/kdump/vmcoreinfo.rst          | 10 +---
>   include/linux/mm.h                            |  4 --
>   include/linux/page-flags.h                    | 43 ++++++++++++----
>   kernel/crash_core.c                           |  2 +-
>   mm/hugetlb.c                                  | 49 +++----------------
>   mm/page_alloc.c                               |  2 +-
>   6 files changed, 43 insertions(+), 67 deletions(-)
> 
> diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
> index c18d94fa6470..baa1c355741d 100644
> --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst
> +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
> @@ -325,8 +325,8 @@ NR_FREE_PAGES
>   On linux-2.6.21 or later, the number of free pages is in
>   vm_stat[NR_FREE_PAGES]. Used to get the number of free pages.
>   
> -PG_lru|PG_private|PG_swapcache|PG_swapbacked|PG_slab|PG_hwpoision|PG_head_mask
> -------------------------------------------------------------------------------
> +PG_lru|PG_private|PG_swapcache|PG_swapbacked|PG_slab|PG_hwpoision|PG_head_mask|PG_hugetlb
> +-----------------------------------------------------------------------------------------
>   
>   Page attributes. These flags are used to filter various unnecessary for
>   dumping pages.
> @@ -338,12 +338,6 @@ More page attributes. These flags are used to filter various unnecessary for
>   dumping pages.
>   
>   
> -HUGETLB_PAGE_DTOR
> ------------------
> -
> -The HUGETLB_PAGE_DTOR flag denotes hugetlbfs pages. Makedumpfile
> -excludes these pages.
> -
>   x86_64
>   ======
>   
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 7b800d1298dc..642f5fe5860e 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1268,11 +1268,7 @@ void folio_copy(struct folio *dst, struct folio *src);
>   unsigned long nr_free_buffer_pages(void);
>   
>   enum compound_dtor_id {
> -	NULL_COMPOUND_DTOR,
>   	COMPOUND_PAGE_DTOR,
> -#ifdef CONFIG_HUGETLB_PAGE
> -	HUGETLB_PAGE_DTOR,
> -#endif
>   	TRANSHUGE_PAGE_DTOR,
>   	NR_COMPOUND_DTORS,
>   };
> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index 92a2063a0a23..aeecf0cf1456 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -171,15 +171,6 @@ enum pageflags {
>   	/* Remapped by swiotlb-xen. */
>   	PG_xen_remapped = PG_owner_priv_1,
>   
> -#ifdef CONFIG_MEMORY_FAILURE
> -	/*
> -	 * Compound pages. Stored in first tail page's flags.
> -	 * Indicates that at least one subpage is hwpoisoned in the
> -	 * THP.
> -	 */
> -	PG_has_hwpoisoned = PG_error,
> -#endif
> -
>   	/* non-lru isolated movable page */
>   	PG_isolated = PG_reclaim,
>   
> @@ -190,6 +181,15 @@ enum pageflags {
>   	/* For self-hosted memmap pages */
>   	PG_vmemmap_self_hosted = PG_owner_priv_1,
>   #endif
> +
> +	/*
> +	 * Flags only valid for compound pages.  Stored in first tail page's
> +	 * flags word.
> +	 */
> +
> +	/* At least one page in this folio has the hwpoison flag set */
> +	PG_has_hwpoisoned = PG_error,
> +	PG_hugetlb = PG_active,
>   };
>   
>   #define PAGEFLAGS_MASK		((1UL << NR_PAGEFLAGS) - 1)
> @@ -812,7 +812,23 @@ static inline void ClearPageCompound(struct page *page)
>   
>   #ifdef CONFIG_HUGETLB_PAGE
>   int PageHuge(struct page *page);
> -bool folio_test_hugetlb(struct folio *folio);
> +SETPAGEFLAG(HugeTLB, hugetlb, PF_SECOND)
> +CLEARPAGEFLAG(HugeTLB, hugetlb, PF_SECOND)
> +
> +/**
> + * folio_test_hugetlb - Determine if the folio belongs to hugetlbfs
> + * @folio: The folio to test.
> + *
> + * Context: Any context.  Caller should have a reference on the folio to
> + * prevent it from being turned into a tail page.
> + * Return: True for hugetlbfs folios, false for anon folios or folios
> + * belonging to other filesystems.
> + */
> +static inline bool folio_test_hugetlb(struct folio *folio)
> +{
> +	return folio_test_large(folio) &&
> +		test_bit(PG_hugetlb, folio_flags(folio, 1));
> +}
>   #else
>   TESTPAGEFLAG_FALSE(Huge, hugetlb)
>   #endif
> @@ -1040,6 +1056,13 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
>   #define PAGE_FLAGS_CHECK_AT_PREP	\
>   	((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)
>   
> +/*
> + * Flags stored in the second page of a compound page.  They may overlap
> + * the CHECK_AT_FREE flags above, so need to be cleared.
> + */
> +#define PAGE_FLAGS_SECOND						\
> +	(1UL << PG_has_hwpoisoned	| 1UL << PG_hugetlb)
> +
>   #define PAGE_FLAGS_PRIVATE				\
>   	(1UL << PG_private | 1UL << PG_private_2)
>   /**
> diff --git a/kernel/crash_core.c b/kernel/crash_core.c
> index 90ce1dfd591c..dd5f87047d06 100644
> --- a/kernel/crash_core.c
> +++ b/kernel/crash_core.c
> @@ -490,7 +490,7 @@ static int __init crash_save_vmcoreinfo_init(void)
>   #define PAGE_BUDDY_MAPCOUNT_VALUE	(~PG_buddy)
>   	VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
>   #ifdef CONFIG_HUGETLB_PAGE
> -	VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
> +	VMCOREINFO_NUMBER(PG_hugetlb);
>   #define PAGE_OFFLINE_MAPCOUNT_VALUE	(~PG_offline)
>   	VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
>   #endif
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 086eb51bf845..389490f100b0 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1585,25 +1585,7 @@ static inline void __clear_hugetlb_destructor(struct hstate *h,
>   {
>   	lockdep_assert_held(&hugetlb_lock);
>   
> -	/*
> -	 * Very subtle
> -	 *
> -	 * For non-gigantic pages set the destructor to the normal compound
> -	 * page dtor.  This is needed in case someone takes an additional
> -	 * temporary ref to the page, and freeing is delayed until they drop
> -	 * their reference.
> -	 *
> -	 * For gigantic pages set the destructor to the null dtor.  This
> -	 * destructor will never be called.  Before freeing the gigantic
> -	 * page destroy_compound_gigantic_folio will turn the folio into a
> -	 * simple group of pages.  After this the destructor does not
> -	 * apply.
> -	 *
> -	 */
> -	if (hstate_is_gigantic(h))
> -		folio_set_compound_dtor(folio, NULL_COMPOUND_DTOR);
> -	else
> -		folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR);
> +	folio_clear_hugetlb(folio);
>   }
>   
>   /*
> @@ -1690,7 +1672,7 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio,
>   		h->surplus_huge_pages_node[nid]++;
>   	}
>   
> -	folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR);
> +	folio_set_hugetlb(folio);
>   	folio_change_private(folio, NULL);
>   	/*
>   	 * We have to set hugetlb_vmemmap_optimized again as above
> @@ -1814,9 +1796,8 @@ static void free_hpage_workfn(struct work_struct *work)
>   		/*
>   		 * The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in
>   		 * folio_hstate() is going to trigger because a previous call to
> -		 * remove_hugetlb_folio() will call folio_set_compound_dtor
> -		 * (folio, NULL_COMPOUND_DTOR), so do not use folio_hstate()
> -		 * directly.
> +		 * remove_hugetlb_folio() will clear the hugetlb bit, so do
> +		 * not use folio_hstate() directly.
>   		 */
>   		h = size_to_hstate(page_size(page));
>   
> @@ -1955,7 +1936,7 @@ static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio)
>   {
>   	hugetlb_vmemmap_optimize(h, &folio->page);
>   	INIT_LIST_HEAD(&folio->lru);
> -	folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR);

Hi Matthew,

In __prep_compound_gigantic_folio() there is still the comment:
/* we rely on prep_new_hugetlb_folio to set the destructor */
should that be modified to reference the hugetlb flag?

Thanks,
Sid

> +	folio_set_hugetlb(folio);
>   	hugetlb_set_folio_subpool(folio, NULL);
>   	set_hugetlb_cgroup(folio, NULL);
>   	set_hugetlb_cgroup_rsvd(folio, NULL);
> @@ -2070,28 +2051,10 @@ int PageHuge(struct page *page)
>   	if (!PageCompound(page))
>   		return 0;
>   	folio = page_folio(page);
> -	return folio->_folio_dtor == HUGETLB_PAGE_DTOR;
> +	return folio_test_hugetlb(folio);
>   }
>   EXPORT_SYMBOL_GPL(PageHuge);
>   
> -/**
> - * folio_test_hugetlb - Determine if the folio belongs to hugetlbfs
> - * @folio: The folio to test.
> - *
> - * Context: Any context.  Caller should have a reference on the folio to
> - * prevent it from being turned into a tail page.
> - * Return: True for hugetlbfs folios, false for anon folios or folios
> - * belonging to other filesystems.
> - */
> -bool folio_test_hugetlb(struct folio *folio)
> -{
> -	if (!folio_test_large(folio))
> -		return false;
> -
> -	return folio->_folio_dtor == HUGETLB_PAGE_DTOR;
> -}
> -EXPORT_SYMBOL_GPL(folio_test_hugetlb);
> -
>   /*
>    * Find and lock address space (mapping) in write mode.
>    *
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 9638fdddf065..f8e276de4fd5 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1122,7 +1122,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
>   		VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
>   
>   		if (compound)
> -			ClearPageHasHWPoisoned(page);
> +			page[1].flags &= ~PAGE_FLAGS_SECOND;
>   		for (i = 1; i < (1 << order); i++) {
>   			if (compound)
>   				bad += free_tail_page_prepare(page, page + i);
Mike Kravetz Aug. 22, 2023, 3:13 a.m. UTC | #2
On 08/16/23 16:11, Matthew Wilcox (Oracle) wrote:
> We can use a bit in page[1].flags to indicate that this folio belongs
> to hugetlb instead of using a value in page[1].dtors.  That lets
> folio_test_hugetlb() become an inline function like it should be.
> We can also get rid of NULL_COMPOUND_DTOR.

Not 100% sure yet, but I suspect this patch/series causes the following
BUG in today's linux-next.  I can do a deeper dive tomorrow.

# echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
# echo 0 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages

[  352.631099] page:ffffea0007a30000 refcount:0 mapcount:0 mapping:0000000000000000 index:0x1 pfn:0x1e8c00
[  352.633674] head:ffffea0007a30000 order:8 entire_mapcount:0 nr_pages_mapped:0 pincount:0
[  352.636021] flags: 0x200000000000040(head|node=0|zone=2)
[  352.637424] page_type: 0xffffffff()
[  352.638619] raw: 0200000000000040 ffffc90003bb3d98 ffffc90003bb3d98 0000000000000000
[  352.640689] raw: 0000000000000001 0000000000000008 00000000ffffffff 0000000000000000
[  352.642882] page dumped because: VM_BUG_ON_PAGE(compound && compound_order(page) != order)
[  352.644671] ------------[ cut here ]------------
[  352.645746] kernel BUG at mm/page_alloc.c:1101!
[  352.647284] invalid opcode: 0000 [#1] PREEMPT SMP PTI
[  352.649286] CPU: 2 PID: 894 Comm: bash Not tainted 6.5.0-rc7-next-20230821-dirty #178
[  352.651245] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-1.fc37 04/01/2014
[  352.653349] RIP: 0010:free_unref_page_prepare+0x3c4/0x3e0
[  352.654731] Code: ff 0f 00 00 75 d4 48 8b 03 a8 40 74 cd 48 8b 43 48 a8 01 74 c5 48 8d 78 ff eb bf 48 c7 c6 10 90 22 82 48 89 df e8 fc e1 fc ff <0f> 0b 48 c7 c6 20 2e 22 82 e8 ee e1 fc ff 0f 0b 66 66 2e 0f 1f 84
[  352.660080] RSP: 0018:ffffc90003bb3d08 EFLAGS: 00010246
[  352.661457] RAX: 000000000000004e RBX: ffffea0007a30000 RCX: 0000000000000000
[  352.663119] RDX: 0000000000000000 RSI: ffffffff8224cc56 RDI: 00000000ffffffff
[  352.664697] RBP: 00000000001e8c00 R08: 0000000000009ffb R09: 00000000ffffdfff
[  352.666191] R10: 00000000ffffdfff R11: ffffffff824660c0 R12: 0000000000000009
[  352.667612] R13: 00000000001e8c00 R14: 0000000000000000 R15: 0000000000000000
[  352.669033] FS:  00007f18fc3a0740(0000) GS:ffff888477c00000(0000) knlGS:0000000000000000
[  352.670654] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  352.671678] CR2: 000055b1b1f16018 CR3: 0000000302e5a002 CR4: 0000000000370ee0
[  352.672936] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  352.674215] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  352.675399] Call Trace:
[  352.675907]  <TASK>
[  352.676400]  ? die+0x32/0x80
[  352.676974]  ? do_trap+0xd6/0x100
[  352.677691]  ? free_unref_page_prepare+0x3c4/0x3e0
[  352.678591]  ? do_error_trap+0x6a/0x90
[  352.679343]  ? free_unref_page_prepare+0x3c4/0x3e0
[  352.680245]  ? exc_invalid_op+0x49/0x60
[  352.680961]  ? free_unref_page_prepare+0x3c4/0x3e0
[  352.681851]  ? asm_exc_invalid_op+0x16/0x20
[  352.682607]  ? free_unref_page_prepare+0x3c4/0x3e0
[  352.683518]  ? free_unref_page_prepare+0x3c4/0x3e0
[  352.684387]  free_unref_page+0x34/0x160
[  352.685176]  ? _raw_spin_lock_irq+0x19/0x40
[  352.686641]  set_max_huge_pages+0x281/0x370
[  352.687621]  nr_hugepages_store_common+0x91/0xf0
[  352.688634]  kernfs_fop_write_iter+0x108/0x1f0
[  352.689619]  vfs_write+0x207/0x400
[  352.690423]  ksys_write+0x63/0xe0
[  352.691198]  do_syscall_64+0x37/0x90
[  352.691938]  entry_SYSCALL_64_after_hwframe+0x6e/0xd8
[  352.692847] RIP: 0033:0x7f18fc494e87
[  352.693481] Code: 0d 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24
[  352.696304] RSP: 002b:00007ffff7597318 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[  352.697539] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007f18fc494e87
[  352.698790] RDX: 0000000000000002 RSI: 000055b1b1ed3620 RDI: 0000000000000001
[  352.700238] RBP: 000055b1b1ed3620 R08: 000000000000000a R09: 00007f18fc52c0c0
[  352.701662] R10: 00007f18fc52bfc0 R11: 0000000000000246 R12: 0000000000000002
[  352.703112] R13: 00007f18fc568520 R14: 0000000000000002 R15: 00007f18fc568720
[  352.704547]  </TASK>
[  352.705101] Modules linked in: rfkill ip6table_filter ip6_tables sunrpc snd_hda_codec_generic snd_hda_intel snd_intel_dspcfg snd_hda_codec 9p snd_hwdep joydev snd_hda_core netfs snd_seq snd_seq_device snd_pcm snd_timer 9pnet_virtio snd soundcore virtio_balloon 9pnet virtio_net net_failover virtio_console virtio_blk failover crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel serio_raw virtio_pci virtio virtio_pci_legacy_dev virtio_pci_modern_dev virtio_ring fuse
[  352.713435] ---[ end trace 0000000000000000 ]---
[  352.714725] RIP: 0010:free_unref_page_prepare+0x3c4/0x3e0
[  352.717293] Code: ff 0f 00 00 75 d4 48 8b 03 a8 40 74 cd 48 8b 43 48 a8 01 74 c5 48 8d 78 ff eb bf 48 c7 c6 10 90 22 82 48 89 df e8 fc e1 fc ff <0f> 0b 48 c7 c6 20 2e 22 82 e8 ee e1 fc ff 0f 0b 66 66 2e 0f 1f 84
[  352.721235] RSP: 0018:ffffc90003bb3d08 EFLAGS: 00010246
[  352.722388] RAX: 000000000000004e RBX: ffffea0007a30000 RCX: 0000000000000000
[  352.723909] RDX: 0000000000000000 RSI: ffffffff8224cc56 RDI: 00000000ffffffff
[  352.725408] RBP: 00000000001e8c00 R08: 0000000000009ffb R09: 00000000ffffdfff
[  352.726875] R10: 00000000ffffdfff R11: ffffffff824660c0 R12: 0000000000000009
[  352.728352] R13: 00000000001e8c00 R14: 0000000000000000 R15: 0000000000000000
[  352.729821] FS:  00007f18fc3a0740(0000) GS:ffff888477c00000(0000) knlGS:0000000000000000
[  352.731511] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  352.732723] CR2: 000055b1b1f16018 CR3: 0000000302e5a002 CR4: 0000000000370ee0
[  352.734199] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  352.735681] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Matthew Wilcox Aug. 22, 2023, 3:32 a.m. UTC | #3
On Mon, Aug 21, 2023 at 08:13:00PM -0700, Mike Kravetz wrote:
> On 08/16/23 16:11, Matthew Wilcox (Oracle) wrote:
> > We can use a bit in page[1].flags to indicate that this folio belongs
> > to hugetlb instead of using a value in page[1].dtors.  That lets
> > folio_test_hugetlb() become an inline function like it should be.
> > We can also get rid of NULL_COMPOUND_DTOR.
> 
> Not 100% sure yet, but I suspect this patch/series causes the following
> BUG in today's linux-next.  I can do a deeper dive tomorrow.
> 
> # echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
> # echo 0 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
> 
> [  352.631099] page:ffffea0007a30000 refcount:0 mapcount:0 mapping:0000000000000000 index:0x1 pfn:0x1e8c00
> [  352.633674] head:ffffea0007a30000 order:8 entire_mapcount:0 nr_pages_mapped:0 pincount:0

order 8?  Well, that's exciting.  This is clearly x86, so it should be
order 9.  Did we mistakenly clear bit 0 of tail->flags?

Oh.  Oh yes, we do.

__update_and_free_hugetlb_folio:

        for (i = 0; i < pages_per_huge_page(h); i++) {
                subpage = folio_page(folio, i);
                subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
                                1 << PG_referenced | 1 << PG_dirty |
                                1 << PG_active | 1 << PG_private |
                                1 << PG_writeback);
        }

locked		PF_NO_TAIL
error		PF_NO_TAIL
referenced	PF_HEAD
dirty		PF_HEAD
active		PF_HEAD
private		PF_ANY
writeback	PF_NO_TAIL

So it seems to me there's no way to set any of these bits other than
PG_private.  And, well, you control PG_private in hugetlbfs.  Do you
ever set it on tail pages?

I think you can remove this entire loop and be happy?
Mike Kravetz Aug. 22, 2023, 5:19 p.m. UTC | #4
On 08/22/23 04:32, Matthew Wilcox wrote:
> On Mon, Aug 21, 2023 at 08:13:00PM -0700, Mike Kravetz wrote:
> > On 08/16/23 16:11, Matthew Wilcox (Oracle) wrote:
> > > We can use a bit in page[1].flags to indicate that this folio belongs
> > > to hugetlb instead of using a value in page[1].dtors.  That lets
> > > folio_test_hugetlb() become an inline function like it should be.
> > > We can also get rid of NULL_COMPOUND_DTOR.
> > 
> > Not 100% sure yet, but I suspect this patch/series causes the following
> > BUG in today's linux-next.  I can do a deeper dive tomorrow.
> > 
> > # echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
> > # echo 0 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
> > 
> > [  352.631099] page:ffffea0007a30000 refcount:0 mapcount:0 mapping:0000000000000000 index:0x1 pfn:0x1e8c00
> > [  352.633674] head:ffffea0007a30000 order:8 entire_mapcount:0 nr_pages_mapped:0 pincount:0
> 
> order 8?  Well, that's exciting.  This is clearly x86, so it should be
> order 9.  Did we mistakenly clear bit 0 of tail->flags?
> 
> Oh.  Oh yes, we do.
> 
> __update_and_free_hugetlb_folio:
> 
>         for (i = 0; i < pages_per_huge_page(h); i++) {
>                 subpage = folio_page(folio, i);
>                 subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
>                                 1 << PG_referenced | 1 << PG_dirty |
>                                 1 << PG_active | 1 << PG_private |
>                                 1 << PG_writeback);
>         }
> 
> locked		PF_NO_TAIL
> error		PF_NO_TAIL
> referenced	PF_HEAD
> dirty		PF_HEAD
> active		PF_HEAD
> private		PF_ANY
> writeback	PF_NO_TAIL
> 
> So it seems to me there's no way to set any of these bits other than
> PG_private.  And, well, you control PG_private in hugetlbfs.  Do you
> ever set it on tail pages?

Nope

> 
> I think you can remove this entire loop and be happy?

I believe you are correct.  Although, this is clearing flags in the head
page as well as tail pages.  So, I think we still need to clear referenced,
dirty and active as well as private on the head page.

Will take a look today.
Vlastimil Babka Dec. 8, 2023, 5:54 p.m. UTC | #5
On 8/16/23 17:11, Matthew Wilcox (Oracle) wrote:
> We can use a bit in page[1].flags to indicate that this folio belongs
> to hugetlb instead of using a value in page[1].dtors.  That lets
> folio_test_hugetlb() become an inline function like it should be.
> We can also get rid of NULL_COMPOUND_DTOR.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>

I think this (as commit 9c5ccf2db04b ("mm: remove HUGETLB_PAGE_DTOR")) is
causing the bug reported by Luis here:
https://bugzilla.kernel.org/show_bug.cgi?id=218227

> page:000000009006bf10 refcount:0 mapcount:-128 mapping:0000000000000000 index:0x3f8a0 pfn:0x1035c0
> flags: 0x17fffc000000000(node=0|zone=2|lastcpupid=0x1ffff)
> page_type: 0xffffff7f(buddy)
> raw: 017fffc000000000 ffffe704c422f808 ffffe704c41ac008 0000000000000000
> raw: 000000000003f8a0 0000000000000005 00000000ffffff7f 0000000000000000
> page dumped because: VM_BUG_ON_PAGE(n > 0 && !((__builtin_constant_p(PG_head) && __builtin_constant_p((uintptr_t)(&page->flags) != (uintptr_t)((vo>
> ------------[ cut here ]------------
> kernel BUG at include/linux/page-flags.h:314!
> invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
> CPU: 6 PID: 2435641 Comm: md5sum Not tainted 6.6.0-rc5 #2
> Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
> RIP: 0010:folio_flags+0x65/0x70
> Code: a8 40 74 de 48 8b 47 48 a8 01 74 d6 48 83 e8 01 48 39 c7 75 bd eb cb 48 8b 07 a8 40 75 c8 48 c7 c6 d8 a7 c3 89 e8 3b c7 fa ff <0f> 0b 66 0f >
> RSP: 0018:ffffad51c0bfb7a8 EFLAGS: 00010246
> RAX: 000000000000015f RBX: ffffe704c40d7000 RCX: 0000000000000000
> RDX: 0000000000000000 RSI: ffffffff89be8040 RDI: 00000000ffffffff
> RBP: 0000000000103600 R08: 0000000000000000 R09: ffffad51c0bfb658
> R10: 0000000000000003 R11: ffffffff89eacb08 R12: 0000000000000035
> R13: ffffe704c40d7000 R14: 0000000000000000 R15: ffffad51c0bfb930
> FS:  00007f350c51b740(0000) GS:ffff9b62fbd80000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 0000555860919508 CR3: 00000001217fe002 CR4: 0000000000770ee0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> PKRU: 55555554
> Call Trace:
>  <TASK>
>  ? die+0x32/0x80
>  ? do_trap+0xd6/0x100
>  ? folio_flags+0x65/0x70
>  ? do_error_trap+0x6a/0x90
>  ? folio_flags+0x65/0x70
>  ? exc_invalid_op+0x4c/0x60
>  ? folio_flags+0x65/0x70
>  ? asm_exc_invalid_op+0x16/0x20
>  ? folio_flags+0x65/0x70
>  ? folio_flags+0x65/0x70
>  PageHuge+0x67/0x80
>  isolate_migratepages_block+0x1c5/0x13b0
>  ? __pv_queued_spin_lock_slowpath+0x16c/0x370
>  compact_zone+0x746/0xfc0
>  compact_zone_order+0xbb/0x100
>  try_to_compact_pages+0xf0/0x2f0
>  __alloc_pages_direct_compact+0x78/0x210
>  __alloc_pages_slowpath.constprop.0+0xac1/0xdb0
>  ? prepare_alloc_pages.constprop.0+0xff/0x1b0
>  __alloc_pages+0x218/0x240
>  folio_alloc+0x17/0x50
>  page_cache_ra_order+0x15a/0x340
>  filemap_get_pages+0x136/0x6c0
>  ? update_load_avg+0x7e/0x780
>  ? current_time+0x2b/0xd0
>  filemap_read+0xce/0x340
>  ? do_sched_setscheduler+0x111/0x1b0
>  ? nohz_balance_exit_idle+0x16/0xc0
>  ? trigger_load_balance+0x302/0x370
>  ? preempt_count_add+0x47/0xa0
>  xfs_file_buffered_read+0x52/0xd0 [xfs]
>  xfs_file_read_iter+0x73/0xe0 [xfs]
>  vfs_read+0x1b1/0x300
>  ksys_read+0x63/0xe0
>  do_syscall_64+0x38/0x90
>  entry_SYSCALL_64_after_hwframe+0x6e/0xd8
> RIP: 0033:0x7f350c615a5d
> Code: 31 c0 e9 c6 fe ff ff 50 48 8d 3d a6 60 0a 00 e8 99 08 02 00 66 0f 1f 84 00 00 00 00 00 80 3d 81 3b 0e 00 00 74 17 31 c0 0f 05 <48> 3d 00 f0 >
> RSP: 002b:00007ffca3ef5108 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
> RAX: ffffffffffffffda RBX: 000055ff140712f0 RCX: 00007f350c615a5d
> RDX: 0000000000008000 RSI: 000055ff140714d0 RDI: 0000000000000003
> RBP: 00007f350c6ee600 R08: 0000000000000900 R09: 000000000b9bc05b
> R10: 000055ff140794d0 R11: 0000000000000246 R12: 000055ff140714d0
> R13: 0000000000008000 R14: 0000000000000a68 R15: 00007f350c6edd00
>  </TASK>
> Modules linked in: dm_zero dm_thin_pool dm_persistent_data dm_bio_prison sd_mod sg scsi_mod scsi_common dm_snapshot dm_bufio dm_flakey xfs sunrpc >
> ---[ end trace 0000000000000000 ]---

It's because PageHuge() now does folio_test_hugetlb() which is documented to
assume caller holds a reference, but the test in
isolate_migratepages_block() doesn't. The code is there from 2021 by Oscar,
perhaps it could be changed to take a reference (and e.g. only test
PageHead() before), but it will be a bit involved as the
isolate_or_dissolve_huge_page() it calls has some logic based on the
refcount being zero/non-zero as well. Oscar, what do you think?
Also I wonder if any of the the other existing PageHuge() callers are also
affected because they might be doing so without a reference.

(keeping the rest of patch for new Cc's)

> ---
>  .../admin-guide/kdump/vmcoreinfo.rst          | 10 +---
>  include/linux/mm.h                            |  4 --
>  include/linux/page-flags.h                    | 43 ++++++++++++----
>  kernel/crash_core.c                           |  2 +-
>  mm/hugetlb.c                                  | 49 +++----------------
>  mm/page_alloc.c                               |  2 +-
>  6 files changed, 43 insertions(+), 67 deletions(-)
> 
> diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
> index c18d94fa6470..baa1c355741d 100644
> --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst
> +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
> @@ -325,8 +325,8 @@ NR_FREE_PAGES
>  On linux-2.6.21 or later, the number of free pages is in
>  vm_stat[NR_FREE_PAGES]. Used to get the number of free pages.
>  
> -PG_lru|PG_private|PG_swapcache|PG_swapbacked|PG_slab|PG_hwpoision|PG_head_mask
> -------------------------------------------------------------------------------
> +PG_lru|PG_private|PG_swapcache|PG_swapbacked|PG_slab|PG_hwpoision|PG_head_mask|PG_hugetlb
> +-----------------------------------------------------------------------------------------
>  
>  Page attributes. These flags are used to filter various unnecessary for
>  dumping pages.
> @@ -338,12 +338,6 @@ More page attributes. These flags are used to filter various unnecessary for
>  dumping pages.
>  
>  
> -HUGETLB_PAGE_DTOR
> ------------------
> -
> -The HUGETLB_PAGE_DTOR flag denotes hugetlbfs pages. Makedumpfile
> -excludes these pages.
> -
>  x86_64
>  ======
>  
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 7b800d1298dc..642f5fe5860e 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1268,11 +1268,7 @@ void folio_copy(struct folio *dst, struct folio *src);
>  unsigned long nr_free_buffer_pages(void);
>  
>  enum compound_dtor_id {
> -	NULL_COMPOUND_DTOR,
>  	COMPOUND_PAGE_DTOR,
> -#ifdef CONFIG_HUGETLB_PAGE
> -	HUGETLB_PAGE_DTOR,
> -#endif
>  	TRANSHUGE_PAGE_DTOR,
>  	NR_COMPOUND_DTORS,
>  };
> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index 92a2063a0a23..aeecf0cf1456 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -171,15 +171,6 @@ enum pageflags {
>  	/* Remapped by swiotlb-xen. */
>  	PG_xen_remapped = PG_owner_priv_1,
>  
> -#ifdef CONFIG_MEMORY_FAILURE
> -	/*
> -	 * Compound pages. Stored in first tail page's flags.
> -	 * Indicates that at least one subpage is hwpoisoned in the
> -	 * THP.
> -	 */
> -	PG_has_hwpoisoned = PG_error,
> -#endif
> -
>  	/* non-lru isolated movable page */
>  	PG_isolated = PG_reclaim,
>  
> @@ -190,6 +181,15 @@ enum pageflags {
>  	/* For self-hosted memmap pages */
>  	PG_vmemmap_self_hosted = PG_owner_priv_1,
>  #endif
> +
> +	/*
> +	 * Flags only valid for compound pages.  Stored in first tail page's
> +	 * flags word.
> +	 */
> +
> +	/* At least one page in this folio has the hwpoison flag set */
> +	PG_has_hwpoisoned = PG_error,
> +	PG_hugetlb = PG_active,
>  };
>  
>  #define PAGEFLAGS_MASK		((1UL << NR_PAGEFLAGS) - 1)
> @@ -812,7 +812,23 @@ static inline void ClearPageCompound(struct page *page)
>  
>  #ifdef CONFIG_HUGETLB_PAGE
>  int PageHuge(struct page *page);
> -bool folio_test_hugetlb(struct folio *folio);
> +SETPAGEFLAG(HugeTLB, hugetlb, PF_SECOND)
> +CLEARPAGEFLAG(HugeTLB, hugetlb, PF_SECOND)
> +
> +/**
> + * folio_test_hugetlb - Determine if the folio belongs to hugetlbfs
> + * @folio: The folio to test.
> + *
> + * Context: Any context.  Caller should have a reference on the folio to
> + * prevent it from being turned into a tail page.
> + * Return: True for hugetlbfs folios, false for anon folios or folios
> + * belonging to other filesystems.
> + */
> +static inline bool folio_test_hugetlb(struct folio *folio)
> +{
> +	return folio_test_large(folio) &&
> +		test_bit(PG_hugetlb, folio_flags(folio, 1));
> +}
>  #else
>  TESTPAGEFLAG_FALSE(Huge, hugetlb)
>  #endif
> @@ -1040,6 +1056,13 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
>  #define PAGE_FLAGS_CHECK_AT_PREP	\
>  	((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)
>  
> +/*
> + * Flags stored in the second page of a compound page.  They may overlap
> + * the CHECK_AT_FREE flags above, so need to be cleared.
> + */
> +#define PAGE_FLAGS_SECOND						\
> +	(1UL << PG_has_hwpoisoned	| 1UL << PG_hugetlb)
> +
>  #define PAGE_FLAGS_PRIVATE				\
>  	(1UL << PG_private | 1UL << PG_private_2)
>  /**
> diff --git a/kernel/crash_core.c b/kernel/crash_core.c
> index 90ce1dfd591c..dd5f87047d06 100644
> --- a/kernel/crash_core.c
> +++ b/kernel/crash_core.c
> @@ -490,7 +490,7 @@ static int __init crash_save_vmcoreinfo_init(void)
>  #define PAGE_BUDDY_MAPCOUNT_VALUE	(~PG_buddy)
>  	VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
>  #ifdef CONFIG_HUGETLB_PAGE
> -	VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
> +	VMCOREINFO_NUMBER(PG_hugetlb);
>  #define PAGE_OFFLINE_MAPCOUNT_VALUE	(~PG_offline)
>  	VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
>  #endif
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 086eb51bf845..389490f100b0 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1585,25 +1585,7 @@ static inline void __clear_hugetlb_destructor(struct hstate *h,
>  {
>  	lockdep_assert_held(&hugetlb_lock);
>  
> -	/*
> -	 * Very subtle
> -	 *
> -	 * For non-gigantic pages set the destructor to the normal compound
> -	 * page dtor.  This is needed in case someone takes an additional
> -	 * temporary ref to the page, and freeing is delayed until they drop
> -	 * their reference.
> -	 *
> -	 * For gigantic pages set the destructor to the null dtor.  This
> -	 * destructor will never be called.  Before freeing the gigantic
> -	 * page destroy_compound_gigantic_folio will turn the folio into a
> -	 * simple group of pages.  After this the destructor does not
> -	 * apply.
> -	 *
> -	 */
> -	if (hstate_is_gigantic(h))
> -		folio_set_compound_dtor(folio, NULL_COMPOUND_DTOR);
> -	else
> -		folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR);
> +	folio_clear_hugetlb(folio);
>  }
>  
>  /*
> @@ -1690,7 +1672,7 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio,
>  		h->surplus_huge_pages_node[nid]++;
>  	}
>  
> -	folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR);
> +	folio_set_hugetlb(folio);
>  	folio_change_private(folio, NULL);
>  	/*
>  	 * We have to set hugetlb_vmemmap_optimized again as above
> @@ -1814,9 +1796,8 @@ static void free_hpage_workfn(struct work_struct *work)
>  		/*
>  		 * The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in
>  		 * folio_hstate() is going to trigger because a previous call to
> -		 * remove_hugetlb_folio() will call folio_set_compound_dtor
> -		 * (folio, NULL_COMPOUND_DTOR), so do not use folio_hstate()
> -		 * directly.
> +		 * remove_hugetlb_folio() will clear the hugetlb bit, so do
> +		 * not use folio_hstate() directly.
>  		 */
>  		h = size_to_hstate(page_size(page));
>  
> @@ -1955,7 +1936,7 @@ static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio)
>  {
>  	hugetlb_vmemmap_optimize(h, &folio->page);
>  	INIT_LIST_HEAD(&folio->lru);
> -	folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR);
> +	folio_set_hugetlb(folio);
>  	hugetlb_set_folio_subpool(folio, NULL);
>  	set_hugetlb_cgroup(folio, NULL);
>  	set_hugetlb_cgroup_rsvd(folio, NULL);
> @@ -2070,28 +2051,10 @@ int PageHuge(struct page *page)
>  	if (!PageCompound(page))
>  		return 0;
>  	folio = page_folio(page);
> -	return folio->_folio_dtor == HUGETLB_PAGE_DTOR;
> +	return folio_test_hugetlb(folio);
>  }
>  EXPORT_SYMBOL_GPL(PageHuge);
>  
> -/**
> - * folio_test_hugetlb - Determine if the folio belongs to hugetlbfs
> - * @folio: The folio to test.
> - *
> - * Context: Any context.  Caller should have a reference on the folio to
> - * prevent it from being turned into a tail page.
> - * Return: True for hugetlbfs folios, false for anon folios or folios
> - * belonging to other filesystems.
> - */
> -bool folio_test_hugetlb(struct folio *folio)
> -{
> -	if (!folio_test_large(folio))
> -		return false;
> -
> -	return folio->_folio_dtor == HUGETLB_PAGE_DTOR;
> -}
> -EXPORT_SYMBOL_GPL(folio_test_hugetlb);
> -
>  /*
>   * Find and lock address space (mapping) in write mode.
>   *
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 9638fdddf065..f8e276de4fd5 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1122,7 +1122,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
>  		VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
>  
>  		if (compound)
> -			ClearPageHasHWPoisoned(page);
> +			page[1].flags &= ~PAGE_FLAGS_SECOND;
>  		for (i = 1; i < (1 << order); i++) {
>  			if (compound)
>  				bad += free_tail_page_prepare(page, page + i);
Matthew Wilcox Dec. 8, 2023, 6:31 p.m. UTC | #6
On Fri, Dec 08, 2023 at 06:54:19PM +0100, Vlastimil Babka wrote:
> On 8/16/23 17:11, Matthew Wilcox (Oracle) wrote:
> > We can use a bit in page[1].flags to indicate that this folio belongs
> > to hugetlb instead of using a value in page[1].dtors.  That lets
> > folio_test_hugetlb() become an inline function like it should be.
> > We can also get rid of NULL_COMPOUND_DTOR.
> > 
> > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> 
> I think this (as commit 9c5ccf2db04b ("mm: remove HUGETLB_PAGE_DTOR")) is
> causing the bug reported by Luis here:
> https://bugzilla.kernel.org/show_bug.cgi?id=218227

Luis, please stop using bugzilla.  If you'd sent email like a normal
kernel developer, I'd've seen this bug earlier.

> > page:000000009006bf10 refcount:0 mapcount:-128 mapping:0000000000000000 index:0x3f8a0 pfn:0x1035c0
> > flags: 0x17fffc000000000(node=0|zone=2|lastcpupid=0x1ffff)
> > page_type: 0xffffff7f(buddy)
> > raw: 017fffc000000000 ffffe704c422f808 ffffe704c41ac008 0000000000000000
> > raw: 000000000003f8a0 0000000000000005 00000000ffffff7f 0000000000000000
> > page dumped because: VM_BUG_ON_PAGE(n > 0 && !((__builtin_constant_p(PG_head) && __builtin_constant_p((uintptr_t)(&page->flags) != (uintptr_t)((vo>
> > ------------[ cut here ]------------
> > kernel BUG at include/linux/page-flags.h:314!
> > invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
> > CPU: 6 PID: 2435641 Comm: md5sum Not tainted 6.6.0-rc5 #2
> > Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
> > RIP: 0010:folio_flags+0x65/0x70
> > Code: a8 40 74 de 48 8b 47 48 a8 01 74 d6 48 83 e8 01 48 39 c7 75 bd eb cb 48 8b 07 a8 40 75 c8 48 c7 c6 d8 a7 c3 89 e8 3b c7 fa ff <0f> 0b 66 0f >
> > RSP: 0018:ffffad51c0bfb7a8 EFLAGS: 00010246
> > RAX: 000000000000015f RBX: ffffe704c40d7000 RCX: 0000000000000000
> > RDX: 0000000000000000 RSI: ffffffff89be8040 RDI: 00000000ffffffff
> > RBP: 0000000000103600 R08: 0000000000000000 R09: ffffad51c0bfb658
> > R10: 0000000000000003 R11: ffffffff89eacb08 R12: 0000000000000035
> > R13: ffffe704c40d7000 R14: 0000000000000000 R15: ffffad51c0bfb930
> > FS:  00007f350c51b740(0000) GS:ffff9b62fbd80000(0000) knlGS:0000000000000000
> > CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > CR2: 0000555860919508 CR3: 00000001217fe002 CR4: 0000000000770ee0
> > DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> > DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> > PKRU: 55555554
> > Call Trace:
> >  <TASK>
> >  ? die+0x32/0x80
> >  ? do_trap+0xd6/0x100
> >  ? folio_flags+0x65/0x70
> >  ? do_error_trap+0x6a/0x90
> >  ? folio_flags+0x65/0x70
> >  ? exc_invalid_op+0x4c/0x60
> >  ? folio_flags+0x65/0x70
> >  ? asm_exc_invalid_op+0x16/0x20
> >  ? folio_flags+0x65/0x70
> >  ? folio_flags+0x65/0x70
> >  PageHuge+0x67/0x80
> >  isolate_migratepages_block+0x1c5/0x13b0
> >  compact_zone+0x746/0xfc0
> >  compact_zone_order+0xbb/0x100
> >  try_to_compact_pages+0xf0/0x2f0
> >  __alloc_pages_direct_compact+0x78/0x210
> >  __alloc_pages_slowpath.constprop.0+0xac1/0xdb0
> >  __alloc_pages+0x218/0x240
> >  folio_alloc+0x17/0x50
> 
> It's because PageHuge() now does folio_test_hugetlb() which is documented to
> assume caller holds a reference, but the test in
> isolate_migratepages_block() doesn't. The code is there from 2021 by Oscar,
> perhaps it could be changed to take a reference (and e.g. only test
> PageHead() before), but it will be a bit involved as the
> isolate_or_dissolve_huge_page() it calls has some logic based on the
> refcount being zero/non-zero as well. Oscar, what do you think?
> Also I wonder if any of the the other existing PageHuge() callers are also
> affected because they might be doing so without a reference.

I don't think the warning is actually wrong!  We're living very
dangerously here as PageHuge() could have returned a false positive
before this change [1].  Then we assume that compound_nr() returns a
consistent result (and compound_order() might, for example, return a
value larger than 63, leading to UB).

I think there's a place for a folio_test_hugetlb_unsafe(), but that
would only remove the warning, and do nothing to fix all the unsafe
usage.  The hugetlb handling code in isolate_migratepages_block()
doesn't seem to have any understanding that it's working with pages
that can change under it.  I can have a go at fixing it up, but maybe
Oscar would prefer to do it?

[1] terribly unlikely, but consider what happens if the page starts out
as a non-hugetlb compound allocation.  Then it is freed and reallocated;
the new owner of the second page could have put enough information
into it that tricked PageHuge() into returning true.  Then we call
isolate_or_dissolve_huge_page() which takes a lock, but doesn't take
a refcount.  Now it gets a bogus hstate and things go downhill from
there ...)
David Hildenbrand Dec. 13, 2023, 12:23 p.m. UTC | #7
On 08.12.23 19:31, Matthew Wilcox wrote:
> On Fri, Dec 08, 2023 at 06:54:19PM +0100, Vlastimil Babka wrote:
>> On 8/16/23 17:11, Matthew Wilcox (Oracle) wrote:
>>> We can use a bit in page[1].flags to indicate that this folio belongs
>>> to hugetlb instead of using a value in page[1].dtors.  That lets
>>> folio_test_hugetlb() become an inline function like it should be.
>>> We can also get rid of NULL_COMPOUND_DTOR.
>>>
>>> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
>>
>> I think this (as commit 9c5ccf2db04b ("mm: remove HUGETLB_PAGE_DTOR")) is
>> causing the bug reported by Luis here:
>> https://bugzilla.kernel.org/show_bug.cgi?id=218227
> 
> Luis, please stop using bugzilla.  If you'd sent email like a normal
> kernel developer, I'd've seen this bug earlier.
> 
>>> page:000000009006bf10 refcount:0 mapcount:-128 mapping:0000000000000000 index:0x3f8a0 pfn:0x1035c0
>>> flags: 0x17fffc000000000(node=0|zone=2|lastcpupid=0x1ffff)
>>> page_type: 0xffffff7f(buddy)
>>> raw: 017fffc000000000 ffffe704c422f808 ffffe704c41ac008 0000000000000000
>>> raw: 000000000003f8a0 0000000000000005 00000000ffffff7f 0000000000000000
>>> page dumped because: VM_BUG_ON_PAGE(n > 0 && !((__builtin_constant_p(PG_head) && __builtin_constant_p((uintptr_t)(&page->flags) != (uintptr_t)((vo>
>>> ------------[ cut here ]------------
>>> kernel BUG at include/linux/page-flags.h:314!
>>> invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
>>> CPU: 6 PID: 2435641 Comm: md5sum Not tainted 6.6.0-rc5 #2
>>> Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
>>> RIP: 0010:folio_flags+0x65/0x70
>>> Code: a8 40 74 de 48 8b 47 48 a8 01 74 d6 48 83 e8 01 48 39 c7 75 bd eb cb 48 8b 07 a8 40 75 c8 48 c7 c6 d8 a7 c3 89 e8 3b c7 fa ff <0f> 0b 66 0f >
>>> RSP: 0018:ffffad51c0bfb7a8 EFLAGS: 00010246
>>> RAX: 000000000000015f RBX: ffffe704c40d7000 RCX: 0000000000000000
>>> RDX: 0000000000000000 RSI: ffffffff89be8040 RDI: 00000000ffffffff
>>> RBP: 0000000000103600 R08: 0000000000000000 R09: ffffad51c0bfb658
>>> R10: 0000000000000003 R11: ffffffff89eacb08 R12: 0000000000000035
>>> R13: ffffe704c40d7000 R14: 0000000000000000 R15: ffffad51c0bfb930
>>> FS:  00007f350c51b740(0000) GS:ffff9b62fbd80000(0000) knlGS:0000000000000000
>>> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>> CR2: 0000555860919508 CR3: 00000001217fe002 CR4: 0000000000770ee0
>>> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>>> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>>> PKRU: 55555554
>>> Call Trace:
>>>   <TASK>
>>>   ? die+0x32/0x80
>>>   ? do_trap+0xd6/0x100
>>>   ? folio_flags+0x65/0x70
>>>   ? do_error_trap+0x6a/0x90
>>>   ? folio_flags+0x65/0x70
>>>   ? exc_invalid_op+0x4c/0x60
>>>   ? folio_flags+0x65/0x70
>>>   ? asm_exc_invalid_op+0x16/0x20
>>>   ? folio_flags+0x65/0x70
>>>   ? folio_flags+0x65/0x70
>>>   PageHuge+0x67/0x80
>>>   isolate_migratepages_block+0x1c5/0x13b0
>>>   compact_zone+0x746/0xfc0
>>>   compact_zone_order+0xbb/0x100
>>>   try_to_compact_pages+0xf0/0x2f0
>>>   __alloc_pages_direct_compact+0x78/0x210
>>>   __alloc_pages_slowpath.constprop.0+0xac1/0xdb0
>>>   __alloc_pages+0x218/0x240
>>>   folio_alloc+0x17/0x50
>>
>> It's because PageHuge() now does folio_test_hugetlb() which is documented to
>> assume caller holds a reference, but the test in
>> isolate_migratepages_block() doesn't. The code is there from 2021 by Oscar,
>> perhaps it could be changed to take a reference (and e.g. only test
>> PageHead() before), but it will be a bit involved as the
>> isolate_or_dissolve_huge_page() it calls has some logic based on the
>> refcount being zero/non-zero as well. Oscar, what do you think?
>> Also I wonder if any of the the other existing PageHuge() callers are also
>> affected because they might be doing so without a reference.
> 
> I don't think the warning is actually wrong!  We're living very
> dangerously here as PageHuge() could have returned a false positive
> before this change [1].  Then we assume that compound_nr() returns a
> consistent result (and compound_order() might, for example, return a
> value larger than 63, leading to UB).

All this code is racy (as spelled out in some of the code comments), and the
assumption is that races are tolerable. In the worst case, isolation fails on
races.

There is some code in there that sanitizes compound_order() return values :

		/*
		 * Regardless of being on LRU, compound pages such as THP and
		 * hugetlbfs are not to be compacted unless we are attempting
		 * an allocation much larger than the huge page size (eg CMA).
		 * We can potentially save a lot of iterations if we skip them
		 * at once. The check is racy, but we can consider only valid
		 * values and the only danger is skipping too much.
		 */
		if (PageCompound(page) && !cc->alloc_contig) {
			const unsigned int order = compound_order(page);

			if (likely(order <= MAX_ORDER)) {
				low_pfn += (1UL << order) - 1;
				nr_scanned += (1UL << order) - 1;
			}
			goto isolate_fail;
		}

At least isolate_or_dissolve_huge_page() looks like it wants to deal with
concurrent dissolving of hugetlb folios properly. See below, if it is
actually correct.

> 
> I think there's a place for a folio_test_hugetlb_unsafe(), but that
> would only remove the warning, and do nothing to fix all the unsafe
> usage.  The hugetlb handling code in isolate_migratepages_block()
> doesn't seem to have any understanding that it's working with pages
> that can change under it.  

Staring at the code (once again), I think we might miss to sanitize
the compound_nr() return value; but I'm not sure if that is really
problematic; We can get out-of-memory low_pfn either way when we're
operating at the end of memory, memory holes etc.

> I can have a go at fixing it up, but maybe
> Oscar would prefer to do it?
> 
> [1] terribly unlikely, but consider what happens if the page starts out
> as a non-hugetlb compound allocation.  Then it is freed and reallocated;
> the new owner of the second page could have put enough information
> into it that tricked PageHuge() into returning true.  Then we call

I think that got more likely by using a pageflag :)

> isolate_or_dissolve_huge_page() which takes a lock, but doesn't take
> a refcount.  Now it gets a bogus hstate and things go downhill from
> there ...)

hugetlb folios can have a refocunt of 0, in which case they are considered
"free". The recount handling at the end of isolate_or_dissolve_huge_page()
gives a hint that this is how hugetlb operates.

So grabbing references as you're used to from !hugetlb code doesn't work.

That function needs some care, to deal with what you described. Maybe
something like the following (assuming page_folio() is fine):


bool is_hugetlb_folio = false;


/*
  * Especially for free hugetlb folios, we cannot use the recount
  * to stabilize. While holding the hugetlb_lock, no hugetlb folios can
  * be dissolved.
  */
spin_lock_irq(&hugetlb_lock);
folio = page_folio(page);

if (folio_test_hugetlb_unsafe(folio)) {
	/*
          * We might have a false positive. Make sure the folio hasn't
          * changed in the meantime and still is a hugetlb folio.
	 */
	smp_rmb();
	if (folio == page_folio(page) &&
	    folio_test_hugetlb_unsafe(folio))
		is_hugetlb_folio = true;
}

if (is_hugetlb_folio)
	h = folio_hstate(folio);
spin_unlock_irq(&hugetlb_lock);
if (!is_hugetlb_folio)
	return 0;


But devil is in the detail (could someone trick us twice?).

isolate_hugetlb() performs similar lockless checks under
hugetlb_lock, and likely we should just unify them once we figure
out the right thing to do.
diff mbox series

Patch

diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
index c18d94fa6470..baa1c355741d 100644
--- a/Documentation/admin-guide/kdump/vmcoreinfo.rst
+++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
@@ -325,8 +325,8 @@  NR_FREE_PAGES
 On linux-2.6.21 or later, the number of free pages is in
 vm_stat[NR_FREE_PAGES]. Used to get the number of free pages.
 
-PG_lru|PG_private|PG_swapcache|PG_swapbacked|PG_slab|PG_hwpoision|PG_head_mask
-------------------------------------------------------------------------------
+PG_lru|PG_private|PG_swapcache|PG_swapbacked|PG_slab|PG_hwpoision|PG_head_mask|PG_hugetlb
+-----------------------------------------------------------------------------------------
 
 Page attributes. These flags are used to filter various unnecessary for
 dumping pages.
@@ -338,12 +338,6 @@  More page attributes. These flags are used to filter various unnecessary for
 dumping pages.
 
 
-HUGETLB_PAGE_DTOR
------------------
-
-The HUGETLB_PAGE_DTOR flag denotes hugetlbfs pages. Makedumpfile
-excludes these pages.
-
 x86_64
 ======
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7b800d1298dc..642f5fe5860e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1268,11 +1268,7 @@  void folio_copy(struct folio *dst, struct folio *src);
 unsigned long nr_free_buffer_pages(void);
 
 enum compound_dtor_id {
-	NULL_COMPOUND_DTOR,
 	COMPOUND_PAGE_DTOR,
-#ifdef CONFIG_HUGETLB_PAGE
-	HUGETLB_PAGE_DTOR,
-#endif
 	TRANSHUGE_PAGE_DTOR,
 	NR_COMPOUND_DTORS,
 };
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 92a2063a0a23..aeecf0cf1456 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -171,15 +171,6 @@  enum pageflags {
 	/* Remapped by swiotlb-xen. */
 	PG_xen_remapped = PG_owner_priv_1,
 
-#ifdef CONFIG_MEMORY_FAILURE
-	/*
-	 * Compound pages. Stored in first tail page's flags.
-	 * Indicates that at least one subpage is hwpoisoned in the
-	 * THP.
-	 */
-	PG_has_hwpoisoned = PG_error,
-#endif
-
 	/* non-lru isolated movable page */
 	PG_isolated = PG_reclaim,
 
@@ -190,6 +181,15 @@  enum pageflags {
 	/* For self-hosted memmap pages */
 	PG_vmemmap_self_hosted = PG_owner_priv_1,
 #endif
+
+	/*
+	 * Flags only valid for compound pages.  Stored in first tail page's
+	 * flags word.
+	 */
+
+	/* At least one page in this folio has the hwpoison flag set */
+	PG_has_hwpoisoned = PG_error,
+	PG_hugetlb = PG_active,
 };
 
 #define PAGEFLAGS_MASK		((1UL << NR_PAGEFLAGS) - 1)
@@ -812,7 +812,23 @@  static inline void ClearPageCompound(struct page *page)
 
 #ifdef CONFIG_HUGETLB_PAGE
 int PageHuge(struct page *page);
-bool folio_test_hugetlb(struct folio *folio);
+SETPAGEFLAG(HugeTLB, hugetlb, PF_SECOND)
+CLEARPAGEFLAG(HugeTLB, hugetlb, PF_SECOND)
+
+/**
+ * folio_test_hugetlb - Determine if the folio belongs to hugetlbfs
+ * @folio: The folio to test.
+ *
+ * Context: Any context.  Caller should have a reference on the folio to
+ * prevent it from being turned into a tail page.
+ * Return: True for hugetlbfs folios, false for anon folios or folios
+ * belonging to other filesystems.
+ */
+static inline bool folio_test_hugetlb(struct folio *folio)
+{
+	return folio_test_large(folio) &&
+		test_bit(PG_hugetlb, folio_flags(folio, 1));
+}
 #else
 TESTPAGEFLAG_FALSE(Huge, hugetlb)
 #endif
@@ -1040,6 +1056,13 @@  static __always_inline void __ClearPageAnonExclusive(struct page *page)
 #define PAGE_FLAGS_CHECK_AT_PREP	\
 	((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)
 
+/*
+ * Flags stored in the second page of a compound page.  They may overlap
+ * the CHECK_AT_FREE flags above, so need to be cleared.
+ */
+#define PAGE_FLAGS_SECOND						\
+	(1UL << PG_has_hwpoisoned	| 1UL << PG_hugetlb)
+
 #define PAGE_FLAGS_PRIVATE				\
 	(1UL << PG_private | 1UL << PG_private_2)
 /**
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 90ce1dfd591c..dd5f87047d06 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -490,7 +490,7 @@  static int __init crash_save_vmcoreinfo_init(void)
 #define PAGE_BUDDY_MAPCOUNT_VALUE	(~PG_buddy)
 	VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
 #ifdef CONFIG_HUGETLB_PAGE
-	VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
+	VMCOREINFO_NUMBER(PG_hugetlb);
 #define PAGE_OFFLINE_MAPCOUNT_VALUE	(~PG_offline)
 	VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
 #endif
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 086eb51bf845..389490f100b0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1585,25 +1585,7 @@  static inline void __clear_hugetlb_destructor(struct hstate *h,
 {
 	lockdep_assert_held(&hugetlb_lock);
 
-	/*
-	 * Very subtle
-	 *
-	 * For non-gigantic pages set the destructor to the normal compound
-	 * page dtor.  This is needed in case someone takes an additional
-	 * temporary ref to the page, and freeing is delayed until they drop
-	 * their reference.
-	 *
-	 * For gigantic pages set the destructor to the null dtor.  This
-	 * destructor will never be called.  Before freeing the gigantic
-	 * page destroy_compound_gigantic_folio will turn the folio into a
-	 * simple group of pages.  After this the destructor does not
-	 * apply.
-	 *
-	 */
-	if (hstate_is_gigantic(h))
-		folio_set_compound_dtor(folio, NULL_COMPOUND_DTOR);
-	else
-		folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR);
+	folio_clear_hugetlb(folio);
 }
 
 /*
@@ -1690,7 +1672,7 @@  static void add_hugetlb_folio(struct hstate *h, struct folio *folio,
 		h->surplus_huge_pages_node[nid]++;
 	}
 
-	folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR);
+	folio_set_hugetlb(folio);
 	folio_change_private(folio, NULL);
 	/*
 	 * We have to set hugetlb_vmemmap_optimized again as above
@@ -1814,9 +1796,8 @@  static void free_hpage_workfn(struct work_struct *work)
 		/*
 		 * The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in
 		 * folio_hstate() is going to trigger because a previous call to
-		 * remove_hugetlb_folio() will call folio_set_compound_dtor
-		 * (folio, NULL_COMPOUND_DTOR), so do not use folio_hstate()
-		 * directly.
+		 * remove_hugetlb_folio() will clear the hugetlb bit, so do
+		 * not use folio_hstate() directly.
 		 */
 		h = size_to_hstate(page_size(page));
 
@@ -1955,7 +1936,7 @@  static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio)
 {
 	hugetlb_vmemmap_optimize(h, &folio->page);
 	INIT_LIST_HEAD(&folio->lru);
-	folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR);
+	folio_set_hugetlb(folio);
 	hugetlb_set_folio_subpool(folio, NULL);
 	set_hugetlb_cgroup(folio, NULL);
 	set_hugetlb_cgroup_rsvd(folio, NULL);
@@ -2070,28 +2051,10 @@  int PageHuge(struct page *page)
 	if (!PageCompound(page))
 		return 0;
 	folio = page_folio(page);
-	return folio->_folio_dtor == HUGETLB_PAGE_DTOR;
+	return folio_test_hugetlb(folio);
 }
 EXPORT_SYMBOL_GPL(PageHuge);
 
-/**
- * folio_test_hugetlb - Determine if the folio belongs to hugetlbfs
- * @folio: The folio to test.
- *
- * Context: Any context.  Caller should have a reference on the folio to
- * prevent it from being turned into a tail page.
- * Return: True for hugetlbfs folios, false for anon folios or folios
- * belonging to other filesystems.
- */
-bool folio_test_hugetlb(struct folio *folio)
-{
-	if (!folio_test_large(folio))
-		return false;
-
-	return folio->_folio_dtor == HUGETLB_PAGE_DTOR;
-}
-EXPORT_SYMBOL_GPL(folio_test_hugetlb);
-
 /*
  * Find and lock address space (mapping) in write mode.
  *
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9638fdddf065..f8e276de4fd5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1122,7 +1122,7 @@  static __always_inline bool free_pages_prepare(struct page *page,
 		VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
 
 		if (compound)
-			ClearPageHasHWPoisoned(page);
+			page[1].flags &= ~PAGE_FLAGS_SECOND;
 		for (i = 1; i < (1 << order); i++) {
 			if (compound)
 				bad += free_tail_page_prepare(page, page + i);