
[27/27] mm/hugetlb: enable bootmem allocation from CMA areas

Message ID 20250127232207.3888640-28-fvdl@google.com (mailing list archive)
State New
Series hugetlb/CMA improvements for large systems

Commit Message

Frank van der Linden Jan. 27, 2025, 11:22 p.m. UTC
If hugetlb_cma_only is enabled, we know that hugetlb pages
can only be allocated from CMA. Now that there is an interface
to do early reservations from a CMA area (returning memblock
memory), it can be used to allocate hugetlb pages from CMA.

This also allows for doing pre-HVO on these pages (if enabled).

Make sure to initialize the page structures and associated data
correctly. Create a flag to signal that a hugetlb page has been
allocated from CMA to make things a little easier.

Some configurations of powerpc have a special hugetlb bootmem
allocator, so introduce a boolean arch_specific_huge_bootmem_alloc
that returns true if such an allocator is present. In that case,
CMA bootmem allocations can't be used, so check that function
before trying.

Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Frank van der Linden <fvdl@google.com>
---
 arch/powerpc/mm/hugetlbpage.c |   5 ++
 include/linux/hugetlb.h       |   7 ++
 mm/hugetlb.c                  | 135 +++++++++++++++++++++++++---------
 3 files changed, 114 insertions(+), 33 deletions(-)

Comments

Christophe Leroy Jan. 28, 2025, 8:55 a.m. UTC | #1
On 28/01/2025 at 00:22, Frank van der Linden wrote:
> If hugetlb_cma_only is enabled, we know that hugetlb pages
> can only be allocated from CMA. Now that there is an interface
> to do early reservations from a CMA area (returning memblock
> memory), it can be used to allocate hugetlb pages from CMA.
> 
> This also allows for doing pre-HVO on these pages (if enabled).
> 
> Make sure to initialize the page structures and associated data
> correctly. Create a flag to signal that a hugetlb page has been
> allocated from CMA to make things a little easier.
> 
> Some configurations of powerpc have a special hugetlb bootmem
> allocator, so introduce a boolean arch_specific_huge_bootmem_alloc
> that returns true if such an allocator is present. In that case,
> CMA bootmem allocations can't be used, so check that function
> before trying.
> 
> Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: linuxppc-dev@lists.ozlabs.org
> Signed-off-by: Frank van der Linden <fvdl@google.com>
> ---
>   arch/powerpc/mm/hugetlbpage.c |   5 ++
>   include/linux/hugetlb.h       |   7 ++
>   mm/hugetlb.c                  | 135 +++++++++++++++++++++++++---------
>   3 files changed, 114 insertions(+), 33 deletions(-)
> 
> diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
> index d3c1b749dcfc..e53e4b4c8ef6 100644
> --- a/arch/powerpc/mm/hugetlbpage.c
> +++ b/arch/powerpc/mm/hugetlbpage.c
> @@ -121,6 +121,11 @@ bool __init hugetlb_node_alloc_supported(void)
>   {
>   	return false;
>   }
> +
> +bool __init arch_specific_huge_bootmem_alloc(struct hstate *h)
> +{
> +	return (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled());
> +}
>   #endif
>   
>   
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 2512463bca49..bca3052fb175 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -591,6 +591,7 @@ enum hugetlb_page_flags {
>   	HPG_freed,
>   	HPG_vmemmap_optimized,
>   	HPG_raw_hwp_unreliable,
> +	HPG_cma,
>   	__NR_HPAGEFLAGS,
>   };
>   
> @@ -650,6 +651,7 @@ HPAGEFLAG(Temporary, temporary)
>   HPAGEFLAG(Freed, freed)
>   HPAGEFLAG(VmemmapOptimized, vmemmap_optimized)
>   HPAGEFLAG(RawHwpUnreliable, raw_hwp_unreliable)
> +HPAGEFLAG(Cma, cma)
>   
>   #ifdef CONFIG_HUGETLB_PAGE
>   
> @@ -678,14 +680,18 @@ struct hstate {
>   	char name[HSTATE_NAME_LEN];
>   };
>   
> +struct cma;
> +
>   struct huge_bootmem_page {
>   	struct list_head list;
>   	struct hstate *hstate;
>   	unsigned long flags;
> +	struct cma *cma;
>   };
>   
>   #define HUGE_BOOTMEM_HVO		0x0001
>   #define HUGE_BOOTMEM_ZONES_VALID	0x0002
> +#define HUGE_BOOTMEM_CMA		0x0004
>   
>   bool hugetlb_bootmem_page_zones_valid(int nid, struct huge_bootmem_page *m);
>   
> @@ -711,6 +717,7 @@ bool __init hugetlb_node_alloc_supported(void);
>   
>   void __init hugetlb_add_hstate(unsigned order);
>   bool __init arch_hugetlb_valid_size(unsigned long size);
> +bool __init arch_specific_huge_bootmem_alloc(struct hstate *h);

Why add 'specific' to the name? Prefixing a function name with arch_ 
is usually enough to denote an architecture-specific function.

>   struct hstate *size_to_hstate(unsigned long size);
>   
>   #ifndef HUGE_MAX_HSTATE
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 32ebde9039e2..183e8d0c2fb4 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -61,7 +61,7 @@ static struct cma *hugetlb_cma[MAX_NUMNODES];
>   static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;
>   #endif
>   static bool hugetlb_cma_only;
> -static unsigned long hugetlb_cma_size __initdata;
> +static unsigned long hugetlb_cma_size;

Why remove __initdata ? As far as I can see it is used only in 
hugetlb_early_cma() and hugetlb_hstate_alloc_pages() which are __init 
functions.

>   
>   __initdata struct list_head huge_boot_pages[MAX_NUMNODES];
>   __initdata unsigned long hstate_boot_nrinvalid[HUGE_MAX_HSTATE];
> @@ -132,8 +132,10 @@ static void hugetlb_free_folio(struct folio *folio)
>   #ifdef CONFIG_CMA
>   	int nid = folio_nid(folio);
>   
> -	if (cma_free_folio(hugetlb_cma[nid], folio))
> +	if (folio_test_hugetlb_cma(folio)) {
> +		WARN_ON(!cma_free_folio(hugetlb_cma[nid], folio));

Is that WARN_ON() needed ? See 
https://docs.kernel.org/process/coding-style.html#do-not-crash-the-kernel


>   		return;
> +	}
>   #endif
>   	folio_put(folio);
>   }
> @@ -1509,6 +1511,9 @@ static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
>   					break;
>   			}
>   		}
> +
> +		if (folio)
> +			folio_set_hugetlb_cma(folio);
>   	}
>   #endif
>   	if (!folio) {
> @@ -3175,6 +3180,63 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
>   	return ERR_PTR(-ENOSPC);
>   }
>   
> +/*
> + * Some architectures do their own bootmem allocation, so they can't use
> + * early CMA allocation. So, allow for this function to be redefined.
> + */
> +bool __init __attribute((weak))

Can't you use __weak ?

By the way, do we really need a weak function here? Can't it be a 
static inline helper that gets overridden by a macro defined by the arch, 
something like:

#ifndef arch_huge_bootmem_alloc
static inline bool arch_huge_bootmem_alloc(struct hstate *h)
{
	return false;
}
#endif

Then powerpc does:

#define arch_huge_bootmem_alloc arch_huge_bootmem_alloc
static inline bool arch_huge_bootmem_alloc(struct hstate *h)
{
	return (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled());
}


But why is struct hstate *h parameter needed ? Seems like noone uses it.

> +arch_specific_huge_bootmem_alloc(struct hstate *h)
> +{
> +	return false;
> +}
> +
> +static bool __init hugetlb_early_cma(struct hstate *h)
> +{
> +	if (arch_specific_huge_bootmem_alloc(h))
> +		return false;
> +
> +	return (hstate_is_gigantic(h) && hugetlb_cma_size && hugetlb_cma_only);
> +}
> +
> +static __init void *alloc_bootmem(struct hstate *h, int nid)
> +{
> +	struct huge_bootmem_page *m;
> +	unsigned long flags;
> +	struct cma *cma;
> +
> +#ifdef CONFIG_CMA

  #ifdefs in C files should be avoided, see 
https://docs.kernel.org/process/coding-style.html#conditional-compilation

> +	if (hugetlb_early_cma(h)) {
> +		flags = HUGE_BOOTMEM_CMA;
> +		cma = hugetlb_cma[nid];
> +		m = cma_reserve_early(cma, huge_page_size(h));
> +	} else
> +#endif

This kind of if/else construct is ugly and should be avoided.

> +	{
> +		flags = 0;
> +		cma = NULL;
> +		m = memblock_alloc_try_nid_raw(huge_page_size(h),
> +			huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
> +	}
> +
> +	if (m) {
> +		/*
> +		 * Use the beginning of the huge page to store the
> +		 * huge_bootmem_page struct (until gather_bootmem
> +		 * puts them into the mem_map).
> +		 *
> +		 * Put them into a private list first because mem_map
> +		 * is not up yet.
> +		 */
> +		INIT_LIST_HEAD(&m->list);
> +		list_add(&m->list, &huge_boot_pages[nid]);
> +		m->hstate = h;
> +		m->flags = flags;
> +		m->cma = cma;
> +	}
> +
> +	return m;
> +}
> +
>   int alloc_bootmem_huge_page(struct hstate *h, int nid)
>   	__attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
>   int __alloc_bootmem_huge_page(struct hstate *h, int nid)
> @@ -3184,17 +3246,14 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
>   
>   	/* do node specific alloc */
>   	if (nid != NUMA_NO_NODE) {
> -		m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
> -				0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
> +		m = alloc_bootmem(h, node);
>   		if (!m)
>   			return 0;
>   		goto found;
>   	}
>   	/* allocate from next node when distributing huge pages */
>   	for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_ONLINE]) {
> -		m = memblock_alloc_try_nid_raw(
> -				huge_page_size(h), huge_page_size(h),
> -				0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
> +		m = alloc_bootmem(h, node);
>   		if (m)
>   			break;
>   	}
> @@ -3203,7 +3262,6 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
>   		return 0;
>   
>   found:
> -
>   	/*
>   	 * Only initialize the head struct page in memmap_init_reserved_pages,
>   	 * rest of the struct pages will be initialized by the HugeTLB
> @@ -3213,18 +3271,6 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
>   	 */
>   	memblock_reserved_mark_noinit(virt_to_phys((void *)m + PAGE_SIZE),
>   		huge_page_size(h) - PAGE_SIZE);
> -	/*
> -	 * Use the beginning of the huge page to store the
> -	 * huge_bootmem_page struct (until gather_bootmem
> -	 * puts them into the mem_map).
> -	 *
> -	 * Put them into a private list first because mem_map
> -	 * is not up yet.
> -	 */
> -	INIT_LIST_HEAD(&m->list);
> -	list_add(&m->list, &huge_boot_pages[node]);
> -	m->hstate = h;
> -	m->flags = 0;
>   	return 1;
>   }
>   
> @@ -3265,13 +3311,25 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio,
>   	prep_compound_head((struct page *)folio, huge_page_order(h));
>   }
>   
> +static bool __init hugetlb_bootmem_page_prehvo(struct huge_bootmem_page *m)
> +{
> +	return (m->flags & HUGE_BOOTMEM_HVO);

Parentheses are superfluous

> +}
> +
> +static bool __init hugetlb_bootmem_page_earlycma(struct huge_bootmem_page *m)
> +{
> +	return (m->flags & HUGE_BOOTMEM_CMA);

Parentheses are superfluous

> +}
> +
>   /*
>    * memblock-allocated pageblocks might not have the migrate type set
>    * if marked with the 'noinit' flag. Set it to the default (MIGRATE_MOVABLE)
> - * here.
> + * here, or MIGRATE_CMA if this was a page allocated through an early CMA
> + * reservation.
>    *
> - * Note that this will not write the page struct, it is ok (and necessary)
> - * to do this on vmemmap optimized folios.
> + * In case of vmemmap optimized folios, the tail vmemmap pages are mapped
> + * read-only, but that's ok - for sparse vmemmap this does not write to
> + * the page structure.
>    */
>   static void __init hugetlb_bootmem_init_migratetype(struct folio *folio,
>   							  struct hstate *h)
> @@ -3280,9 +3338,13 @@ static void __init hugetlb_bootmem_init_migratetype(struct folio *folio,
>   
>   	WARN_ON_ONCE(!pageblock_aligned(folio_pfn(folio)));
>   
> -	for (i = 0; i < nr_pages; i += pageblock_nr_pages)
> -		set_pageblock_migratetype(folio_page(folio, i),
> +	for (i = 0; i < nr_pages; i += pageblock_nr_pages) {
> +		if (folio_test_hugetlb_cma(folio))
> +			init_cma_pageblock(folio_page(folio, i));
> +		else
> +			set_pageblock_migratetype(folio_page(folio, i),
>   					  MIGRATE_MOVABLE);
> +	}
>   }
>   
>   static void __init prep_and_add_bootmem_folios(struct hstate *h,
> @@ -3319,7 +3381,7 @@ bool __init hugetlb_bootmem_page_zones_valid(int nid,
>   					     struct huge_bootmem_page *m)
>   {
>   	unsigned long start_pfn;
> -	bool valid;
> +	bool valid = false;

Why do you need that? I can't see any path that reaches 'out:' without 
setting 'valid'.

>   
>   	if (m->flags & HUGE_BOOTMEM_ZONES_VALID) {
>   		/*
> @@ -3328,10 +3390,16 @@ bool __init hugetlb_bootmem_page_zones_valid(int nid,
>   		return true;
>   	}
>   
> +	if (hugetlb_bootmem_page_earlycma(m)) {
> +		valid = cma_validate_zones(m->cma);
> +		goto out;
> +	}
> +
>   	start_pfn = virt_to_phys(m) >> PAGE_SHIFT;
>   
>   	valid = !pfn_range_intersects_zones(nid, start_pfn,
>   			pages_per_huge_page(m->hstate));
> +out:
>   	if (!valid)
>   		hstate_boot_nrinvalid[hstate_index(m->hstate)]++;
>   
> @@ -3360,11 +3428,6 @@ static void __init hugetlb_bootmem_free_invalid_page(int nid, struct page *page,
>   	}
>   }
>   
> -static bool __init hugetlb_bootmem_page_prehvo(struct huge_bootmem_page *m)
> -{
> -	return (m->flags & HUGE_BOOTMEM_HVO);
> -}
> -
>   /*
>    * Put bootmem huge pages into the standard lists after mem_map is up.
>    * Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages.
> @@ -3414,6 +3477,9 @@ static void __init gather_bootmem_prealloc_node(unsigned long nid)
>   			 */
>   			folio_set_hugetlb_vmemmap_optimized(folio);
>   
> +		if (hugetlb_bootmem_page_earlycma(m))
> +			folio_set_hugetlb_cma(folio);
> +
>   		list_add(&folio->lru, &folio_list);
>   
>   		/*
> @@ -3606,8 +3672,11 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
>   {
>   	unsigned long allocated;
>   
> -	/* skip gigantic hugepages allocation if hugetlb_cma enabled */
> -	if (hstate_is_gigantic(h) && hugetlb_cma_size) {
> +	/*
> +	 * Skip gigantic hugepages allocation if early CMA
> +	 * reservations are not available.
> +	 */
> +	if (hstate_is_gigantic(h) && hugetlb_cma_size && !hugetlb_early_cma(h)) {
>   		pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
>   		return;
>   	}
Frank van der Linden Jan. 28, 2025, 5:52 p.m. UTC | #2
Hi Christophe, thanks for your comments. Replies inline below.

On Tue, Jan 28, 2025 at 12:55 AM Christophe Leroy
<christophe.leroy@csgroup.eu> wrote:
>
>
>
> On 28/01/2025 at 00:22, Frank van der Linden wrote:
> > If hugetlb_cma_only is enabled, we know that hugetlb pages
> > can only be allocated from CMA. Now that there is an interface
> > to do early reservations from a CMA area (returning memblock
> > memory), it can be used to allocate hugetlb pages from CMA.
> >
> > This also allows for doing pre-HVO on these pages (if enabled).
> >
> > Make sure to initialize the page structures and associated data
> > correctly. Create a flag to signal that a hugetlb page has been
> > allocated from CMA to make things a little easier.
> >
> > Some configurations of powerpc have a special hugetlb bootmem
> > allocator, so introduce a boolean arch_specific_huge_bootmem_alloc
> > that returns true if such an allocator is present. In that case,
> > CMA bootmem allocations can't be used, so check that function
> > before trying.
> >
> > Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
> > Cc: Michael Ellerman <mpe@ellerman.id.au>
> > Cc: linuxppc-dev@lists.ozlabs.org
> > Signed-off-by: Frank van der Linden <fvdl@google.com>
> > ---
> >   arch/powerpc/mm/hugetlbpage.c |   5 ++
> >   include/linux/hugetlb.h       |   7 ++
> >   mm/hugetlb.c                  | 135 +++++++++++++++++++++++++---------
> >   3 files changed, 114 insertions(+), 33 deletions(-)
> >
> > diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
> > index d3c1b749dcfc..e53e4b4c8ef6 100644
> > --- a/arch/powerpc/mm/hugetlbpage.c
> > +++ b/arch/powerpc/mm/hugetlbpage.c
> > @@ -121,6 +121,11 @@ bool __init hugetlb_node_alloc_supported(void)
> >   {
> >       return false;
> >   }
> > +
> > +bool __init arch_specific_huge_bootmem_alloc(struct hstate *h)
> > +{
> > +     return (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled());
> > +}
> >   #endif
> >
> >
> > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > index 2512463bca49..bca3052fb175 100644
> > --- a/include/linux/hugetlb.h
> > +++ b/include/linux/hugetlb.h
> > @@ -591,6 +591,7 @@ enum hugetlb_page_flags {
> >       HPG_freed,
> >       HPG_vmemmap_optimized,
> >       HPG_raw_hwp_unreliable,
> > +     HPG_cma,
> >       __NR_HPAGEFLAGS,
> >   };
> >
> > @@ -650,6 +651,7 @@ HPAGEFLAG(Temporary, temporary)
> >   HPAGEFLAG(Freed, freed)
> >   HPAGEFLAG(VmemmapOptimized, vmemmap_optimized)
> >   HPAGEFLAG(RawHwpUnreliable, raw_hwp_unreliable)
> > +HPAGEFLAG(Cma, cma)
> >
> >   #ifdef CONFIG_HUGETLB_PAGE
> >
> > @@ -678,14 +680,18 @@ struct hstate {
> >       char name[HSTATE_NAME_LEN];
> >   };
> >
> > +struct cma;
> > +
> >   struct huge_bootmem_page {
> >       struct list_head list;
> >       struct hstate *hstate;
> >       unsigned long flags;
> > +     struct cma *cma;
> >   };
> >
> >   #define HUGE_BOOTMEM_HVO            0x0001
> >   #define HUGE_BOOTMEM_ZONES_VALID    0x0002
> > +#define HUGE_BOOTMEM_CMA             0x0004
> >
> >   bool hugetlb_bootmem_page_zones_valid(int nid, struct huge_bootmem_page *m);
> >
> > @@ -711,6 +717,7 @@ bool __init hugetlb_node_alloc_supported(void);
> >
> >   void __init hugetlb_add_hstate(unsigned order);
> >   bool __init arch_hugetlb_valid_size(unsigned long size);
> > +bool __init arch_specific_huge_bootmem_alloc(struct hstate *h);
>
> Why add 'specific' to the name? Prefixing a function name with arch_
> is usually enough to denote an architecture-specific function.

True, yes. That should probably be arch_has_huge_bootmem_alloc, I will
change that.

>
> >   struct hstate *size_to_hstate(unsigned long size);
> >
> >   #ifndef HUGE_MAX_HSTATE
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 32ebde9039e2..183e8d0c2fb4 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -61,7 +61,7 @@ static struct cma *hugetlb_cma[MAX_NUMNODES];
> >   static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;
> >   #endif
> >   static bool hugetlb_cma_only;
> > -static unsigned long hugetlb_cma_size __initdata;
> > +static unsigned long hugetlb_cma_size;
>
> Why remove __initdata ? As far as I can see it is used only in
> hugetlb_early_cma() and hugetlb_hstate_alloc_pages() which are __init
> functions.

hugetlb_cma_size is now used in alloc_gigantic_folio(), which is not
an __init function. However, you got me thinking: since
hugetlb_cma_only is only effective when hugetlb_cma_size != 0, I can
just reset hugetlb_cma_only to false if hugetlb_cma_size == 0 after
parsing the commandline arguments. This will revert hugetlb_cma_size
to __initdata, and simplifies things a bit. I'll make that change in
v2.
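
A minimal sketch of that adjustment (where exactly the reset happens, e.g.
at the end of command line parsing or in hugetlb_cma_reserve(), is an
assumption here):

	/*
	 * hugetlb_cma_only has no effect without a CMA size, so clear it
	 * and let hugetlb_cma_size stay __initdata.
	 */
	if (!hugetlb_cma_size)
		hugetlb_cma_only = false;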
>
> >
> >   __initdata struct list_head huge_boot_pages[MAX_NUMNODES];
> >   __initdata unsigned long hstate_boot_nrinvalid[HUGE_MAX_HSTATE];
> > @@ -132,8 +132,10 @@ static void hugetlb_free_folio(struct folio *folio)
> >   #ifdef CONFIG_CMA
> >       int nid = folio_nid(folio);
> >
> > -     if (cma_free_folio(hugetlb_cma[nid], folio))
> > +     if (folio_test_hugetlb_cma(folio)) {
> > +             WARN_ON(!cma_free_folio(hugetlb_cma[nid], folio));
>
> Is that WARN_ON() needed ? See
> https://docs.kernel.org/process/coding-style.html#do-not-crash-the-kernel

Not strictly, I suppose, but if there is a CMA-allocated hugetlb
folio, and cma_free fails, that would be a condition worthy of a
warning, as the flag somehow got corrupted or there is an internal CMA
error.  How about WARN_ON_ONCE?
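
For reference, a minimal sketch of the WARN_ON_ONCE variant of the
hugetlb_free_folio() hunk above (not necessarily the exact v2 code):

	if (folio_test_hugetlb_cma(folio)) {
		/*
		 * A folio flagged as CMA-allocated that cma_free_folio()
		 * refuses to free means the flag got corrupted or CMA has
		 * an internal inconsistency; warn once rather than on
		 * every occurrence.
		 */
		WARN_ON_ONCE(!cma_free_folio(hugetlb_cma[nid], folio));
		return;
	}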

>
>
> >               return;
> > +     }
> >   #endif
> >       folio_put(folio);
> >   }
> > @@ -1509,6 +1511,9 @@ static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
> >                                       break;
> >                       }
> >               }
> > +
> > +             if (folio)
> > +                     folio_set_hugetlb_cma(folio);
> >       }
> >   #endif
> >       if (!folio) {
> > @@ -3175,6 +3180,63 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
> >       return ERR_PTR(-ENOSPC);
> >   }
> >
> > +/*
> > + * Some architectures do their own bootmem allocation, so they can't use
> > + * early CMA allocation. So, allow for this function to be redefined.
> > + */
> > +bool __init __attribute((weak))
>
> Can't you use __weak ?
>
> By the way, do we really need a weak function here? Can't it be a
> static inline helper that gets overridden by a macro defined by the arch,
> something like:
>
> #ifndef arch_huge_bootmem_alloc
> static inline bool arch_huge_bootmem_alloc(struct hstate *h)
> {
>         return false;
> }
> #endif
>
> Then powerpc does:
>
> #define arch_huge_bootmem_alloc arch_huge_bootmem_alloc
> static inline bool arch_huge_bootmem_alloc(struct hstate *h)
> {
>         return (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled());
> }
>

Fair enough, yeah. I used a weak symbol because that was already used
for powerpc with alloc_bootmem_huge_page(), but I can change this one.

>
> But why is struct hstate *h parameter needed ? Seems like noone uses it.

Correct - I merely extrapolated a bit and thought "well, architectures
might have bootmem hugetlb allocators that only deal with certain
sizes". But then again, like you say, there is currently no need for
it. I'll remove the argument.
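
Putting the rename and the dropped argument together, a rough sketch of what
the v2 helper could look like (the exact placement of these definitions is an
assumption):

/* generic fallback, e.g. in include/linux/hugetlb.h */
#ifndef arch_has_huge_bootmem_alloc
static inline bool arch_has_huge_bootmem_alloc(void)
{
	return false;
}
#endif

/* powerpc override, e.g. in an arch header */
#define arch_has_huge_bootmem_alloc arch_has_huge_bootmem_alloc
static inline bool arch_has_huge_bootmem_alloc(void)
{
	/* hash-MMU LPAR configs use their own hugetlb bootmem allocator */
	return firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled();
}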

>
> > +arch_specific_huge_bootmem_alloc(struct hstate *h)
> > +{
> > +     return false;
> > +}
> > +
> > +static bool __init hugetlb_early_cma(struct hstate *h)
> > +{
> > +     if (arch_specific_huge_bootmem_alloc(h))
> > +             return false;
> > +
> > +     return (hstate_is_gigantic(h) && hugetlb_cma_size && hugetlb_cma_only);
> > +}
> > +
> > +static __init void *alloc_bootmem(struct hstate *h, int nid)
> > +{
> > +     struct huge_bootmem_page *m;
> > +     unsigned long flags;
> > +     struct cma *cma;
> > +
> > +#ifdef CONFIG_CMA
>
>   #ifdefs in C files should be avoided, see
> https://docs.kernel.org/process/coding-style.html#conditional-compilation
>
> > +     if (hugetlb_early_cma(h)) {
> > +             flags = HUGE_BOOTMEM_CMA;
> > +             cma = hugetlb_cma[nid];
> > +             m = cma_reserve_early(cma, huge_page_size(h));
> > +     } else
> > +#endif
>
> This kind of if/else construct is ugly and should be avoided.
>

I found this ifdef hard to avoid, sadly; I tried various ways to avoid
it (if (IS_ENABLED(CONFIG_CMA)), etc.), but came up short. I'll have
another look for v2, but short of trying to split off all CMA-related
code into a different file, which would definitely be out of scope
here, it might not end up being better.

> > +     {
> > +             flags = 0;
> > +             cma = NULL;
> > +             m = memblock_alloc_try_nid_raw(huge_page_size(h),
> > +                     huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
> > +     }
> > +
> > +     if (m) {
> > +             /*
> > +              * Use the beginning of the huge page to store the
> > +              * huge_bootmem_page struct (until gather_bootmem
> > +              * puts them into the mem_map).
> > +              *
> > +              * Put them into a private list first because mem_map
> > +              * is not up yet.
> > +              */
> > +             INIT_LIST_HEAD(&m->list);
> > +             list_add(&m->list, &huge_boot_pages[nid]);
> > +             m->hstate = h;
> > +             m->flags = flags;
> > +             m->cma = cma;
> > +     }
> > +
> > +     return m;
> > +}
> > +
> >   int alloc_bootmem_huge_page(struct hstate *h, int nid)
> >       __attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
> >   int __alloc_bootmem_huge_page(struct hstate *h, int nid)
> > @@ -3184,17 +3246,14 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
> >
> >       /* do node specific alloc */
> >       if (nid != NUMA_NO_NODE) {
> > -             m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
> > -                             0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
> > +             m = alloc_bootmem(h, node);
> >               if (!m)
> >                       return 0;
> >               goto found;
> >       }
> >       /* allocate from next node when distributing huge pages */
> >       for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_ONLINE]) {
> > -             m = memblock_alloc_try_nid_raw(
> > -                             huge_page_size(h), huge_page_size(h),
> > -                             0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
> > +             m = alloc_bootmem(h, node);
> >               if (m)
> >                       break;
> >       }
> > @@ -3203,7 +3262,6 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
> >               return 0;
> >
> >   found:
> > -
> >       /*
> >        * Only initialize the head struct page in memmap_init_reserved_pages,
> >        * rest of the struct pages will be initialized by the HugeTLB
> > @@ -3213,18 +3271,6 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
> >        */
> >       memblock_reserved_mark_noinit(virt_to_phys((void *)m + PAGE_SIZE),
> >               huge_page_size(h) - PAGE_SIZE);
> > -     /*
> > -      * Use the beginning of the huge page to store the
> > -      * huge_bootmem_page struct (until gather_bootmem
> > -      * puts them into the mem_map).
> > -      *
> > -      * Put them into a private list first because mem_map
> > -      * is not up yet.
> > -      */
> > -     INIT_LIST_HEAD(&m->list);
> > -     list_add(&m->list, &huge_boot_pages[node]);
> > -     m->hstate = h;
> > -     m->flags = 0;
> >       return 1;
> >   }
> >
> > @@ -3265,13 +3311,25 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio,
> >       prep_compound_head((struct page *)folio, huge_page_order(h));
> >   }
> >
> > +static bool __init hugetlb_bootmem_page_prehvo(struct huge_bootmem_page *m)
> > +{
> > +     return (m->flags & HUGE_BOOTMEM_HVO);
>
> Parentheses are superfluous
>
> > +}
> > +
> > +static bool __init hugetlb_bootmem_page_earlycma(struct huge_bootmem_page *m)
> > +{
> > +     return (m->flags & HUGE_BOOTMEM_CMA);
>
> Parentheses are superfluous
>

Sure, will remove them.

> > +}
> > +
> >   /*
> >    * memblock-allocated pageblocks might not have the migrate type set
> >    * if marked with the 'noinit' flag. Set it to the default (MIGRATE_MOVABLE)
> > - * here.
> > + * here, or MIGRATE_CMA if this was a page allocated through an early CMA
> > + * reservation.
> >    *
> > - * Note that this will not write the page struct, it is ok (and necessary)
> > - * to do this on vmemmap optimized folios.
> > + * In case of vmemmap optimized folios, the tail vmemmap pages are mapped
> > + * read-only, but that's ok - for sparse vmemmap this does not write to
> > + * the page structure.
> >    */
> >   static void __init hugetlb_bootmem_init_migratetype(struct folio *folio,
> >                                                         struct hstate *h)
> > @@ -3280,9 +3338,13 @@ static void __init hugetlb_bootmem_init_migratetype(struct folio *folio,
> >
> >       WARN_ON_ONCE(!pageblock_aligned(folio_pfn(folio)));
> >
> > -     for (i = 0; i < nr_pages; i += pageblock_nr_pages)
> > -             set_pageblock_migratetype(folio_page(folio, i),
> > +     for (i = 0; i < nr_pages; i += pageblock_nr_pages) {
> > +             if (folio_test_hugetlb_cma(folio))
> > +                     init_cma_pageblock(folio_page(folio, i));
> > +             else
> > +                     set_pageblock_migratetype(folio_page(folio, i),
> >                                         MIGRATE_MOVABLE);
> > +     }
> >   }
> >
> >   static void __init prep_and_add_bootmem_folios(struct hstate *h,
> > @@ -3319,7 +3381,7 @@ bool __init hugetlb_bootmem_page_zones_valid(int nid,
> >                                            struct huge_bootmem_page *m)
> >   {
> >       unsigned long start_pfn;
> > -     bool valid;
> > +     bool valid = false;
>
> Why do you need that? I can't see any path that reaches 'out:' without
> setting 'valid'.

True - probably a leftover from an earlier iteration, I can remove that.
>
> >
> >       if (m->flags & HUGE_BOOTMEM_ZONES_VALID) {
> >               /*
> > @@ -3328,10 +3390,16 @@ bool __init hugetlb_bootmem_page_zones_valid(int nid,
> >               return true;
> >       }
> >
> > +     if (hugetlb_bootmem_page_earlycma(m)) {
> > +             valid = cma_validate_zones(m->cma);
> > +             goto out;
> > +     }
> > +
> >       start_pfn = virt_to_phys(m) >> PAGE_SHIFT;
> >
> >       valid = !pfn_range_intersects_zones(nid, start_pfn,
> >                       pages_per_huge_page(m->hstate));
> > +out:
> >       if (!valid)
> >               hstate_boot_nrinvalid[hstate_index(m->hstate)]++;
> >
> > @@ -3360,11 +3428,6 @@ static void __init hugetlb_bootmem_free_invalid_page(int nid, struct page *page,
> >       }
> >   }
> >
> > -static bool __init hugetlb_bootmem_page_prehvo(struct huge_bootmem_page *m)
> > -{
> > -     return (m->flags & HUGE_BOOTMEM_HVO);
> > -}
> > -
> >   /*
> >    * Put bootmem huge pages into the standard lists after mem_map is up.
> >    * Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages.
> > @@ -3414,6 +3477,9 @@ static void __init gather_bootmem_prealloc_node(unsigned long nid)
> >                        */
> >                       folio_set_hugetlb_vmemmap_optimized(folio);
> >
> > +             if (hugetlb_bootmem_page_earlycma(m))
> > +                     folio_set_hugetlb_cma(folio);
> > +
> >               list_add(&folio->lru, &folio_list);
> >
> >               /*
> > @@ -3606,8 +3672,11 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
> >   {
> >       unsigned long allocated;
> >
> > -     /* skip gigantic hugepages allocation if hugetlb_cma enabled */
> > -     if (hstate_is_gigantic(h) && hugetlb_cma_size) {
> > +     /*
> > +      * Skip gigantic hugepages allocation if early CMA
> > +      * reservations are not available.
> > +      */
> > +     if (hstate_is_gigantic(h) && hugetlb_cma_size && !hugetlb_early_cma(h)) {
> >               pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
> >               return;
> >       }
>
Frank van der Linden Jan. 29, 2025, 10:43 p.m. UTC | #3
On Tue, Jan 28, 2025 at 9:52 AM Frank van der Linden <fvdl@google.com> wrote:
>
> Hi Christophe, thanks for your comments. Replies inline below.
>
> On Tue, Jan 28, 2025 at 12:55 AM Christophe Leroy
> <christophe.leroy@csgroup.eu> wrote:
> >
> >
> >
> > On 28/01/2025 at 00:22, Frank van der Linden wrote:
> > > If hugetlb_cma_only is enabled, we know that hugetlb pages
> > > can only be allocated from CMA. Now that there is an interface
> > > to do early reservations from a CMA area (returning memblock
> > > memory), it can be used to allocate hugetlb pages from CMA.
> > >
> > > This also allows for doing pre-HVO on these pages (if enabled).
> > >
> > > Make sure to initialize the page structures and associated data
> > > correctly. Create a flag to signal that a hugetlb page has been
> > > allocated from CMA to make things a little easier.
> > >
> > > Some configurations of powerpc have a special hugetlb bootmem
> > > allocator, so introduce a boolean arch_specific_huge_bootmem_alloc
> > > that returns true if such an allocator is present. In that case,
> > > CMA bootmem allocations can't be used, so check that function
> > > before trying.
> > >
> > > Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
> > > Cc: Michael Ellerman <mpe@ellerman.id.au>
> > > Cc: linuxppc-dev@lists.ozlabs.org
> > > Signed-off-by: Frank van der Linden <fvdl@google.com>
> > > ---
> > >   arch/powerpc/mm/hugetlbpage.c |   5 ++
> > >   include/linux/hugetlb.h       |   7 ++
> > >   mm/hugetlb.c                  | 135 +++++++++++++++++++++++++---------
> > >   3 files changed, 114 insertions(+), 33 deletions(-)
> > >
> > > diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
> > > index d3c1b749dcfc..e53e4b4c8ef6 100644
> > > --- a/arch/powerpc/mm/hugetlbpage.c
> > > +++ b/arch/powerpc/mm/hugetlbpage.c
> > > @@ -121,6 +121,11 @@ bool __init hugetlb_node_alloc_supported(void)
> > >   {
> > >       return false;
> > >   }
> > > +
> > > +bool __init arch_specific_huge_bootmem_alloc(struct hstate *h)
> > > +{
> > > +     return (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled());
> > > +}
> > >   #endif
> > >
> > >
> > > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > > index 2512463bca49..bca3052fb175 100644
> > > --- a/include/linux/hugetlb.h
> > > +++ b/include/linux/hugetlb.h
> > > @@ -591,6 +591,7 @@ enum hugetlb_page_flags {
> > >       HPG_freed,
> > >       HPG_vmemmap_optimized,
> > >       HPG_raw_hwp_unreliable,
> > > +     HPG_cma,
> > >       __NR_HPAGEFLAGS,
> > >   };
> > >
> > > @@ -650,6 +651,7 @@ HPAGEFLAG(Temporary, temporary)
> > >   HPAGEFLAG(Freed, freed)
> > >   HPAGEFLAG(VmemmapOptimized, vmemmap_optimized)
> > >   HPAGEFLAG(RawHwpUnreliable, raw_hwp_unreliable)
> > > +HPAGEFLAG(Cma, cma)
> > >
> > >   #ifdef CONFIG_HUGETLB_PAGE
> > >
> > > @@ -678,14 +680,18 @@ struct hstate {
> > >       char name[HSTATE_NAME_LEN];
> > >   };
> > >
> > > +struct cma;
> > > +
> > >   struct huge_bootmem_page {
> > >       struct list_head list;
> > >       struct hstate *hstate;
> > >       unsigned long flags;
> > > +     struct cma *cma;
> > >   };
> > >
> > >   #define HUGE_BOOTMEM_HVO            0x0001
> > >   #define HUGE_BOOTMEM_ZONES_VALID    0x0002
> > > +#define HUGE_BOOTMEM_CMA             0x0004
> > >
> > >   bool hugetlb_bootmem_page_zones_valid(int nid, struct huge_bootmem_page *m);
> > >
> > > @@ -711,6 +717,7 @@ bool __init hugetlb_node_alloc_supported(void);
> > >
> > >   void __init hugetlb_add_hstate(unsigned order);
> > >   bool __init arch_hugetlb_valid_size(unsigned long size);
> > > +bool __init arch_specific_huge_bootmem_alloc(struct hstate *h);
> >
> > Why add 'specific' to the name? Prefixing a function name with arch_
> > is usually enough to denote an architecture-specific function.
>
> True, yes. That should probably be arch_has_huge_bootmem_alloc, I will
> change that.
>
> >
> > >   struct hstate *size_to_hstate(unsigned long size);
> > >
> > >   #ifndef HUGE_MAX_HSTATE
> > > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > > index 32ebde9039e2..183e8d0c2fb4 100644
> > > --- a/mm/hugetlb.c
> > > +++ b/mm/hugetlb.c
> > > @@ -61,7 +61,7 @@ static struct cma *hugetlb_cma[MAX_NUMNODES];
> > >   static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;
> > >   #endif
> > >   static bool hugetlb_cma_only;
> > > -static unsigned long hugetlb_cma_size __initdata;
> > > +static unsigned long hugetlb_cma_size;
> >
> > Why remove __initdata ? As far as I can see it is used only in
> > hugetlb_early_cma() and hugetlb_hstate_alloc_pages() which are __init
> > functions.
>
> hugetlb_cma_size is now used in alloc_gigantic_folio(), which is not
> an __init function. However, you got me thinking: since
> hugetlb_cma_only is only effective when hugetlb_cma_size != 0, I can
> just reset hugetlb_cma_only to false if hugetlb_cma_size == 0 after
> parsing the commandline arguments. This will revert hugetlb_cma_size
> to __initdata, and simplifies things a bit. I'll make that change in
> v2.
> >
> > >
> > >   __initdata struct list_head huge_boot_pages[MAX_NUMNODES];
> > >   __initdata unsigned long hstate_boot_nrinvalid[HUGE_MAX_HSTATE];
> > > @@ -132,8 +132,10 @@ static void hugetlb_free_folio(struct folio *folio)
> > >   #ifdef CONFIG_CMA
> > >       int nid = folio_nid(folio);
> > >
> > > -     if (cma_free_folio(hugetlb_cma[nid], folio))
> > > +     if (folio_test_hugetlb_cma(folio)) {
> > > +             WARN_ON(!cma_free_folio(hugetlb_cma[nid], folio));
> >
> > Is that WARN_ON() needed ? See
> > https://docs.kernel.org/process/coding-style.html#do-not-crash-the-kernel
>
> Not strictly, I suppose, but if there is a CMA-allocated hugetlb
> folio, and cma_free fails, that would be a condition worthy of a
> warning, as the flag somehow got corrupted or there is an internal CMA
> error.  How about WARN_ON_ONCE?
>
> >
> >
> > >               return;
> > > +     }
> > >   #endif
> > >       folio_put(folio);
> > >   }
> > > @@ -1509,6 +1511,9 @@ static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
> > >                                       break;
> > >                       }
> > >               }
> > > +
> > > +             if (folio)
> > > +                     folio_set_hugetlb_cma(folio);
> > >       }
> > >   #endif
> > >       if (!folio) {
> > > @@ -3175,6 +3180,63 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
> > >       return ERR_PTR(-ENOSPC);
> > >   }
> > >
> > > +/*
> > > + * Some architectures do their own bootmem allocation, so they can't use
> > > + * early CMA allocation. So, allow for this function to be redefined.
> > > + */
> > > +bool __init __attribute((weak))
> >
> > Can't you use __weak ?
> >
> > By the way, do we really need a weak function here? Can't it be a
> > static inline helper that gets overridden by a macro defined by the arch,
> > something like:
> >
> > #ifndef arch_huge_bootmem_alloc
> > static inline bool arch_huge_bootmem_alloc(struct hstate *h)
> > {
> >         return false;
> > }
> > #endif
> >
> > Then powerpc does:
> >
> > #define arch_huge_bootmem_alloc arch_huge_bootmem_alloc
> > static inline bool arch_huge_bootmem_alloc(struct hstate *h)
> > {
> >         return (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled());
> > }
> >
>
> Fair enough, yeah. I used a weak symbol because that was already used
> for powerpc with alloc_bootmem_huge_page(), but I can change this one.
>
> >
> > But why is struct hstate *h parameter needed ? Seems like noone uses it.
>
> Correct - I merely extrapolated a bit and thought "well, architectures
> might have bootmem hugetlb allocators that only deal with certain
> sizes". But then again, like you say, there is currently no need for
> it. I'll remove the argument.
>
> >
> > > +arch_specific_huge_bootmem_alloc(struct hstate *h)
> > > +{
> > > +     return false;
> > > +}
> > > +
> > > +static bool __init hugetlb_early_cma(struct hstate *h)
> > > +{
> > > +     if (arch_specific_huge_bootmem_alloc(h))
> > > +             return false;
> > > +
> > > +     return (hstate_is_gigantic(h) && hugetlb_cma_size && hugetlb_cma_only);
> > > +}
> > > +
> > > +static __init void *alloc_bootmem(struct hstate *h, int nid)
> > > +{
> > > +     struct huge_bootmem_page *m;
> > > +     unsigned long flags;
> > > +     struct cma *cma;
> > > +
> > > +#ifdef CONFIG_CMA
> >
> >   #ifdefs in C files should be avoided, see
> > https://docs.kernel.org/process/coding-style.html#conditional-compilation
> >
> > > +     if (hugetlb_early_cma(h)) {
> > > +             flags = HUGE_BOOTMEM_CMA;
> > > +             cma = hugetlb_cma[nid];
> > > +             m = cma_reserve_early(cma, huge_page_size(h));
> > > +     } else
> > > +#endif
> >
> > This kind of if/else construct is ugly and should be avoided.
> >
>
> I found this ifdef hard to avoid, sadly; I tried various ways to avoid
> it (if (IS_ENABLED(CONFIG_CMA)), etc.), but came up short. I'll have
> another look for v2, but short of trying to split off all CMA-related
> code into a different file, which would definitely be out of scope
> here, it might not end up being better.

I ended up moving the hugetlb_cma code to its own file anyway, which
is a new patch at the end of the v2 series I just sent.

- Frank
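
For illustration, one way such a split can keep the CONFIG_CMA #ifdef out of
mm/hugetlb.c; the file and helper names below are assumptions, not
necessarily what the v2 series uses:

/* mm/hugetlb_cma.h -- sketch only */
struct hstate;
struct huge_bootmem_page;

#ifdef CONFIG_CMA
struct huge_bootmem_page *hugetlb_cma_alloc_bootmem(struct hstate *h, int nid);
#else
static inline struct huge_bootmem_page *
hugetlb_cma_alloc_bootmem(struct hstate *h, int nid)
{
	return NULL;
}
#endif

alloc_bootmem() in mm/hugetlb.c can then call the helper unconditionally, and
the conditional compilation stays confined to the header and to
mm/hugetlb_cma.c.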

Patch

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index d3c1b749dcfc..e53e4b4c8ef6 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -121,6 +121,11 @@  bool __init hugetlb_node_alloc_supported(void)
 {
 	return false;
 }
+
+bool __init arch_specific_huge_bootmem_alloc(struct hstate *h)
+{
+	return (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled());
+}
 #endif
 
 
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 2512463bca49..bca3052fb175 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -591,6 +591,7 @@  enum hugetlb_page_flags {
 	HPG_freed,
 	HPG_vmemmap_optimized,
 	HPG_raw_hwp_unreliable,
+	HPG_cma,
 	__NR_HPAGEFLAGS,
 };
 
@@ -650,6 +651,7 @@  HPAGEFLAG(Temporary, temporary)
 HPAGEFLAG(Freed, freed)
 HPAGEFLAG(VmemmapOptimized, vmemmap_optimized)
 HPAGEFLAG(RawHwpUnreliable, raw_hwp_unreliable)
+HPAGEFLAG(Cma, cma)
 
 #ifdef CONFIG_HUGETLB_PAGE
 
@@ -678,14 +680,18 @@  struct hstate {
 	char name[HSTATE_NAME_LEN];
 };
 
+struct cma;
+
 struct huge_bootmem_page {
 	struct list_head list;
 	struct hstate *hstate;
 	unsigned long flags;
+	struct cma *cma;
 };
 
 #define HUGE_BOOTMEM_HVO		0x0001
 #define HUGE_BOOTMEM_ZONES_VALID	0x0002
+#define HUGE_BOOTMEM_CMA		0x0004
 
 bool hugetlb_bootmem_page_zones_valid(int nid, struct huge_bootmem_page *m);
 
@@ -711,6 +717,7 @@  bool __init hugetlb_node_alloc_supported(void);
 
 void __init hugetlb_add_hstate(unsigned order);
 bool __init arch_hugetlb_valid_size(unsigned long size);
+bool __init arch_specific_huge_bootmem_alloc(struct hstate *h);
 struct hstate *size_to_hstate(unsigned long size);
 
 #ifndef HUGE_MAX_HSTATE
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 32ebde9039e2..183e8d0c2fb4 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -61,7 +61,7 @@  static struct cma *hugetlb_cma[MAX_NUMNODES];
 static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;
 #endif
 static bool hugetlb_cma_only;
-static unsigned long hugetlb_cma_size __initdata;
+static unsigned long hugetlb_cma_size;
 
 __initdata struct list_head huge_boot_pages[MAX_NUMNODES];
 __initdata unsigned long hstate_boot_nrinvalid[HUGE_MAX_HSTATE];
@@ -132,8 +132,10 @@  static void hugetlb_free_folio(struct folio *folio)
 #ifdef CONFIG_CMA
 	int nid = folio_nid(folio);
 
-	if (cma_free_folio(hugetlb_cma[nid], folio))
+	if (folio_test_hugetlb_cma(folio)) {
+		WARN_ON(!cma_free_folio(hugetlb_cma[nid], folio));
 		return;
+	}
 #endif
 	folio_put(folio);
 }
@@ -1509,6 +1511,9 @@  static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
 					break;
 			}
 		}
+
+		if (folio)
+			folio_set_hugetlb_cma(folio);
 	}
 #endif
 	if (!folio) {
@@ -3175,6 +3180,63 @@  struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 	return ERR_PTR(-ENOSPC);
 }
 
+/*
+ * Some architectures do their own bootmem allocation, so they can't use
+ * early CMA allocation. So, allow for this function to be redefined.
+ */
+bool __init __attribute((weak))
+arch_specific_huge_bootmem_alloc(struct hstate *h)
+{
+	return false;
+}
+
+static bool __init hugetlb_early_cma(struct hstate *h)
+{
+	if (arch_specific_huge_bootmem_alloc(h))
+		return false;
+
+	return (hstate_is_gigantic(h) && hugetlb_cma_size && hugetlb_cma_only);
+}
+
+static __init void *alloc_bootmem(struct hstate *h, int nid)
+{
+	struct huge_bootmem_page *m;
+	unsigned long flags;
+	struct cma *cma;
+
+#ifdef CONFIG_CMA
+	if (hugetlb_early_cma(h)) {
+		flags = HUGE_BOOTMEM_CMA;
+		cma = hugetlb_cma[nid];
+		m = cma_reserve_early(cma, huge_page_size(h));
+	} else
+#endif
+	{
+		flags = 0;
+		cma = NULL;
+		m = memblock_alloc_try_nid_raw(huge_page_size(h),
+			huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+	}
+
+	if (m) {
+		/*
+		 * Use the beginning of the huge page to store the
+		 * huge_bootmem_page struct (until gather_bootmem
+		 * puts them into the mem_map).
+		 *
+		 * Put them into a private list first because mem_map
+		 * is not up yet.
+		 */
+		INIT_LIST_HEAD(&m->list);
+		list_add(&m->list, &huge_boot_pages[nid]);
+		m->hstate = h;
+		m->flags = flags;
+		m->cma = cma;
+	}
+
+	return m;
+}
+
 int alloc_bootmem_huge_page(struct hstate *h, int nid)
 	__attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
 int __alloc_bootmem_huge_page(struct hstate *h, int nid)
@@ -3184,17 +3246,14 @@  int __alloc_bootmem_huge_page(struct hstate *h, int nid)
 
 	/* do node specific alloc */
 	if (nid != NUMA_NO_NODE) {
-		m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
-				0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+		m = alloc_bootmem(h, node);
 		if (!m)
 			return 0;
 		goto found;
 	}
 	/* allocate from next node when distributing huge pages */
 	for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_ONLINE]) {
-		m = memblock_alloc_try_nid_raw(
-				huge_page_size(h), huge_page_size(h),
-				0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
+		m = alloc_bootmem(h, node);
 		if (m)
 			break;
 	}
@@ -3203,7 +3262,6 @@  int __alloc_bootmem_huge_page(struct hstate *h, int nid)
 		return 0;
 
 found:
-
 	/*
 	 * Only initialize the head struct page in memmap_init_reserved_pages,
 	 * rest of the struct pages will be initialized by the HugeTLB
@@ -3213,18 +3271,6 @@  int __alloc_bootmem_huge_page(struct hstate *h, int nid)
 	 */
 	memblock_reserved_mark_noinit(virt_to_phys((void *)m + PAGE_SIZE),
 		huge_page_size(h) - PAGE_SIZE);
-	/*
-	 * Use the beginning of the huge page to store the
-	 * huge_bootmem_page struct (until gather_bootmem
-	 * puts them into the mem_map).
-	 *
-	 * Put them into a private list first because mem_map
-	 * is not up yet.
-	 */
-	INIT_LIST_HEAD(&m->list);
-	list_add(&m->list, &huge_boot_pages[node]);
-	m->hstate = h;
-	m->flags = 0;
 	return 1;
 }
 
@@ -3265,13 +3311,25 @@  static void __init hugetlb_folio_init_vmemmap(struct folio *folio,
 	prep_compound_head((struct page *)folio, huge_page_order(h));
 }
 
+static bool __init hugetlb_bootmem_page_prehvo(struct huge_bootmem_page *m)
+{
+	return (m->flags & HUGE_BOOTMEM_HVO);
+}
+
+static bool __init hugetlb_bootmem_page_earlycma(struct huge_bootmem_page *m)
+{
+	return (m->flags & HUGE_BOOTMEM_CMA);
+}
+
 /*
  * memblock-allocated pageblocks might not have the migrate type set
  * if marked with the 'noinit' flag. Set it to the default (MIGRATE_MOVABLE)
- * here.
+ * here, or MIGRATE_CMA if this was a page allocated through an early CMA
+ * reservation.
  *
- * Note that this will not write the page struct, it is ok (and necessary)
- * to do this on vmemmap optimized folios.
+ * In case of vmemmap optimized folios, the tail vmemmap pages are mapped
+ * read-only, but that's ok - for sparse vmemmap this does not write to
+ * the page structure.
  */
 static void __init hugetlb_bootmem_init_migratetype(struct folio *folio,
 							  struct hstate *h)
@@ -3280,9 +3338,13 @@  static void __init hugetlb_bootmem_init_migratetype(struct folio *folio,
 
 	WARN_ON_ONCE(!pageblock_aligned(folio_pfn(folio)));
 
-	for (i = 0; i < nr_pages; i += pageblock_nr_pages)
-		set_pageblock_migratetype(folio_page(folio, i),
+	for (i = 0; i < nr_pages; i += pageblock_nr_pages) {
+		if (folio_test_hugetlb_cma(folio))
+			init_cma_pageblock(folio_page(folio, i));
+		else
+			set_pageblock_migratetype(folio_page(folio, i),
 					  MIGRATE_MOVABLE);
+	}
 }
 
 static void __init prep_and_add_bootmem_folios(struct hstate *h,
@@ -3319,7 +3381,7 @@  bool __init hugetlb_bootmem_page_zones_valid(int nid,
 					     struct huge_bootmem_page *m)
 {
 	unsigned long start_pfn;
-	bool valid;
+	bool valid = false;
 
 	if (m->flags & HUGE_BOOTMEM_ZONES_VALID) {
 		/*
@@ -3328,10 +3390,16 @@  bool __init hugetlb_bootmem_page_zones_valid(int nid,
 		return true;
 	}
 
+	if (hugetlb_bootmem_page_earlycma(m)) {
+		valid = cma_validate_zones(m->cma);
+		goto out;
+	}
+
 	start_pfn = virt_to_phys(m) >> PAGE_SHIFT;
 
 	valid = !pfn_range_intersects_zones(nid, start_pfn,
 			pages_per_huge_page(m->hstate));
+out:
 	if (!valid)
 		hstate_boot_nrinvalid[hstate_index(m->hstate)]++;
 
@@ -3360,11 +3428,6 @@  static void __init hugetlb_bootmem_free_invalid_page(int nid, struct page *page,
 	}
 }
 
-static bool __init hugetlb_bootmem_page_prehvo(struct huge_bootmem_page *m)
-{
-	return (m->flags & HUGE_BOOTMEM_HVO);
-}
-
 /*
  * Put bootmem huge pages into the standard lists after mem_map is up.
  * Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages.
@@ -3414,6 +3477,9 @@  static void __init gather_bootmem_prealloc_node(unsigned long nid)
 			 */
 			folio_set_hugetlb_vmemmap_optimized(folio);
 
+		if (hugetlb_bootmem_page_earlycma(m))
+			folio_set_hugetlb_cma(folio);
+
 		list_add(&folio->lru, &folio_list);
 
 		/*
@@ -3606,8 +3672,11 @@  static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
 {
 	unsigned long allocated;
 
-	/* skip gigantic hugepages allocation if hugetlb_cma enabled */
-	if (hstate_is_gigantic(h) && hugetlb_cma_size) {
+	/*
+	 * Skip gigantic hugepages allocation if early CMA
+	 * reservations are not available.
+	 */
+	if (hstate_is_gigantic(h) && hugetlb_cma_size && !hugetlb_early_cma(h)) {
 		pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
 		return;
 	}