
[v2,05/19] mm/hugetlb: Introduce pgtable allocation/freeing helpers

Message ID 20201026145114.59424-6-songmuchun@bytedance.com (mailing list archive)
State New, archived
Series Free some vmemmap pages of hugetlb page

Commit Message

Muchun Song Oct. 26, 2020, 2:51 p.m. UTC
On some architectures, the vmemmap areas use huge page mapping.
If we want to free the unused vmemmap pages, we have to split
the huge pmd firstly. So we should pre-allocate pgtable to split
huge pmd.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 arch/x86/include/asm/hugetlb.h |   5 ++
 include/linux/hugetlb.h        |  17 +++++
 mm/hugetlb.c                   | 117 +++++++++++++++++++++++++++++++++
 3 files changed, 139 insertions(+)

Comments

Mike Kravetz Oct. 28, 2020, 12:32 a.m. UTC | #1
On 10/26/20 7:51 AM, Muchun Song wrote:
> On some architectures, the vmemmap areas use huge page mapping.
> If we want to free the unused vmemmap pages, we have to split
> the huge pmd firstly. So we should pre-allocate pgtable to split
> huge pmd.
> 
> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> ---
>  arch/x86/include/asm/hugetlb.h |   5 ++
>  include/linux/hugetlb.h        |  17 +++++
>  mm/hugetlb.c                   | 117 +++++++++++++++++++++++++++++++++
>  3 files changed, 139 insertions(+)
> 
> diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
> index 1721b1aadeb1..f5e882f999cd 100644
> --- a/arch/x86/include/asm/hugetlb.h
> +++ b/arch/x86/include/asm/hugetlb.h
> @@ -5,6 +5,11 @@
>  #include <asm/page.h>
>  #include <asm-generic/hugetlb.h>
>  
> +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
> +#define VMEMMAP_HPAGE_SHIFT			PMD_SHIFT
> +#define arch_vmemmap_support_huge_mapping()	boot_cpu_has(X86_FEATURE_PSE)
> +#endif
> +
>  #define hugepages_supported() boot_cpu_has(X86_FEATURE_PSE)
>  
>  #endif /* _ASM_X86_HUGETLB_H */
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index eed3dd3bd626..ace304a6196c 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -593,6 +593,23 @@ static inline unsigned int blocks_per_huge_page(struct hstate *h)
>  
>  #include <asm/hugetlb.h>
>  
> +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
> +#ifndef arch_vmemmap_support_huge_mapping
> +static inline bool arch_vmemmap_support_huge_mapping(void)
> +{
> +	return false;
> +}
> +#endif
> +
> +#ifndef VMEMMAP_HPAGE_SHIFT
> +#define VMEMMAP_HPAGE_SHIFT		PMD_SHIFT
> +#endif
> +#define VMEMMAP_HPAGE_ORDER		(VMEMMAP_HPAGE_SHIFT - PAGE_SHIFT)
> +#define VMEMMAP_HPAGE_NR		(1 << VMEMMAP_HPAGE_ORDER)
> +#define VMEMMAP_HPAGE_SIZE		((1UL) << VMEMMAP_HPAGE_SHIFT)
> +#define VMEMMAP_HPAGE_MASK		(~(VMEMMAP_HPAGE_SIZE - 1))
> +#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */
> +
>  #ifndef is_hugepage_only_range
>  static inline int is_hugepage_only_range(struct mm_struct *mm,
>  					unsigned long addr, unsigned long len)
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index f1b2b733b49b..d6ae9b6876be 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1295,11 +1295,108 @@ static inline void destroy_compound_gigantic_page(struct page *page,
>  #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
>  #define RESERVE_VMEMMAP_NR	2U
>  
> +#define page_huge_pte(page)	((page)->pmd_huge_pte)
> +

I am not good at function names.  The following suggestions may be too
verbose.  However, they helped me understand the purpose of the routines.

>  static inline unsigned int nr_free_vmemmap(struct hstate *h)

	perhaps?	 	free_vmemmap_pages_per_hpage()

>  {
>  	return h->nr_free_vmemmap_pages;
>  }
>  
> +static inline unsigned int nr_vmemmap(struct hstate *h)

	perhaps?		vmemmap_pages_per_hpage()

> +{
> +	return nr_free_vmemmap(h) + RESERVE_VMEMMAP_NR;
> +}
> +
> +static inline unsigned long nr_vmemmap_size(struct hstate *h)

	perhaps?		vmemmap_pages_size_per_hpage()

> +{
> +	return (unsigned long)nr_vmemmap(h) << PAGE_SHIFT;
> +}
> +
> +static inline unsigned int nr_pgtable(struct hstate *h)

	perhaps?	pgtable_pages_to_prealloc_per_hpage()

> +{
> +	unsigned long vmemmap_size = nr_vmemmap_size(h);
> +
> +	if (!arch_vmemmap_support_huge_mapping())
> +		return 0;
> +
> +	/*
> +	 * No need to pre-allocate page tables when there are no vmemmap pages
> +	 * to free.
> +	 */
> +	if (!nr_free_vmemmap(h))
> +		return 0;
> +
> +	return ALIGN(vmemmap_size, VMEMMAP_HPAGE_SIZE) >> VMEMMAP_HPAGE_SHIFT;
> +}
> +
> +static inline void vmemmap_pgtable_init(struct page *page)
> +{
> +	page_huge_pte(page) = NULL;
> +}
> +

I see the following routines follow the pattern for vmemmap manipulation
in dax.

> +static void vmemmap_pgtable_deposit(struct page *page, pte_t *pte_p)
> +{
> +	pgtable_t pgtable = virt_to_page(pte_p);
> +
> +	/* FIFO */
> +	if (!page_huge_pte(page))
> +		INIT_LIST_HEAD(&pgtable->lru);
> +	else
> +		list_add(&pgtable->lru, &page_huge_pte(page)->lru);
> +	page_huge_pte(page) = pgtable;
> +}
> +
> +static pte_t *vmemmap_pgtable_withdraw(struct page *page)
> +{
> +	pgtable_t pgtable;
> +
> +	/* FIFO */
> +	pgtable = page_huge_pte(page);
> +	if (unlikely(!pgtable))
> +		return NULL;
> +	page_huge_pte(page) = list_first_entry_or_null(&pgtable->lru,
> +						       struct page, lru);
> +	if (page_huge_pte(page))
> +		list_del(&pgtable->lru);
> +	return page_to_virt(pgtable);
> +}
> +
> +static int vmemmap_pgtable_prealloc(struct hstate *h, struct page *page)
> +{
> +	int i;
> +	pte_t *pte_p;
> +	unsigned int nr = nr_pgtable(h);
> +
> +	if (!nr)
> +		return 0;
> +
> +	vmemmap_pgtable_init(page);
> +
> +	for (i = 0; i < nr; i++) {
> +		pte_p = pte_alloc_one_kernel(&init_mm);
> +		if (!pte_p)
> +			goto out;
> +		vmemmap_pgtable_deposit(page, pte_p);
> +	}
> +
> +	return 0;
> +out:
> +	while (i-- && (pte_p = vmemmap_pgtable_withdraw(page)))
> +		pte_free_kernel(&init_mm, pte_p);
> +	return -ENOMEM;
> +}
> +
> +static inline void vmemmap_pgtable_free(struct hstate *h, struct page *page)
> +{
> +	pte_t *pte_p;
> +
> +	if (!nr_pgtable(h))
> +		return;
> +
> +	while ((pte_p = vmemmap_pgtable_withdraw(page)))
> +		pte_free_kernel(&init_mm, pte_p);
> +}
> +
>  static void __init hugetlb_vmemmap_init(struct hstate *h)
>  {
>  	unsigned int order = huge_page_order(h);
> @@ -1323,6 +1420,15 @@ static void __init hugetlb_vmemmap_init(struct hstate *h)
>  static inline void hugetlb_vmemmap_init(struct hstate *h)
>  {
>  }
> +
> +static inline int vmemmap_pgtable_prealloc(struct hstate *h, struct page *page)
> +{
> +	return 0;
> +}
> +
> +static inline void vmemmap_pgtable_free(struct hstate *h, struct page *page)
> +{
> +}
>  #endif
>  
>  static void update_and_free_page(struct hstate *h, struct page *page)
> @@ -1531,6 +1637,9 @@ void free_huge_page(struct page *page)
>  
>  static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
>  {
> +	/* Must be called before the initialization of @page->lru */
> +	vmemmap_pgtable_free(h, page);
> +
>  	INIT_LIST_HEAD(&page->lru);
>  	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
>  	set_hugetlb_cgroup(page, NULL);
> @@ -1783,6 +1892,14 @@ static struct page *alloc_fresh_huge_page(struct hstate *h,
>  	if (!page)
>  		return NULL;
>  
> +	if (vmemmap_pgtable_prealloc(h, page)) {
> +		if (hstate_is_gigantic(h))
> +			free_gigantic_page(page, huge_page_order(h));
> +		else
> +			put_page(page);
> +		return NULL;
> +	}
> +

It seems a bit strange that we will fail a huge page allocation if
vmemmap_pgtable_prealloc fails.  Not sure, but it almost seems like we should
allow the allocation and log a warning?  It is somewhat unfortunate that
we need to allocate a page to free pages.
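
Something like this, perhaps (completely untested sketch):

	if (vmemmap_pgtable_prealloc(h, page))
		pr_warn_once("HugeTLB: vmemmap pgtable preallocation failed\n");

Although we would of course still need to remember that the vmemmap of such
a page was not freed.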

>  	if (hstate_is_gigantic(h))
>  		prep_compound_gigantic_page(page, huge_page_order(h));
>  	prep_new_huge_page(h, page, page_to_nid(page));
>
Muchun Song Oct. 28, 2020, 7:26 a.m. UTC | #2
On Wed, Oct 28, 2020 at 8:33 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
> On 10/26/20 7:51 AM, Muchun Song wrote:
> > On some architectures, the vmemmap areas use huge page mapping.
> > If we want to free the unused vmemmap pages, we have to split
> > the huge pmd firstly. So we should pre-allocate pgtable to split
> > huge pmd.
> >
> > Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> > ---
> >  arch/x86/include/asm/hugetlb.h |   5 ++
> >  include/linux/hugetlb.h        |  17 +++++
> >  mm/hugetlb.c                   | 117 +++++++++++++++++++++++++++++++++
> >  3 files changed, 139 insertions(+)
> >
> > diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
> > index 1721b1aadeb1..f5e882f999cd 100644
> > --- a/arch/x86/include/asm/hugetlb.h
> > +++ b/arch/x86/include/asm/hugetlb.h
> > @@ -5,6 +5,11 @@
> >  #include <asm/page.h>
> >  #include <asm-generic/hugetlb.h>
> >
> > +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
> > +#define VMEMMAP_HPAGE_SHIFT                  PMD_SHIFT
> > +#define arch_vmemmap_support_huge_mapping()  boot_cpu_has(X86_FEATURE_PSE)
> > +#endif
> > +
> >  #define hugepages_supported() boot_cpu_has(X86_FEATURE_PSE)
> >
> >  #endif /* _ASM_X86_HUGETLB_H */
> > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > index eed3dd3bd626..ace304a6196c 100644
> > --- a/include/linux/hugetlb.h
> > +++ b/include/linux/hugetlb.h
> > @@ -593,6 +593,23 @@ static inline unsigned int blocks_per_huge_page(struct hstate *h)
> >
> >  #include <asm/hugetlb.h>
> >
> > +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
> > +#ifndef arch_vmemmap_support_huge_mapping
> > +static inline bool arch_vmemmap_support_huge_mapping(void)
> > +{
> > +     return false;
> > +}
> > +#endif
> > +
> > +#ifndef VMEMMAP_HPAGE_SHIFT
> > +#define VMEMMAP_HPAGE_SHIFT          PMD_SHIFT
> > +#endif
> > +#define VMEMMAP_HPAGE_ORDER          (VMEMMAP_HPAGE_SHIFT - PAGE_SHIFT)
> > +#define VMEMMAP_HPAGE_NR             (1 << VMEMMAP_HPAGE_ORDER)
> > +#define VMEMMAP_HPAGE_SIZE           ((1UL) << VMEMMAP_HPAGE_SHIFT)
> > +#define VMEMMAP_HPAGE_MASK           (~(VMEMMAP_HPAGE_SIZE - 1))
> > +#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */
> > +
> >  #ifndef is_hugepage_only_range
> >  static inline int is_hugepage_only_range(struct mm_struct *mm,
> >                                       unsigned long addr, unsigned long len)
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index f1b2b733b49b..d6ae9b6876be 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -1295,11 +1295,108 @@ static inline void destroy_compound_gigantic_page(struct page *page,
> >  #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
> >  #define RESERVE_VMEMMAP_NR   2U
> >
> > +#define page_huge_pte(page)  ((page)->pmd_huge_pte)
> > +
>
> I am not good at function names.  The following suggestions may be too
> verbose.  However, they helped me understand the purpose of the routines.
>
> >  static inline unsigned int nr_free_vmemmap(struct hstate *h)
>
>         perhaps?                free_vmemmap_pages_per_hpage()
>
> >  {
> >       return h->nr_free_vmemmap_pages;
> >  }
> >
> > +static inline unsigned int nr_vmemmap(struct hstate *h)
>
>         perhaps?                vmemmap_pages_per_hpage()
>
> > +{
> > +     return nr_free_vmemmap(h) + RESERVE_VMEMMAP_NR;
> > +}
> > +
> > +static inline unsigned long nr_vmemmap_size(struct hstate *h)
>
>         perhaps?                vmemmap_pages_size_per_hpage()
>
> > +{
> > +     return (unsigned long)nr_vmemmap(h) << PAGE_SHIFT;
> > +}
> > +
> > +static inline unsigned int nr_pgtable(struct hstate *h)
>
>         perhaps?        pgtable_pages_to_prealloc_per_hpage()

Good suggestions. Thanks. I will apply this.

>
> > +{
> > +     unsigned long vmemmap_size = nr_vmemmap_size(h);
> > +
> > +     if (!arch_vmemmap_support_huge_mapping())
> > +             return 0;
> > +
> > +     /*
> > +      * No need to pre-allocate page tables when there are no vmemmap pages
> > +      * to free.
> > +      */
> > +     if (!nr_free_vmemmap(h))
> > +             return 0;
> > +
> > +     return ALIGN(vmemmap_size, VMEMMAP_HPAGE_SIZE) >> VMEMMAP_HPAGE_SHIFT;
> > +}
> > +
> > +static inline void vmemmap_pgtable_init(struct page *page)
> > +{
> > +     page_huge_pte(page) = NULL;
> > +}
> > +
>
> I see the following routines follow the pattern for vmemmap manipulation
> in dax.

Did you mean to move those functions to mm/sparse-vmemmap.c?

>
> > +static void vmemmap_pgtable_deposit(struct page *page, pte_t *pte_p)
> > +{
> > +     pgtable_t pgtable = virt_to_page(pte_p);
> > +
> > +     /* FIFO */
> > +     if (!page_huge_pte(page))
> > +             INIT_LIST_HEAD(&pgtable->lru);
> > +     else
> > +             list_add(&pgtable->lru, &page_huge_pte(page)->lru);
> > +     page_huge_pte(page) = pgtable;
> > +}
> > +
> > +static pte_t *vmemmap_pgtable_withdraw(struct page *page)
> > +{
> > +     pgtable_t pgtable;
> > +
> > +     /* FIFO */
> > +     pgtable = page_huge_pte(page);
> > +     if (unlikely(!pgtable))
> > +             return NULL;
> > +     page_huge_pte(page) = list_first_entry_or_null(&pgtable->lru,
> > +                                                    struct page, lru);
> > +     if (page_huge_pte(page))
> > +             list_del(&pgtable->lru);
> > +     return page_to_virt(pgtable);
> > +}
> > +
> > +static int vmemmap_pgtable_prealloc(struct hstate *h, struct page *page)
> > +{
> > +     int i;
> > +     pte_t *pte_p;
> > +     unsigned int nr = nr_pgtable(h);
> > +
> > +     if (!nr)
> > +             return 0;
> > +
> > +     vmemmap_pgtable_init(page);
> > +
> > +     for (i = 0; i < nr; i++) {
> > +             pte_p = pte_alloc_one_kernel(&init_mm);
> > +             if (!pte_p)
> > +                     goto out;
> > +             vmemmap_pgtable_deposit(page, pte_p);
> > +     }
> > +
> > +     return 0;
> > +out:
> > +     while (i-- && (pte_p = vmemmap_pgtable_withdraw(page)))
> > +             pte_free_kernel(&init_mm, pte_p);
> > +     return -ENOMEM;
> > +}
> > +
> > +static inline void vmemmap_pgtable_free(struct hstate *h, struct page *page)
> > +{
> > +     pte_t *pte_p;
> > +
> > +     if (!nr_pgtable(h))
> > +             return;
> > +
> > +     while ((pte_p = vmemmap_pgtable_withdraw(page)))
> > +             pte_free_kernel(&init_mm, pte_p);
> > +}
> > +
> >  static void __init hugetlb_vmemmap_init(struct hstate *h)
> >  {
> >       unsigned int order = huge_page_order(h);
> > @@ -1323,6 +1420,15 @@ static void __init hugetlb_vmemmap_init(struct hstate *h)
> >  static inline void hugetlb_vmemmap_init(struct hstate *h)
> >  {
> >  }
> > +
> > +static inline int vmemmap_pgtable_prealloc(struct hstate *h, struct page *page)
> > +{
> > +     return 0;
> > +}
> > +
> > +static inline void vmemmap_pgtable_free(struct hstate *h, struct page *page)
> > +{
> > +}
> >  #endif
> >
> >  static void update_and_free_page(struct hstate *h, struct page *page)
> > @@ -1531,6 +1637,9 @@ void free_huge_page(struct page *page)
> >
> >  static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
> >  {
> > +     /* Must be called before the initialization of @page->lru */
> > +     vmemmap_pgtable_free(h, page);
> > +
> >       INIT_LIST_HEAD(&page->lru);
> >       set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
> >       set_hugetlb_cgroup(page, NULL);
> > @@ -1783,6 +1892,14 @@ static struct page *alloc_fresh_huge_page(struct hstate *h,
> >       if (!page)
> >               return NULL;
> >
> > +     if (vmemmap_pgtable_prealloc(h, page)) {
> > +             if (hstate_is_gigantic(h))
> > +                     free_gigantic_page(page, huge_page_order(h));
> > +             else
> > +                     put_page(page);
> > +             return NULL;
> > +     }
> > +
>
> It seems a bit strange that we will fail a huge page allocation if
> vmemmap_pgtable_prealloc fails.  Not sure, but it almost seems like we should
> allow the allocation and log a warning?  It is somewhat unfortunate that
> we need to allocate a page to free pages.

Yeah, it seems unfortunate. But if the allocation succeeds, we can free some
vmemmap pages later. Like a compromise :) . If we can successfully allocate
a huge page, I would also prefer to be able to successfully allocate one more
page for it. If we allow the allocation when vmemmap_pgtable_prealloc fails,
we also need to mark the page to record that its vmemmap has not been
released. That seems to increase complexity.

Thanks.

>
> >       if (hstate_is_gigantic(h))
> >               prep_compound_gigantic_page(page, huge_page_order(h));
> >       prep_new_huge_page(h, page, page_to_nid(page));
> >
>
>
> --
> Mike Kravetz
Mike Kravetz Oct. 28, 2020, 11:42 p.m. UTC | #3
On 10/28/20 12:26 AM, Muchun Song wrote:
> On Wed, Oct 28, 2020 at 8:33 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>> On 10/26/20 7:51 AM, Muchun Song wrote:
>>
>> I see the following routines follow the pattern for vmemmap manipulation
>> in dax.
> 
> Did you mean to move those functions to mm/sparse-vmemmap.c?

No.  Sorry, that was mostly a note to myself.

>>> +static void vmemmap_pgtable_deposit(struct page *page, pte_t *pte_p)
>>> +{
>>> +     pgtable_t pgtable = virt_to_page(pte_p);
>>> +
>>> +     /* FIFO */
>>> +     if (!page_huge_pte(page))
>>> +             INIT_LIST_HEAD(&pgtable->lru);
>>> +     else
>>> +             list_add(&pgtable->lru, &page_huge_pte(page)->lru);
>>> +     page_huge_pte(page) = pgtable;
>>> +}
>>> +
>>> +static pte_t *vmemmap_pgtable_withdraw(struct page *page)
>>> +{
>>> +     pgtable_t pgtable;
>>> +
>>> +     /* FIFO */
>>> +     pgtable = page_huge_pte(page);
>>> +     if (unlikely(!pgtable))
>>> +             return NULL;
>>> +     page_huge_pte(page) = list_first_entry_or_null(&pgtable->lru,
>>> +                                                    struct page, lru);
>>> +     if (page_huge_pte(page))
>>> +             list_del(&pgtable->lru);
>>> +     return page_to_virt(pgtable);
>>> +}
>>> +
...
>>> @@ -1783,6 +1892,14 @@ static struct page *alloc_fresh_huge_page(struct hstate *h,
>>>       if (!page)
>>>               return NULL;
>>>
>>> +     if (vmemmap_pgtable_prealloc(h, page)) {
>>> +             if (hstate_is_gigantic(h))
>>> +                     free_gigantic_page(page, huge_page_order(h));
>>> +             else
>>> +                     put_page(page);
>>> +             return NULL;
>>> +     }
>>> +
>>
>> It seems a bit strange that we will fail a huge page allocation if
>> vmemmap_pgtable_prealloc fails.  Not sure, but it almost seems like we should
>> allow the allocation and log a warning?  It is somewhat unfortunate that
>> we need to allocate a page to free pages.
> 
> Yeah, it seems unfortunate. But if the allocation succeeds, we can free some
> vmemmap pages later. Like a compromise :) . If we can successfully allocate
> a huge page, I would also prefer to be able to successfully allocate one more
> page for it. If we allow the allocation when vmemmap_pgtable_prealloc fails,
> we also need to mark the page to record that its vmemmap has not been
> released. That seems to increase complexity.

Yes, I think it is better to leave code as it is and avoid complexity.
Oscar Salvador Nov. 5, 2020, 1:23 p.m. UTC | #4
On Mon, Oct 26, 2020 at 10:51:00PM +0800, Muchun Song wrote:
> +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
> +#define VMEMMAP_HPAGE_SHIFT			PMD_SHIFT
> +#define arch_vmemmap_support_huge_mapping()	boot_cpu_has(X86_FEATURE_PSE)

I do not think you need this.
We already have hugepages_supported().
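
i.e. something like this in the helper that computes how many page tables
to preallocate (untested):

	if (!hugepages_supported())
		return 0;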

> +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
> +#ifndef arch_vmemmap_support_huge_mapping
> +static inline bool arch_vmemmap_support_huge_mapping(void)
> +{
> +	return false;
> +}

Same as above

>  static inline unsigned int nr_free_vmemmap(struct hstate *h)
>  {
>  	return h->nr_free_vmemmap_pages;
>  }
>  
> +static inline unsigned int nr_vmemmap(struct hstate *h)
> +{
> +	return nr_free_vmemmap(h) + RESERVE_VMEMMAP_NR;
> +}
> +
> +static inline unsigned long nr_vmemmap_size(struct hstate *h)
> +{
> +	return (unsigned long)nr_vmemmap(h) << PAGE_SHIFT;
> +}
> +
> +static inline unsigned int nr_pgtable(struct hstate *h)
> +{
> +	unsigned long vmemmap_size = nr_vmemmap_size(h);
> +
> +	if (!arch_vmemmap_support_huge_mapping())
> +		return 0;
> +
> +	/*
> +	 * No need pre-allocate page tabels when there is no vmemmap pages
> +	 * to free.
> +	 */
> +	if (!nr_free_vmemmap(h))
> +		return 0;
> +
> +	return ALIGN(vmemmap_size, VMEMMAP_HPAGE_SIZE) >> VMEMMAP_HPAGE_SHIFT;
> +}

IMHO, Mike's naming suggestions fit much better.

> +static void vmemmap_pgtable_deposit(struct page *page, pte_t *pte_p)
> +{
> +	pgtable_t pgtable = virt_to_page(pte_p);
> +
> +	/* FIFO */
> +	if (!page_huge_pte(page))
> +		INIT_LIST_HEAD(&pgtable->lru);
> +	else
> +		list_add(&pgtable->lru, &page_huge_pte(page)->lru);
> +	page_huge_pte(page) = pgtable;
> +}

I think it would make more sense if this took a pgtable argument
instead of a pte_t *.
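
Something along these lines (untested sketch):

static void vmemmap_pgtable_deposit(struct page *page, pgtable_t pgtable)
{
	/* FIFO */
	if (!page_huge_pte(page))
		INIT_LIST_HEAD(&pgtable->lru);
	else
		list_add(&pgtable->lru, &page_huge_pte(page)->lru);
	page_huge_pte(page) = pgtable;
}

and let the caller do the virt_to_page() conversion, e.g.
vmemmap_pgtable_deposit(page, virt_to_page(pte_p)).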

> +static pte_t *vmemmap_pgtable_withdraw(struct page *page)
> +{
> +	pgtable_t pgtable;
> +
> +	/* FIFO */
> +	pgtable = page_huge_pte(page);
> +	if (unlikely(!pgtable))
> +		return NULL;

AFAICS, the above check only needs to be run once.
I think we can move it to vmemmap_pgtable_free, can't we?

> +	page_huge_pte(page) = list_first_entry_or_null(&pgtable->lru,
> +						       struct page, lru);
> +	if (page_huge_pte(page))
> +		list_del(&pgtable->lru);
> +	return page_to_virt(pgtable);
> +}

At the risk of adding more code, I think it would be nice to return a
pgtable_t?
So it is more coherent with the name of the function and with what
we are doing.
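
Roughly (untested, and assuming the NULL check is moved to the caller as
mentioned above):

static pgtable_t vmemmap_pgtable_withdraw(struct page *page)
{
	pgtable_t pgtable = page_huge_pte(page);

	/* FIFO: advance the head to the next deposited pgtable, if any */
	page_huge_pte(page) = list_first_entry_or_null(&pgtable->lru,
						       struct page, lru);
	if (page_huge_pte(page))
		list_del(&pgtable->lru);
	return pgtable;
}

The error path in vmemmap_pgtable_prealloc() would need the same
adjustment.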

It is a pity we cannot converge these and pgtable_trans_huge_*.
They share some code but it is different enough.

> +static int vmemmap_pgtable_prealloc(struct hstate *h, struct page *page)
> +{
> +	int i;
> +	pte_t *pte_p;
> +	unsigned int nr = nr_pgtable(h);
> +
> +	if (!nr)
> +		return 0;
> +
> +	vmemmap_pgtable_init(page);

Maybe just open code this one?

> +static inline void vmemmap_pgtable_free(struct hstate *h, struct page *page)
> +{
> +	pte_t *pte_p;
> +
> +	if (!nr_pgtable(h))
> +		return;
> +
> +	while ((pte_p = vmemmap_pgtable_withdraw(page)))
> +		pte_free_kernel(&init_mm, pte_p);

As mentioned above, move the pgtable_t check from vmemmap_pgtable_withdraw
in here.
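
e.g. something like (untested, matching the pgtable_t-returning withdraw
sketched above):

static inline void vmemmap_pgtable_free(struct hstate *h, struct page *page)
{
	if (!nr_pgtable(h))
		return;

	/* page_huge_pte() becomes NULL once the last pgtable is withdrawn */
	while (page_huge_pte(page)) {
		pgtable_t pgtable = vmemmap_pgtable_withdraw(page);

		pte_free_kernel(&init_mm, page_to_virt(pgtable));
	}
}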

  
>  static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
>  {
> +	/* Must be called before the initialization of @page->lru */
> +	vmemmap_pgtable_free(h, page);
> +
>  	INIT_LIST_HEAD(&page->lru);
>  	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
>  	set_hugetlb_cgroup(page, NULL);
> @@ -1783,6 +1892,14 @@ static struct page *alloc_fresh_huge_page(struct hstate *h,
>  	if (!page)
>  		return NULL;
>  
> +	if (vmemmap_pgtable_prealloc(h, page)) {
> +		if (hstate_is_gigantic(h))
> +			free_gigantic_page(page, huge_page_order(h));
> +		else
> +			put_page(page);
> +		return NULL;
> +	}
> +

I must confess I am a bit puzzled.

IIUC, in this patch we are just adding the helpers to create/tear down the
page tables.
I do not think we actually need to call vmemmap_pgtable_prealloc/vmemmap_pgtable_free, do we?
In the end, we are just allocating pages for page tables and then freeing them shortly after.

I think it would make more sense to add the calls when they are actually needed?
Muchun Song Nov. 5, 2020, 4:08 p.m. UTC | #5
On Thu, Nov 5, 2020 at 9:23 PM Oscar Salvador <osalvador@suse.de> wrote:
>
> On Mon, Oct 26, 2020 at 10:51:00PM +0800, Muchun Song wrote:
> > +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
> > +#define VMEMMAP_HPAGE_SHIFT                  PMD_SHIFT
> > +#define arch_vmemmap_support_huge_mapping()  boot_cpu_has(X86_FEATURE_PSE)
>
> I do not think you need this.
> We already have hugepages_supported().

Maybe some architectures support hugepages, but the vmemmap does not
use huge page mappings. In this case, we need it. But I am not sure if that
exists in the real world. At least, x86 can reuse hugepages_supported().

>
> > +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
> > +#ifndef arch_vmemmap_support_huge_mapping
> > +static inline bool arch_vmemmap_support_huge_mapping(void)
> > +{
> > +     return false;
> > +}
>
> Same as above
>
> >  static inline unsigned int nr_free_vmemmap(struct hstate *h)
> >  {
> >       return h->nr_free_vmemmap_pages;
> >  }
> >
> > +static inline unsigned int nr_vmemmap(struct hstate *h)
> > +{
> > +     return nr_free_vmemmap(h) + RESERVE_VMEMMAP_NR;
> > +}
> > +
> > +static inline unsigned long nr_vmemmap_size(struct hstate *h)
> > +{
> > +     return (unsigned long)nr_vmemmap(h) << PAGE_SHIFT;
> > +}
> > +
> > +static inline unsigned int nr_pgtable(struct hstate *h)
> > +{
> > +     unsigned long vmemmap_size = nr_vmemmap_size(h);
> > +
> > +     if (!arch_vmemmap_support_huge_mapping())
> > +             return 0;
> > +
> > +     /*
> > +      * No need to pre-allocate page tables when there are no vmemmap pages
> > +      * to free.
> > +      */
> > +     if (!nr_free_vmemmap(h))
> > +             return 0;
> > +
> > +     return ALIGN(vmemmap_size, VMEMMAP_HPAGE_SIZE) >> VMEMMAP_HPAGE_SHIFT;
> > +}
>
> IMHO, Mike's naming suggestions fit much better.

I will do that.

>
> > +static void vmemmap_pgtable_deposit(struct page *page, pte_t *pte_p)
> > +{
> > +     pgtable_t pgtable = virt_to_page(pte_p);
> > +
> > +     /* FIFO */
> > +     if (!page_huge_pte(page))
> > +             INIT_LIST_HEAD(&pgtable->lru);
> > +     else
> > +             list_add(&pgtable->lru, &page_huge_pte(page)->lru);
> > +     page_huge_pte(page) = pgtable;
> > +}
>
> I think it would make more sense if this took a pgtable argument
> instead of a pte_t *.

Will do. Thanks for your suggestions.

>
> > +static pte_t *vmemmap_pgtable_withdraw(struct page *page)
> > +{
> > +     pgtable_t pgtable;
> > +
> > +     /* FIFO */
> > +     pgtable = page_huge_pte(page);
> > +     if (unlikely(!pgtable))
> > +             return NULL;
>
> AFAICS, the above check only needs to be run once.
> I think we can move it to vmemmap_pgtable_free, can't we?

Yeah, we can. Thanks.

>
> > +     page_huge_pte(page) = list_first_entry_or_null(&pgtable->lru,
> > +                                                    struct page, lru);
> > +     if (page_huge_pte(page))
> > +             list_del(&pgtable->lru);
> > +     return page_to_virt(pgtable);
> > +}
>
> At the risk of adding more code, I think it would be nice to return a
> pgtable_t?
> So it is more coherent with the name of the function and with what
> we are doing.

Yeah.

>
> It is a pity we cannot converge these and pgtable_trans_huge_*.
> They share some code but it is different enough.
>
> > +static int vmemmap_pgtable_prealloc(struct hstate *h, struct page *page)
> > +{
> > +     int i;
> > +     pte_t *pte_p;
> > +     unsigned int nr = nr_pgtable(h);
> > +
> > +     if (!nr)
> > +             return 0;
> > +
> > +     vmemmap_pgtable_init(page);
>
> Maybe just open code this one?

Sorry. I don't quite understand what you mean. Could you explain?


>
> > +static inline void vmemmap_pgtable_free(struct hstate *h, struct page *page)
> > +{
> > +     pte_t *pte_p;
> > +
> > +     if (!nr_pgtable(h))
> > +             return;
> > +
> > +     while ((pte_p = vmemmap_pgtable_withdraw(page)))
> > +             pte_free_kernel(&init_mm, pte_p);
>
> As mentioned above, move the pgtable_t check from vmemmap_pgtable_withdraw
> in here.

OK.

>
>
> >  static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
> >  {
> > +     /* Must be called before the initialization of @page->lru */
> > +     vmemmap_pgtable_free(h, page);
> > +
> >       INIT_LIST_HEAD(&page->lru);
> >       set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
> >       set_hugetlb_cgroup(page, NULL);
> > @@ -1783,6 +1892,14 @@ static struct page *alloc_fresh_huge_page(struct hstate *h,
> >       if (!page)
> >               return NULL;
> >
> > +     if (vmemmap_pgtable_prealloc(h, page)) {
> > +             if (hstate_is_gigantic(h))
> > +                     free_gigantic_page(page, huge_page_order(h));
> > +             else
> > +                     put_page(page);
> > +             return NULL;
> > +     }
> > +
>
> I must confess I am a bit puzzled.
>
> IIUC, in this patch we are just adding the helpers to create/tear down the
> page tables.
> I do not think we actually need to call vmemmap_pgtable_prealloc/vmemmap_pgtable_free, do we?
> In the end, we are just allocating pages for page tables and then freeing them shortly after.
>
> I think it would make more sense to add the calls when they are actually needed?

OK,  will do. Thanks very much.

>
>
> --
> Oscar Salvador
> SUSE L3



--
Yours,
Muchun
Oscar Salvador Nov. 6, 2020, 9:46 a.m. UTC | #6
On Fri, Nov 06, 2020 at 12:08:22AM +0800, Muchun Song wrote:
> > I do not think you need this.
> > We already have hugepages_supported().
> 
> > Maybe some architectures support hugepages, but the vmemmap does not
> > use huge page mappings. In this case, we need it. But I am not sure if that
> > exists in the real world. At least, x86 can reuse hugepages_supported().

Yes, but that is the point.
IIUC, this patchset will enable HugeTLB vmemmap pages only for x86_64.
Then, let us make the patchset specific to that architecture.

If at some point this grows more users (powerpc, arm, ...), then we
can add the missing code, but for now it makes sense to only include
the bits to make this work on x86_64.

And also, according to this, the changelog is a bit "misleading".

"On some architectures, the vmemmap areas use huge page mapping.
If we want to free the unused vmemmap pages, we have to split
the huge pmd firstly. So we should pre-allocate pgtable to split
huge pmd."

On x86_64, vmemmap is always PMD mapped if the machine has hugepages
support and if we have 2MB of contiguous, PMD-aligned memory.
e.g.: I have seen cases where, after the system has run for a period
of time, hotplug operations were mapping the vmemmap representing
the hot-added range on a page basis, because we could not find
enough contiguous and aligned memory.

Something that [1] tries to solve:

[1] https://patchwork.kernel.org/project/linux-mm/cover/20201022125835.26396-1-osalvador@suse.de/

But anyway, my point is that let us make it clear in the changelog that
this is aimed at x86_64 at the moment.
Saying "on some architectures" might make people think that this is not
x86_64 specific.

>> > > +     vmemmap_pgtable_init(page);
> >
> > Maybe just open code this one?
> 
> > Sorry. I don't quite understand what you mean. Could you explain?

I meant doing 

page_huge_pte(page) = NULL

But no strong feelings.
Muchun Song Nov. 6, 2020, 4:43 p.m. UTC | #7
On Fri, Nov 6, 2020 at 5:46 PM Oscar Salvador <osalvador@suse.de> wrote:
>
> On Fri, Nov 06, 2020 at 12:08:22AM +0800, Muchun Song wrote:
> > > I do not think you need this.
> > > We already have hugepages_supported().
> >
> > Maybe some architectures support hugepage, but the vmemmap do not
> > use the hugepage map. In  this case, we need it. But I am not sure if it
> > exists in the real world. At least, x86 can reuse hugepages_supported.
>
> Yes, but that is the point.
> IIUC, this patchset will enable HugeTLB vmemmap pages only for x86_64.
> Then, let us make the patchset specific to that architecture.
>
> If at some point this grows more users (powerpc, arm, ...), then we
> can add the missing code, but for now it makes sense to only include
> the bits to make this work on x86_64.
>
> And also according to this the changelog is a bit "misleading".
>
> "On some architectures, the vmemmap areas use huge page mapping.
> If we want to free the unused vmemmap pages, we have to split
> the huge pmd firstly. So we should pre-allocate pgtable to split
> huge pmd."

Thanks for your suggestions. I will rewrite the commit log.

>
> On x86_64, vmemmap is always PMD mapped if the machine has hugepages
> support and if we have 2MB contiguos pages and PMD aligned.
> e.g: I have seen cases where after the system has ran for a period
> of time hotplug operations were mapping the vmemmap representing
> the hot-added range on page base, because we could not find
> enough contiguos and aligned memory.
>
> Something that [1] tries to solve:
>
> [1] https://patchwork.kernel.org/project/linux-mm/cover/20201022125835.26396-1-osalvador@suse.de/
>
> But anyway, my point is that let us make it clear in the changelog that
> this is aimed for x86_64 at the moment.
> Saying "on some architures" might make think people that this is not
> x86_64 specific.
>
> >> > > +     vmemmap_pgtable_init(page);
> > >
> > > Maybe just open code this one?
> >
> > Sorry. I don't quite understand what it means. Could you explain?
>
> I meant doing
>
> page_huge_pte(page) = NULL
>
> But no strong feelings.
>
> --
> Oscar Salvador
> SUSE L3

Patch

diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
index 1721b1aadeb1..f5e882f999cd 100644
--- a/arch/x86/include/asm/hugetlb.h
+++ b/arch/x86/include/asm/hugetlb.h
@@ -5,6 +5,11 @@ 
 #include <asm/page.h>
 #include <asm-generic/hugetlb.h>
 
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+#define VMEMMAP_HPAGE_SHIFT			PMD_SHIFT
+#define arch_vmemmap_support_huge_mapping()	boot_cpu_has(X86_FEATURE_PSE)
+#endif
+
 #define hugepages_supported() boot_cpu_has(X86_FEATURE_PSE)
 
 #endif /* _ASM_X86_HUGETLB_H */
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index eed3dd3bd626..ace304a6196c 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -593,6 +593,23 @@  static inline unsigned int blocks_per_huge_page(struct hstate *h)
 
 #include <asm/hugetlb.h>
 
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+#ifndef arch_vmemmap_support_huge_mapping
+static inline bool arch_vmemmap_support_huge_mapping(void)
+{
+	return false;
+}
+#endif
+
+#ifndef VMEMMAP_HPAGE_SHIFT
+#define VMEMMAP_HPAGE_SHIFT		PMD_SHIFT
+#endif
+#define VMEMMAP_HPAGE_ORDER		(VMEMMAP_HPAGE_SHIFT - PAGE_SHIFT)
+#define VMEMMAP_HPAGE_NR		(1 << VMEMMAP_HPAGE_ORDER)
+#define VMEMMAP_HPAGE_SIZE		((1UL) << VMEMMAP_HPAGE_SHIFT)
+#define VMEMMAP_HPAGE_MASK		(~(VMEMMAP_HPAGE_SIZE - 1))
+#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */
+
 #ifndef is_hugepage_only_range
 static inline int is_hugepage_only_range(struct mm_struct *mm,
 					unsigned long addr, unsigned long len)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f1b2b733b49b..d6ae9b6876be 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1295,11 +1295,108 @@  static inline void destroy_compound_gigantic_page(struct page *page,
 #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
 #define RESERVE_VMEMMAP_NR	2U
 
+#define page_huge_pte(page)	((page)->pmd_huge_pte)
+
 static inline unsigned int nr_free_vmemmap(struct hstate *h)
 {
 	return h->nr_free_vmemmap_pages;
 }
 
+static inline unsigned int nr_vmemmap(struct hstate *h)
+{
+	return nr_free_vmemmap(h) + RESERVE_VMEMMAP_NR;
+}
+
+static inline unsigned long nr_vmemmap_size(struct hstate *h)
+{
+	return (unsigned long)nr_vmemmap(h) << PAGE_SHIFT;
+}
+
+static inline unsigned int nr_pgtable(struct hstate *h)
+{
+	unsigned long vmemmap_size = nr_vmemmap_size(h);
+
+	if (!arch_vmemmap_support_huge_mapping())
+		return 0;
+
+	/*
+	 * No need to pre-allocate page tables when there are no vmemmap pages
+	 * to free.
+	 */
+	if (!nr_free_vmemmap(h))
+		return 0;
+
+	return ALIGN(vmemmap_size, VMEMMAP_HPAGE_SIZE) >> VMEMMAP_HPAGE_SHIFT;
+}
+
+static inline void vmemmap_pgtable_init(struct page *page)
+{
+	page_huge_pte(page) = NULL;
+}
+
+static void vmemmap_pgtable_deposit(struct page *page, pte_t *pte_p)
+{
+	pgtable_t pgtable = virt_to_page(pte_p);
+
+	/* FIFO */
+	if (!page_huge_pte(page))
+		INIT_LIST_HEAD(&pgtable->lru);
+	else
+		list_add(&pgtable->lru, &page_huge_pte(page)->lru);
+	page_huge_pte(page) = pgtable;
+}
+
+static pte_t *vmemmap_pgtable_withdraw(struct page *page)
+{
+	pgtable_t pgtable;
+
+	/* FIFO */
+	pgtable = page_huge_pte(page);
+	if (unlikely(!pgtable))
+		return NULL;
+	page_huge_pte(page) = list_first_entry_or_null(&pgtable->lru,
+						       struct page, lru);
+	if (page_huge_pte(page))
+		list_del(&pgtable->lru);
+	return page_to_virt(pgtable);
+}
+
+static int vmemmap_pgtable_prealloc(struct hstate *h, struct page *page)
+{
+	int i;
+	pte_t *pte_p;
+	unsigned int nr = nr_pgtable(h);
+
+	if (!nr)
+		return 0;
+
+	vmemmap_pgtable_init(page);
+
+	for (i = 0; i < nr; i++) {
+		pte_p = pte_alloc_one_kernel(&init_mm);
+		if (!pte_p)
+			goto out;
+		vmemmap_pgtable_deposit(page, pte_p);
+	}
+
+	return 0;
+out:
+	while (i-- && (pte_p = vmemmap_pgtable_withdraw(page)))
+		pte_free_kernel(&init_mm, pte_p);
+	return -ENOMEM;
+}
+
+static inline void vmemmap_pgtable_free(struct hstate *h, struct page *page)
+{
+	pte_t *pte_p;
+
+	if (!nr_pgtable(h))
+		return;
+
+	while ((pte_p = vmemmap_pgtable_withdraw(page)))
+		pte_free_kernel(&init_mm, pte_p);
+}
+
 static void __init hugetlb_vmemmap_init(struct hstate *h)
 {
 	unsigned int order = huge_page_order(h);
@@ -1323,6 +1420,15 @@  static void __init hugetlb_vmemmap_init(struct hstate *h)
 static inline void hugetlb_vmemmap_init(struct hstate *h)
 {
 }
+
+static inline int vmemmap_pgtable_prealloc(struct hstate *h, struct page *page)
+{
+	return 0;
+}
+
+static inline void vmemmap_pgtable_free(struct hstate *h, struct page *page)
+{
+}
 #endif
 
 static void update_and_free_page(struct hstate *h, struct page *page)
@@ -1531,6 +1637,9 @@  void free_huge_page(struct page *page)
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 {
+	/* Must be called before the initialization of @page->lru */
+	vmemmap_pgtable_free(h, page);
+
 	INIT_LIST_HEAD(&page->lru);
 	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
 	set_hugetlb_cgroup(page, NULL);
@@ -1783,6 +1892,14 @@  static struct page *alloc_fresh_huge_page(struct hstate *h,
 	if (!page)
 		return NULL;
 
+	if (vmemmap_pgtable_prealloc(h, page)) {
+		if (hstate_is_gigantic(h))
+			free_gigantic_page(page, huge_page_order(h));
+		else
+			put_page(page);
+		return NULL;
+	}
+
 	if (hstate_is_gigantic(h))
 		prep_compound_gigantic_page(page, huge_page_order(h));
 	prep_new_huge_page(h, page, page_to_nid(page));