
[v3,05/21] mm/hugetlb: Introduce pgtable allocation/freeing helpers

Message ID 20201108141113.65450-6-songmuchun@bytedance.com
State New, archived
Series Free some vmemmap pages of hugetlb page

Commit Message

Muchun Song Nov. 8, 2020, 2:10 p.m. UTC
On x86_64, vmemmap is always PMD mapped if the machine has hugepages
support and we have 2MB of contiguous, PMD-aligned memory for it. If we
want to free the unused vmemmap pages, we have to split the huge PMD
first. So we pre-allocate page tables to split the PMD into PTEs.
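
As a rough worked example (assuming 4K base pages and a 64-byte struct
page, so the numbers are illustrative):

  2MB HugeTLB page: vmemmap = 512 * 64B = 32KB;  ALIGN(32KB, 2MB) >> 21 = 1 page table
  1GB HugeTLB page: vmemmap = 262144 * 64B = 16MB; ALIGN(16MB, 2MB) >> 21 = 8 page tables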

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 include/linux/hugetlb.h |  10 +++++
 mm/hugetlb.c            | 111 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 121 insertions(+)

Comments

Oscar Salvador Nov. 9, 2020, 5:21 p.m. UTC | #1
On Sun, Nov 08, 2020 at 10:10:57PM +0800, Muchun Song wrote:
> +static inline unsigned int pgtable_pages_to_prealloc_per_hpage(struct hstate *h)
> +{
> +	unsigned long vmemmap_size = vmemmap_pages_size_per_hpage(h);
> +
> +	/*
> +	 * No need pre-allocate page tabels when there is no vmemmap pages
> +	 * to free.
 s /tabels/tables/

> +static int vmemmap_pgtable_prealloc(struct hstate *h, struct page *page)
> +{
> +	int i;
> +	pgtable_t pgtable;
> +	unsigned int nr = pgtable_pages_to_prealloc_per_hpage(h);
> +
> +	if (!nr)
> +		return 0;
> +
> +	vmemmap_pgtable_init(page);
> +
> +	for (i = 0; i < nr; i++) {
> +		pte_t *pte_p;
> +
> +		pte_p = pte_alloc_one_kernel(&init_mm);
> +		if (!pte_p)
> +			goto out;
> +		vmemmap_pgtable_deposit(page, virt_to_page(pte_p));
> +	}
> +
> +	return 0;
> +out:
> +	while (i-- && (pgtable = vmemmap_pgtable_withdraw(page)))
> +		pte_free_kernel(&init_mm, page_to_virt(pgtable));

	would it not be enough to do:

	while ((pgtable = vmemmap_pgtable_withdraw(page)))
		pte_free_kernel(&init_mm, page_to_virt(pgtable));

> +	return -ENOMEM;
> +}
> +
> +static void vmemmap_pgtable_free(struct hstate *h, struct page *page)
> +{
> +	pgtable_t pgtable;
> +	unsigned int nr = pgtable_pages_to_prealloc_per_hpage(h);
> +
> +	if (!nr)
> +		return;

We can get rid of "nr" and its check and keep only the check below, right?
AFAICS, they go together, e.g.: if page_huge_pte() does not return NULL,
it means that we preallocated a page table, and vice versa.


> +
> +	pgtable = page_huge_pte(page);
> +	if (!pgtable)
> +		return;
> +
> +	while (nr-- && (pgtable = vmemmap_pgtable_withdraw(page)))
> +		pte_free_kernel(&init_mm, page_to_virt(pgtable));

	Same as above, that "nr" can go?
Muchun Song Nov. 10, 2020, 3:49 a.m. UTC | #2
On Tue, Nov 10, 2020 at 1:21 AM Oscar Salvador <osalvador@suse.de> wrote:
>
> On Sun, Nov 08, 2020 at 10:10:57PM +0800, Muchun Song wrote:
> > +static inline unsigned int pgtable_pages_to_prealloc_per_hpage(struct hstate *h)
> > +{
> > +     unsigned long vmemmap_size = vmemmap_pages_size_per_hpage(h);
> > +
> > +     /*
> > +      * No need pre-allocate page tabels when there is no vmemmap pages
> > +      * to free.
>  s /tabels/tables/

Thanks.

>
> > +static int vmemmap_pgtable_prealloc(struct hstate *h, struct page *page)
> > +{
> > +     int i;
> > +     pgtable_t pgtable;
> > +     unsigned int nr = pgtable_pages_to_prealloc_per_hpage(h);
> > +
> > +     if (!nr)
> > +             return 0;
> > +
> > +     vmemmap_pgtable_init(page);
> > +
> > +     for (i = 0; i < nr; i++) {
> > +             pte_t *pte_p;
> > +
> > +             pte_p = pte_alloc_one_kernel(&init_mm);
> > +             if (!pte_p)
> > +                     goto out;
> > +             vmemmap_pgtable_deposit(page, virt_to_page(pte_p));
> > +     }
> > +
> > +     return 0;
> > +out:
> > +     while (i-- && (pgtable = vmemmap_pgtable_withdraw(page)))
> > +             pte_free_kernel(&init_mm, page_to_virt(pgtable));
>
>         would it not be enough to do:
>
>         while ((pgtable = vmemmap_pgtable_withdraw(page)))
>                 pte_free_kernel(&init_mm, page_to_virt(pgtable));

vmemmap_pgtable_withdraw() cannot return NULL, so we cannot
drop the "i--".

>
> > +     return -ENOMEM;
> > +}
> > +
> > +static void vmemmap_pgtable_free(struct hstate *h, struct page *page)
> > +{
> > +     pgtable_t pgtable;
> > +     unsigned int nr = pgtable_pages_to_prealloc_per_hpage(h);
> > +
> > +     if (!nr)
> > +             return;
>
> We can get rid of "nr" and its check and keep only the check below, right?

Great, the check can go away.

> AFAICS, they go together, e.g.: if page_huge_pte() does not return NULL,
> it means that we preallocated a page table, and vice versa.
>
>
> > +
> > +     pgtable = page_huge_pte(page);
> > +     if (!pgtable)
> > +             return;
> > +
> > +     while (nr-- && (pgtable = vmemmap_pgtable_withdraw(page)))
> > +             pte_free_kernel(&init_mm, page_to_virt(pgtable));
>
>         Same as above, that "nr" can go?

Here "nr" can not go. Because the vmemmap_pgtable_withdraw can
not return NULL.

Thanks.

>
> --
> Oscar Salvador
> SUSE L3
Oscar Salvador Nov. 10, 2020, 5:42 a.m. UTC | #3
On Tue, Nov 10, 2020 at 11:49:27AM +0800, Muchun Song wrote:
> On Tue, Nov 10, 2020 at 1:21 AM Oscar Salvador <osalvador@suse.de> wrote:
> >
> > On Sun, Nov 08, 2020 at 10:10:57PM +0800, Muchun Song wrote:
> > > +static inline unsigned int pgtable_pages_to_prealloc_per_hpage(struct hstate *h)
> > > +{
> > > +     unsigned long vmemmap_size = vmemmap_pages_size_per_hpage(h);
> > > +
> > > +     /*
> > > +      * No need pre-allocate page tabels when there is no vmemmap pages
> > > +      * to free.
> >  s /tabels/tables/
> 
> Thanks.
> 
> >
> > > +static int vmemmap_pgtable_prealloc(struct hstate *h, struct page *page)
> > > +{
> > > +     int i;
> > > +     pgtable_t pgtable;
> > > +     unsigned int nr = pgtable_pages_to_prealloc_per_hpage(h);
> > > +
> > > +     if (!nr)
> > > +             return 0;
> > > +
> > > +     vmemmap_pgtable_init(page);
> > > +
> > > +     for (i = 0; i < nr; i++) {
> > > +             pte_t *pte_p;
> > > +
> > > +             pte_p = pte_alloc_one_kernel(&init_mm);
> > > +             if (!pte_p)
> > > +                     goto out;
> > > +             vmemmap_pgtable_deposit(page, virt_to_page(pte_p));
> > > +     }
> > > +
> > > +     return 0;
> > > +out:
> > > +     while (i-- && (pgtable = vmemmap_pgtable_withdraw(page)))
> > > +             pte_free_kernel(&init_mm, page_to_virt(pgtable));
> >
> >         would it not be enough to do:
> >
> >         while ((pgtable = vmemmap_pgtable_withdraw(page)))
> >                 pte_free_kernel(&init_mm, page_to_virt(pgtable));
> 
> vmemmap_pgtable_withdraw() cannot return NULL, so we cannot
> drop the "i--".

Yeah, you are right, I managed to confuse myself.
But why not make it return NULL, something like:

static pgtable_t vmemmap_pgtable_withdraw(struct page *page)
{
	pgtable_t pgtable;

	/* FIFO */
	pgtable = page_huge_pte(page);
	page_huge_pte(page) = list_first_entry_or_null(&pgtable->lru,
						       struct page, lru);
	if (page_huge_pte(page))
		list_del(&pgtable->lru);

	return page_huge_pte(page) ? pgtable : NULL;
}

What do you think?
Muchun Song Nov. 10, 2020, 6:08 a.m. UTC | #4
On Tue, Nov 10, 2020 at 1:42 PM Oscar Salvador <osalvador@suse.de> wrote:
>
> On Tue, Nov 10, 2020 at 11:49:27AM +0800, Muchun Song wrote:
> > On Tue, Nov 10, 2020 at 1:21 AM Oscar Salvador <osalvador@suse.de> wrote:
> > >
> > > On Sun, Nov 08, 2020 at 10:10:57PM +0800, Muchun Song wrote:
> > > > +static inline unsigned int pgtable_pages_to_prealloc_per_hpage(struct hstate *h)
> > > > +{
> > > > +     unsigned long vmemmap_size = vmemmap_pages_size_per_hpage(h);
> > > > +
> > > > +     /*
> > > > +      * No need pre-allocate page tabels when there is no vmemmap pages
> > > > +      * to free.
> > >  s /tabels/tables/
> >
> > Thanks.
> >
> > >
> > > > +static int vmemmap_pgtable_prealloc(struct hstate *h, struct page *page)
> > > > +{
> > > > +     int i;
> > > > +     pgtable_t pgtable;
> > > > +     unsigned int nr = pgtable_pages_to_prealloc_per_hpage(h);
> > > > +
> > > > +     if (!nr)
> > > > +             return 0;
> > > > +
> > > > +     vmemmap_pgtable_init(page);
> > > > +
> > > > +     for (i = 0; i < nr; i++) {
> > > > +             pte_t *pte_p;
> > > > +
> > > > +             pte_p = pte_alloc_one_kernel(&init_mm);
> > > > +             if (!pte_p)
> > > > +                     goto out;
> > > > +             vmemmap_pgtable_deposit(page, virt_to_page(pte_p));
> > > > +     }
> > > > +
> > > > +     return 0;
> > > > +out:
> > > > +     while (i-- && (pgtable = vmemmap_pgtable_withdraw(page)))
> > > > +             pte_free_kernel(&init_mm, page_to_virt(pgtable));
> > >
> > >         would it not be enough to do:
> > >
> > >         while ((pgtable = vmemmap_pgtable_withdraw(page)))
> > >                 pte_free_kernel(&init_mm, page_to_virt(pgtable));
> >
> > vmemmap_pgtable_withdraw() cannot return NULL, so we cannot
> > drop the "i--".
>
> Yeah, you are right, I managed to confuse myself.
> But why not make it return NULL, something like:
>
> static pgtable_t vmemmap_pgtable_withdraw(struct page *page)
> {
>         pgtable_t pgtable;
>
>         /* FIFO */
>         pgtable = page_huge_pte(page);

The check should be added here.

           if (!pgtable)
                   return NULL;

Just like my previous v2 patch does. In this case, we can drop those
checks. What do you think?
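
Putting it together, the whole helper would then be something like
(untested):

static pgtable_t vmemmap_pgtable_withdraw(struct page *page)
{
	pgtable_t pgtable;

	/* FIFO */
	pgtable = page_huge_pte(page);
	/* Nothing deposited, or everything already withdrawn */
	if (!pgtable)
		return NULL;

	page_huge_pte(page) = list_first_entry_or_null(&pgtable->lru,
						       struct page, lru);
	if (page_huge_pte(page))
		list_del(&pgtable->lru);

	return pgtable;
}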

>         page_huge_pte(page) = list_first_entry_or_null(&pgtable->lru,
>                                                        struct page, lru);
>         if (page_huge_pte(page))
>                 list_del(&pgtable->lru);
>
>         return page_huge_pte(page) ? pgtable : NULL;
> }
>
> What do you think?
>
>
> --
> Oscar Salvador
> SUSE L3
Oscar Salvador Nov. 10, 2020, 6:33 a.m. UTC | #5
On Tue, Nov 10, 2020 at 02:08:46PM +0800, Muchun Song wrote:
> The check should be added here.
> 
>            if (!pgtable)
>                    return NULL;
> 
> Just like my previous v2 patch does. In this case, we can drop those
> checks. What do you think?

It is too early for me, so bear with me.

page_huge_pte() will only return NULL in case we did not get to preallocate
any pgtable, right?

What I was talking about is that
> 
> >         page_huge_pte(page) = list_first_entry_or_null(&pgtable->lru,
> >                                                        struct page, lru);

here we will get either a pgtable entry or NULL in case we already consumed
all entries from the list.
If that is the case, we can return NULL and let the caller know that we
are done.

Am I missing anything?
Muchun Song Nov. 10, 2020, 7:10 a.m. UTC | #6
On Tue, Nov 10, 2020 at 2:33 PM Oscar Salvador <osalvador@suse.de> wrote:
>
> On Tue, Nov 10, 2020 at 02:08:46PM +0800, Muchun Song wrote:
> > The check should be added here.
> >
> >            if (!pgtable)
> >                    return NULL;
> >
> > Just like my previous v2 patch does. In this case, we can drop those
> > checks. What do you think?
>
> It is too early for me, so bear with me.
>
> page_huge_pte() will only return NULL in case we did not get to preallocate
> any pgtable, right?

The page_huge_pte() only returns NULL when we have already consumed
the page tables. Not every HugeTLB page needs to split the vmemmap
page tables: we preallocate page tables for each HugeTLB page, and if
we end up not needing to split the PMD, we should free the
preallocated page tables.

Maybe you can see the comments in the other thread:

  [PATCH v3 09/21] mm/hugetlb: Free the vmemmap pages associated with
each hugetlb page

Thanks.

>
> What I was talking about is that
> >
> > >         page_huge_pte(page) = list_first_entry_or_null(&pgtable->lru,
> > >                                                        struct page, lru);
>
> here we will get either a pgtable entry or NULL in case we already consumed
> all entries from the list.
> If that is the case, we can return NULL and let the caller know that we
> are done.
>
> Am I missing anything?


>
>
> --
> Oscar Salvador
> SUSE L3



--
Yours,
Muchun
Mike Kravetz Nov. 11, 2020, 12:47 a.m. UTC | #7
On 11/8/20 6:10 AM, Muchun Song wrote:
> On x86_64, vmemmap is always PMD mapped if the machine has hugepages
> support and we have 2MB of contiguous, PMD-aligned memory for it. If we
> want to free the unused vmemmap pages, we have to split the huge PMD
> first. So we pre-allocate page tables to split the PMD into PTEs.
> 
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index a0007902fafb..5c7be2ee7e15 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1303,6 +1303,108 @@ static inline void destroy_compound_gigantic_page(struct page *page,
...
> +static inline void vmemmap_pgtable_init(struct page *page)
> +{
> +	page_huge_pte(page) = NULL;
> +}
> +
> +static void vmemmap_pgtable_deposit(struct page *page, pgtable_t pgtable)
> +{
> +	/* FIFO */
> +	if (!page_huge_pte(page))
> +		INIT_LIST_HEAD(&pgtable->lru);
> +	else
> +		list_add(&pgtable->lru, &page_huge_pte(page)->lru);
> +	page_huge_pte(page) = pgtable;
> +}
> +
> +static pgtable_t vmemmap_pgtable_withdraw(struct page *page)
> +{
> +	pgtable_t pgtable;
> +
> +	/* FIFO */
> +	pgtable = page_huge_pte(page);
> +	page_huge_pte(page) = list_first_entry_or_null(&pgtable->lru,
> +						       struct page, lru);
> +	if (page_huge_pte(page))
> +		list_del(&pgtable->lru);
> +	return pgtable;
> +}
> +
> +static int vmemmap_pgtable_prealloc(struct hstate *h, struct page *page)
> +{
> +	int i;
> +	pgtable_t pgtable;
> +	unsigned int nr = pgtable_pages_to_prealloc_per_hpage(h);
> +
> +	if (!nr)
> +		return 0;
> +
> +	vmemmap_pgtable_init(page);
> +
> +	for (i = 0; i < nr; i++) {
> +		pte_t *pte_p;
> +
> +		pte_p = pte_alloc_one_kernel(&init_mm);
> +		if (!pte_p)
> +			goto out;
> +		vmemmap_pgtable_deposit(page, virt_to_page(pte_p));
> +	}
> +
> +	return 0;
> +out:
> +	while (i-- && (pgtable = vmemmap_pgtable_withdraw(page)))
> +		pte_free_kernel(&init_mm, page_to_virt(pgtable));
> +	return -ENOMEM;
> +}
> +
> +static void vmemmap_pgtable_free(struct hstate *h, struct page *page)
> +{
> +	pgtable_t pgtable;
> +	unsigned int nr = pgtable_pages_to_prealloc_per_hpage(h);
> +
> +	if (!nr)
> +		return;
> +
> +	pgtable = page_huge_pte(page);
> +	if (!pgtable)
> +		return;
> +
> +	while (nr-- && (pgtable = vmemmap_pgtable_withdraw(page)))
> +		pte_free_kernel(&init_mm, page_to_virt(pgtable));
> +}

I may be confused.

In patch 9 of this series, the first call to vmemmap_pgtable_free() is made:

> @@ -1645,6 +1799,10 @@ void free_huge_page(struct page *page)
>  
>  static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
>  {
> +	free_huge_page_vmemmap(h, page);
> +	/* Must be called before the initialization of @page->lru */
> +	vmemmap_pgtable_free(h, page);
> +
>  	INIT_LIST_HEAD(&page->lru);
>  	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
>  	set_hugetlb_cgroup(page, NULL);

When I saw that comment in the previous patch series, I assumed page->lru was
being used to store preallocated pages and pages to free.  However, unless
I am reading the code incorrectly, it does not appear page->lru (of the huge
page) is being used for this purpose.  Is that correct?

If it is correct, would using page->lru of the huge page make this code
simpler?  I am just missing the reason why you are using
page_huge_pte(page)->lru.
Muchun Song Nov. 11, 2020, 3:41 a.m. UTC | #8
On Wed, Nov 11, 2020 at 8:47 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
> On 11/8/20 6:10 AM, Muchun Song wrote:
> > On x86_64, vmemmap is always PMD mapped if the machine has hugepages
> > support and we have 2MB of contiguous, PMD-aligned memory for it. If we
> > want to free the unused vmemmap pages, we have to split the huge PMD
> > first. So we pre-allocate page tables to split the PMD into PTEs.
> >
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index a0007902fafb..5c7be2ee7e15 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -1303,6 +1303,108 @@ static inline void destroy_compound_gigantic_page(struct page *page,
> ...
> > +static inline void vmemmap_pgtable_init(struct page *page)
> > +{
> > +     page_huge_pte(page) = NULL;
> > +}
> > +
> > +static void vmemmap_pgtable_deposit(struct page *page, pgtable_t pgtable)
> > +{
> > +     /* FIFO */
> > +     if (!page_huge_pte(page))
> > +             INIT_LIST_HEAD(&pgtable->lru);
> > +     else
> > +             list_add(&pgtable->lru, &page_huge_pte(page)->lru);
> > +     page_huge_pte(page) = pgtable;
> > +}
> > +
> > +static pgtable_t vmemmap_pgtable_withdraw(struct page *page)
> > +{
> > +     pgtable_t pgtable;
> > +
> > +     /* FIFO */
> > +     pgtable = page_huge_pte(page);
> > +     page_huge_pte(page) = list_first_entry_or_null(&pgtable->lru,
> > +                                                    struct page, lru);
> > +     if (page_huge_pte(page))
> > +             list_del(&pgtable->lru);
> > +     return pgtable;
> > +}
> > +
> > +static int vmemmap_pgtable_prealloc(struct hstate *h, struct page *page)
> > +{
> > +     int i;
> > +     pgtable_t pgtable;
> > +     unsigned int nr = pgtable_pages_to_prealloc_per_hpage(h);
> > +
> > +     if (!nr)
> > +             return 0;
> > +
> > +     vmemmap_pgtable_init(page);
> > +
> > +     for (i = 0; i < nr; i++) {
> > +             pte_t *pte_p;
> > +
> > +             pte_p = pte_alloc_one_kernel(&init_mm);
> > +             if (!pte_p)
> > +                     goto out;
> > +             vmemmap_pgtable_deposit(page, virt_to_page(pte_p));
> > +     }
> > +
> > +     return 0;
> > +out:
> > +     while (i-- && (pgtable = vmemmap_pgtable_withdraw(page)))
> > +             pte_free_kernel(&init_mm, page_to_virt(pgtable));
> > +     return -ENOMEM;
> > +}
> > +
> > +static void vmemmap_pgtable_free(struct hstate *h, struct page *page)
> > +{
> > +     pgtable_t pgtable;
> > +     unsigned int nr = pgtable_pages_to_prealloc_per_hpage(h);
> > +
> > +     if (!nr)
> > +             return;
> > +
> > +     pgtable = page_huge_pte(page);
> > +     if (!pgtable)
> > +             return;
> > +
> > +     while (nr-- && (pgtable = vmemmap_pgtable_withdraw(page)))
> > +             pte_free_kernel(&init_mm, page_to_virt(pgtable));
> > +}
>
> I may be confused.
>
> In patch 9 of this series, the first call to vmemmap_pgtable_free() is made:
>
> > @@ -1645,6 +1799,10 @@ void free_huge_page(struct page *page)
> >
> >  static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
> >  {
> > +     free_huge_page_vmemmap(h, page);
> > +     /* Must be called before the initialization of @page->lru */
> > +     vmemmap_pgtable_free(h, page);
> > +
> >       INIT_LIST_HEAD(&page->lru);
> >       set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
> >       set_hugetlb_cgroup(page, NULL);
>
> When I saw that comment in the previous patch series, I assumed page->lru was
> being used to store preallocated pages and pages to free.  However, unless

Yeah, you are right.

> I am reading the code incorrectly, it does not appear page->lru (of the huge
> page) is being used for this purpose.  Is that correct?
>
> If it is correct, would using page->lru of the huge page make this code
> simpler?  I am just missing the reason why you are using
> page_huge_pte(page)->lru.

For 1GB HugeTLB pages, we should pre-allocate more than one page
table, so I use a linked list. The page_huge_pte(page) is the list head,
because page->lru shares storage with page->pmd_huge_pte.

+     /* Must be called before the initialization of @page->lru */
+     vmemmap_pgtable_free(h, page);
+
       INIT_LIST_HEAD(&page->lru);

Here we initialize the lru, so vmemmap_pgtable_free() should
be called before this. It seems that I should point out this "share"
in the comment.
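
For reference, page->lru and page->pmd_huge_pte sit in the same union in
struct page; a trimmed-down sketch (simplified from include/linux/mm_types.h,
not the exact definition):

struct page {
	unsigned long flags;
	union {
		struct {	/* Page cache and anonymous pages */
			struct list_head lru;
			/* ... */
		};
		struct {	/* Page table pages */
			unsigned long _pt_pad_1;
			pgtable_t pmd_huge_pte;	/* shares storage with lru */
			/* ... */
		};
		/* ... */
	};
	/* ... */
};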

Thanks.

>
> --
> Mike Kravetz
Mike Kravetz Nov. 13, 2020, 12:35 a.m. UTC | #9
On 11/10/20 7:41 PM, Muchun Song wrote:
> On Wed, Nov 11, 2020 at 8:47 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>>
>> On 11/8/20 6:10 AM, Muchun Song wrote:
>> I am reading the code incorrectly, it does not appear page->lru (of the huge
>> page) is being used for this purpose.  Is that correct?
>>
>> If it is correct, would using page->lru of the huge page make this code
>> simpler?  I am just missing the reason why you are using
>> page_huge_pte(page)->lru.
> 
> For 1GB HugeTLB pages, we should pre-allocate more than one page
> table, so I use a linked list. The page_huge_pte(page) is the list head,
> because page->lru shares storage with page->pmd_huge_pte.

Sorry, but I do not understand the statement page->lru shares storage with
page->pmd_huge_pte.  Are you saying they are both in the head struct page of
the huge page?

Here is what I was suggesting.  If we just use page->lru for the list
then vmemmap_pgtable_prealloc() could be coded like the following:

static int vmemmap_pgtable_prealloc(struct hstate *h, struct page *page)
{
	struct page *pte_page, *t_page;
	unsigned int nr = pgtable_pages_to_prealloc_per_hpage(h);

	if (!nr)
		return 0;

	/* Store preallocated pages on huge page lru list */
	INIT_LIST_HEAD(&page->lru);

	while (nr--) {
		pte_t *pte_p;

		pte_p = pte_alloc_one_kernel(&init_mm);
		if (!pte_p)
			goto out;
		list_add(&virt_to_page(pte_p)->lru, &page->lru);
	}

	return 0;
out:
	list_for_each_entry_safe(pte_page, t_page, &page->lru, lru)
		pte_free_kernel(&init_mm, page_to_virt(pte_page));
	return -ENOMEM;
}

By doing this we could eliminate the routines,
vmemmap_pgtable_init()
vmemmap_pgtable_deposit()
vmemmap_pgtable_withdraw()
and simply use the list manipulation routines.
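
The free side could then collapse in the same way; an untested sketch,
assuming the same huge page lru list:

static void vmemmap_pgtable_free(struct hstate *h, struct page *page)
{
	struct page *pte_page, *t_page;

	if (!pgtable_pages_to_prealloc_per_hpage(h))
		return;

	/* Free any preallocated page tables left on the huge page lru list */
	list_for_each_entry_safe(pte_page, t_page, &page->lru, lru) {
		list_del(&pte_page->lru);
		pte_free_kernel(&init_mm, page_to_virt(pte_page));
	}
}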

To me, that looks simpler than the proposed code in this patch.
Mike Kravetz Nov. 13, 2020, 1:02 a.m. UTC | #10
On 11/12/20 4:35 PM, Mike Kravetz wrote:
> On 11/10/20 7:41 PM, Muchun Song wrote:
>> On Wed, Nov 11, 2020 at 8:47 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>>>
>>> On 11/8/20 6:10 AM, Muchun Song wrote:
>>> I am reading the code incorrectly, it does not appear page->lru (of the huge
>>> page) is being used for this purpose.  Is that correct?
>>>
>>> If it is correct, would using page->lru of the huge page make this code
>>> simpler?  I am just missing the reason why you are using
>>> page_huge_pte(page)->lru.
>>
>> For 1GB HugeTLB pages, we should pre-allocate more than one page
>> table, so I use a linked list. The page_huge_pte(page) is the list head,
>> because page->lru shares storage with page->pmd_huge_pte.
> 
> Sorry, but I do not understand the statement page->lru shares storage with
> page->pmd_huge_pte.  Are you saying they are both in the head struct page of
> the huge page?
> 
> Here is what I was suggesting.  If we just use page->lru for the list
> then vmemmap_pgtable_prealloc() could be coded like the following:
> 
> static int vmemmap_pgtable_prealloc(struct hstate *h, struct page *page)
> {
> 	struct page *pte_page, *t_page;
> 	unsigned int nr = pgtable_pages_to_prealloc_per_hpage(h);
> 
> 	if (!nr)
> 		return 0;
> 
> 	/* Store preallocated pages on huge page lru list */
> 	INIT_LIST_HEAD(&page->lru);
> 
> 	while (nr--) {
> 		pte_t *pte_p;
> 
> 		pte_p = pte_alloc_one_kernel(&init_mm);
> 		if (!pte_p)
> 			goto out;
> 		list_add(&virt_to_page(pte_p)->lru, &page->lru);
> 	}
> 
> 	return 0;
> out:
> 	list_for_each_entry_safe(pte_page, t_page, &page->lru, lru)

Forgot the list_del(&pte_page->lru).
Perhaps it is not simpler after all. :)
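
With that folded in, the error path would read something like:

out:
	list_for_each_entry_safe(pte_page, t_page, &page->lru, lru) {
		list_del(&pte_page->lru);
		pte_free_kernel(&init_mm, page_to_virt(pte_page));
	}
	return -ENOMEM;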
Muchun Song Nov. 13, 2020, 4:18 a.m. UTC | #11
On Fri, Nov 13, 2020 at 8:38 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
> On 11/10/20 7:41 PM, Muchun Song wrote:
> > On Wed, Nov 11, 2020 at 8:47 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
> >>
> >> On 11/8/20 6:10 AM, Muchun Song wrote:
> >> I am reading the code incorrectly, it does not appear page->lru (of the huge
> >> page) is being used for this purpose.  Is that correct?
> >>
> >> If it is correct, would using page->lru of the huge page make this code
> >> simpler?  I am just missing the reason why you are using
> >> page_huge_pte(page)->lru.
> >
> > For 1GB HugeTLB pages, we should pre-allocate more than one page
> > table, so I use a linked list. The page_huge_pte(page) is the list head,
> > because page->lru shares storage with page->pmd_huge_pte.
>
> Sorry, but I do not understand the statement page->lru shares storage with
> page->pmd_huge_pte.  Are you saying they are both in the head struct page of
> the huge page?
>
> Here is what I was suggesting.  If we just use page->lru for the list
> then vmemmap_pgtable_prealloc() could be coded like the following:
>
> static int vmemmap_pgtable_prealloc(struct hstate *h, struct page *page)
> {
>         struct page *pte_page, *t_page;
>         unsigned int nr = pgtable_pages_to_prealloc_per_hpage(h);
>
>         if (!nr)
>                 return 0;
>
>         /* Store preallocated pages on huge page lru list */
>         INIT_LIST_HEAD(&page->lru);
>
>         while (nr--) {
>                 pte_t *pte_p;
>
>                 pte_p = pte_alloc_one_kernel(&init_mm);
>                 if (!pte_p)
>                         goto out;
>                 list_add(&virt_to_page(pte_p)->lru, &page->lru);
>         }
>
>         return 0;
> out:
>         list_for_each_entry_safe(pte_page, t_page, &page->lru, lru)
>                 pte_free_kernel(&init_mm, page_to_virt(pte_page));
>         return -ENOMEM;
> }
>
> By doing this we could eliminate the routines,
> vmemmap_pgtable_init()
> vmemmap_pgtable_deposit()
> vmemmap_pgtable_withdraw()
> and simply use the list manipulation routines.

Now I know what you mean. Yeah, just using page->lru makes the code
simpler. Thanks for your suggestions.

>
> To me, that looks simpler than the proposed code in this patch.
> --
> Mike Kravetz

Patch

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index eed3dd3bd626..d81c262418db 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -593,6 +593,16 @@  static inline unsigned int blocks_per_huge_page(struct hstate *h)
 
 #include <asm/hugetlb.h>
 
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+#ifndef VMEMMAP_HPAGE_SHIFT
+#define VMEMMAP_HPAGE_SHIFT		HPAGE_SHIFT
+#endif
+#define VMEMMAP_HPAGE_ORDER		(VMEMMAP_HPAGE_SHIFT - PAGE_SHIFT)
+#define VMEMMAP_HPAGE_NR		(1 << VMEMMAP_HPAGE_ORDER)
+#define VMEMMAP_HPAGE_SIZE		((1UL) << VMEMMAP_HPAGE_SHIFT)
+#define VMEMMAP_HPAGE_MASK		(~(VMEMMAP_HPAGE_SIZE - 1))
+#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */
+
 #ifndef is_hugepage_only_range
 static inline int is_hugepage_only_range(struct mm_struct *mm,
 					unsigned long addr, unsigned long len)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a0007902fafb..5c7be2ee7e15 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1303,6 +1303,108 @@  static inline void destroy_compound_gigantic_page(struct page *page,
  */
 #define RESERVE_VMEMMAP_NR	2U
 
+#define page_huge_pte(page)	((page)->pmd_huge_pte)
+
+static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
+{
+	return h->nr_free_vmemmap_pages;
+}
+
+static inline unsigned int vmemmap_pages_per_hpage(struct hstate *h)
+{
+	return free_vmemmap_pages_per_hpage(h) + RESERVE_VMEMMAP_NR;
+}
+
+static inline unsigned long vmemmap_pages_size_per_hpage(struct hstate *h)
+{
+	return (unsigned long)vmemmap_pages_per_hpage(h) << PAGE_SHIFT;
+}
+
+static inline unsigned int pgtable_pages_to_prealloc_per_hpage(struct hstate *h)
+{
+	unsigned long vmemmap_size = vmemmap_pages_size_per_hpage(h);
+
+	/*
+	 * No need pre-allocate page tabels when there is no vmemmap pages
+	 * to free.
+	 */
+	if (!free_vmemmap_pages_per_hpage(h))
+		return 0;
+
+	return ALIGN(vmemmap_size, VMEMMAP_HPAGE_SIZE) >> VMEMMAP_HPAGE_SHIFT;
+}
+
+static inline void vmemmap_pgtable_init(struct page *page)
+{
+	page_huge_pte(page) = NULL;
+}
+
+static void vmemmap_pgtable_deposit(struct page *page, pgtable_t pgtable)
+{
+	/* FIFO */
+	if (!page_huge_pte(page))
+		INIT_LIST_HEAD(&pgtable->lru);
+	else
+		list_add(&pgtable->lru, &page_huge_pte(page)->lru);
+	page_huge_pte(page) = pgtable;
+}
+
+static pgtable_t vmemmap_pgtable_withdraw(struct page *page)
+{
+	pgtable_t pgtable;
+
+	/* FIFO */
+	pgtable = page_huge_pte(page);
+	page_huge_pte(page) = list_first_entry_or_null(&pgtable->lru,
+						       struct page, lru);
+	if (page_huge_pte(page))
+		list_del(&pgtable->lru);
+	return pgtable;
+}
+
+static int vmemmap_pgtable_prealloc(struct hstate *h, struct page *page)
+{
+	int i;
+	pgtable_t pgtable;
+	unsigned int nr = pgtable_pages_to_prealloc_per_hpage(h);
+
+	if (!nr)
+		return 0;
+
+	vmemmap_pgtable_init(page);
+
+	for (i = 0; i < nr; i++) {
+		pte_t *pte_p;
+
+		pte_p = pte_alloc_one_kernel(&init_mm);
+		if (!pte_p)
+			goto out;
+		vmemmap_pgtable_deposit(page, virt_to_page(pte_p));
+	}
+
+	return 0;
+out:
+	while (i-- && (pgtable = vmemmap_pgtable_withdraw(page)))
+		pte_free_kernel(&init_mm, page_to_virt(pgtable));
+	return -ENOMEM;
+}
+
+static void vmemmap_pgtable_free(struct hstate *h, struct page *page)
+{
+	pgtable_t pgtable;
+	unsigned int nr = pgtable_pages_to_prealloc_per_hpage(h);
+
+	if (!nr)
+		return;
+
+	pgtable = page_huge_pte(page);
+	if (!pgtable)
+		return;
+
+	while (nr-- && (pgtable = vmemmap_pgtable_withdraw(page)))
+		pte_free_kernel(&init_mm, page_to_virt(pgtable));
+}
+
 static void __init hugetlb_vmemmap_init(struct hstate *h)
 {
 	unsigned int order = huge_page_order(h);
@@ -1326,6 +1428,15 @@  static void __init hugetlb_vmemmap_init(struct hstate *h)
 static inline void hugetlb_vmemmap_init(struct hstate *h)
 {
 }
+
+static inline int vmemmap_pgtable_prealloc(struct hstate *h, struct page *page)
+{
+	return 0;
+}
+
+static inline void vmemmap_pgtable_free(struct hstate *h, struct page *page)
+{
+}
 #endif
 
 static void update_and_free_page(struct hstate *h, struct page *page)
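
For context on how these helpers are wired up, patch 9 of the series
(quoted in the discussion above) calls them from prep_new_huge_page(),
roughly:

static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
	free_huge_page_vmemmap(h, page);
	/* Must be called before the initialization of @page->lru */
	vmemmap_pgtable_free(h, page);

	INIT_LIST_HEAD(&page->lru);
	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
	set_hugetlb_cgroup(page, NULL);
	/* ... */
}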