diff mbox series

[v7,04/15] mm/hugetlb: Introduce nr_free_vmemmap_pages in the struct hstate

Message ID 20201130151838.11208-5-songmuchun@bytedance.com (mailing list archive)
State New, archived
Headers show
Series Free some vmemmap pages of hugetlb page | expand

Commit Message

Muchun Song Nov. 30, 2020, 3:18 p.m. UTC
Every HugeTLB page has more than one struct page structure. With 4KB base
pages, a 2MB HugeTLB page has 512 struct page structures and a 1GB HugeTLB
page has 262144 struct page structures. We __know__ that we only use the
first 4 (HUGETLB_CGROUP_MIN_ORDER) struct page structures to store metadata
associated with each HugeTLB page.

There are a lot of struct page structures (occupying 8 page frames for a
2MB HugeTLB page and 4096 page frames for a 1GB HugeTLB page) associated
with each HugeTLB page. For tail pages, the value of compound_head is the
same, so we can reuse the first page frame of tail page structures: we
remap the virtual addresses of the remaining pages of tail page structures
to that first page frame of tail page structs, and then free the page
frames that are no longer mapped. Therefore, we need to reserve two pages
as vmemmap areas (the page containing the head page struct and the first
page of tail page structs).

So we introduce a new nr_free_vmemmap_pages field in the hstate to
indicate how many of the vmemmap pages associated with a HugeTLB page
we can free to the buddy system.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 include/linux/hugetlb.h |   3 ++
 mm/Makefile             |   1 +
 mm/hugetlb.c            |   3 ++
 mm/hugetlb_vmemmap.c    | 129 ++++++++++++++++++++++++++++++++++++++++++++++++
 mm/hugetlb_vmemmap.h    |  20 ++++++++
 5 files changed, 156 insertions(+)
 create mode 100644 mm/hugetlb_vmemmap.c
 create mode 100644 mm/hugetlb_vmemmap.h

Comments

David Hildenbrand Dec. 7, 2020, 12:36 p.m. UTC | #1
On 30.11.20 16:18, Muchun Song wrote:
> Every HugeTLB has more than one struct page structure. The 2M HugeTLB
> has 512 struct page structure and 1G HugeTLB has 4096 struct page
> structures. We __know__ that we only use the first 4(HUGETLB_CGROUP_MIN_ORDER)
> struct page structures to store metadata associated with each HugeTLB.
> 
> There are a lot of struct page structures(8 page frames for 2MB HugeTLB
> page and 4096 page frames for 1GB HugeTLB page) associated with each
> HugeTLB page. For tail pages, the value of compound_head is the same.
> So we can reuse first page of tail page structures. We map the virtual
> addresses of the remaining pages of tail page structures to the first
> tail page struct, and then free these page frames. Therefore, we need
> to reserve two pages as vmemmap areas.
> 
> So we introduce a new nr_free_vmemmap_pages field in the hstate to
> indicate how many vmemmap pages associated with a HugeTLB page that we
> can free to buddy system.
> 
> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> Acked-by: Mike Kravetz <mike.kravetz@oracle.com>
> ---
>  include/linux/hugetlb.h |   3 ++
>  mm/Makefile             |   1 +
>  mm/hugetlb.c            |   3 ++
>  mm/hugetlb_vmemmap.c    | 129 ++++++++++++++++++++++++++++++++++++++++++++++++
>  mm/hugetlb_vmemmap.h    |  20 ++++++++
>  5 files changed, 156 insertions(+)
>  create mode 100644 mm/hugetlb_vmemmap.c
>  create mode 100644 mm/hugetlb_vmemmap.h
> 
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index ebca2ef02212..4efeccb7192c 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -492,6 +492,9 @@ struct hstate {
>  	unsigned int nr_huge_pages_node[MAX_NUMNODES];
>  	unsigned int free_huge_pages_node[MAX_NUMNODES];
>  	unsigned int surplus_huge_pages_node[MAX_NUMNODES];
> +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
> +	unsigned int nr_free_vmemmap_pages;
> +#endif
>  #ifdef CONFIG_CGROUP_HUGETLB
>  	/* cgroup control files */
>  	struct cftype cgroup_files_dfl[7];
> diff --git a/mm/Makefile b/mm/Makefile
> index ed4b88fa0f5e..056801d8daae 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -71,6 +71,7 @@ obj-$(CONFIG_FRONTSWAP)	+= frontswap.o
>  obj-$(CONFIG_ZSWAP)	+= zswap.o
>  obj-$(CONFIG_HAS_DMA)	+= dmapool.o
>  obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
> +obj-$(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP)	+= hugetlb_vmemmap.o
>  obj-$(CONFIG_NUMA) 	+= mempolicy.o
>  obj-$(CONFIG_SPARSEMEM)	+= sparse.o
>  obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 1f3bf1710b66..25f9e8e9fc4a 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -42,6 +42,7 @@
>  #include <linux/userfaultfd_k.h>
>  #include <linux/page_owner.h>
>  #include "internal.h"
> +#include "hugetlb_vmemmap.h"
>  
>  int hugetlb_max_hstate __read_mostly;
>  unsigned int default_hstate_idx;
> @@ -3206,6 +3207,8 @@ void __init hugetlb_add_hstate(unsigned int order)
>  	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
>  					huge_page_size(h)/1024);
>  
> +	hugetlb_vmemmap_init(h);
> +
>  	parsed_hstate = h;
>  }
>  
> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> new file mode 100644
> index 000000000000..51152e258f39
> --- /dev/null
> +++ b/mm/hugetlb_vmemmap.c
> @@ -0,0 +1,129 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Free some vmemmap pages of HugeTLB
> + *
> + * Copyright (c) 2020, Bytedance. All rights reserved.
> + *
> + *     Author: Muchun Song <songmuchun@bytedance.com>
> + *
> + * The struct page structures (page structs) are used to describe a physical
> + * page frame. By default, there is a one-to-one mapping from a page frame to
> + * it's corresponding page struct.
> + *
> + * The HugeTLB pages consist of multiple base page size pages and is supported
> + * by many architectures. See hugetlbpage.rst in the Documentation directory
> + * for more details. On the x86 architecture, HugeTLB pages of size 2MB and 1GB
> + * are currently supported. Since the base page size on x86 is 4KB, a 2MB
> + * HugeTLB page consists of 512 base pages and a 1GB HugeTLB page consists of
> + * 4096 base pages. For each base page, there is a corresponding page struct.
> + *
> + * Within the HugeTLB subsystem, only the first 4 page structs are used to
> + * contain unique information about a HugeTLB page. HUGETLB_CGROUP_MIN_ORDER
> + * provides this upper limit. The only 'useful' information in the remaining
> + * page structs is the compound_head field, and this field is the same for all
> + * tail pages.
> + *
> + * By removing redundant page structs for HugeTLB pages, memory can returned to
> + * the buddy allocator for other uses.
> + *
> + * When the system boot up, every 2M HugeTLB has 512 struct page structs which
> + * size is 8 pages(sizeof(struct page) * 512 / PAGE_SIZE).


You should try to generalize all descriptions regarding differing base
page sizes. E.g., arm64 supports 4k, 16k, and 64k base pages.

[...]

> @@ -0,0 +1,20 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Free some vmemmap pages of HugeTLB
> + *
> + * Copyright (c) 2020, Bytedance. All rights reserved.
> + *
> + *     Author: Muchun Song <songmuchun@bytedance.com>
> + */
> +#ifndef _LINUX_HUGETLB_VMEMMAP_H
> +#define _LINUX_HUGETLB_VMEMMAP_H
> +#include <linux/hugetlb.h>
> +
> +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
> +void __init hugetlb_vmemmap_init(struct hstate *h);
> +#else
> +static inline void hugetlb_vmemmap_init(struct hstate *h)
> +{
> +}
> +#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */
> +#endif /* _LINUX_HUGETLB_VMEMMAP_H */
> 

This patch as it stands is rather sub-optimal. I mean, all it does is
add documentation and print what could be done.

Can we instead introduce the basic infrastructure and enable it via this
patch on top, where we glue all the pieces together? Or is there
something I am missing?
Muchun Song Dec. 7, 2020, 1:11 p.m. UTC | #2
On Mon, Dec 7, 2020 at 8:36 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 30.11.20 16:18, Muchun Song wrote:
> > Every HugeTLB has more than one struct page structure. The 2M HugeTLB
> > has 512 struct page structure and 1G HugeTLB has 4096 struct page
> > structures. We __know__ that we only use the first 4(HUGETLB_CGROUP_MIN_ORDER)
> > struct page structures to store metadata associated with each HugeTLB.
> >
> > There are a lot of struct page structures(8 page frames for 2MB HugeTLB
> > page and 4096 page frames for 1GB HugeTLB page) associated with each
> > HugeTLB page. For tail pages, the value of compound_head is the same.
> > So we can reuse first page of tail page structures. We map the virtual
> > addresses of the remaining pages of tail page structures to the first
> > tail page struct, and then free these page frames. Therefore, we need
> > to reserve two pages as vmemmap areas.
> >
> > So we introduce a new nr_free_vmemmap_pages field in the hstate to
> > indicate how many vmemmap pages associated with a HugeTLB page that we
> > can free to buddy system.
> >
> > Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> > Acked-by: Mike Kravetz <mike.kravetz@oracle.com>
> > ---
> >  include/linux/hugetlb.h |   3 ++
> >  mm/Makefile             |   1 +
> >  mm/hugetlb.c            |   3 ++
> >  mm/hugetlb_vmemmap.c    | 129 ++++++++++++++++++++++++++++++++++++++++++++++++
> >  mm/hugetlb_vmemmap.h    |  20 ++++++++
> >  5 files changed, 156 insertions(+)
> >  create mode 100644 mm/hugetlb_vmemmap.c
> >  create mode 100644 mm/hugetlb_vmemmap.h
> >
> > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > index ebca2ef02212..4efeccb7192c 100644
> > --- a/include/linux/hugetlb.h
> > +++ b/include/linux/hugetlb.h
> > @@ -492,6 +492,9 @@ struct hstate {
> >       unsigned int nr_huge_pages_node[MAX_NUMNODES];
> >       unsigned int free_huge_pages_node[MAX_NUMNODES];
> >       unsigned int surplus_huge_pages_node[MAX_NUMNODES];
> > +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
> > +     unsigned int nr_free_vmemmap_pages;
> > +#endif
> >  #ifdef CONFIG_CGROUP_HUGETLB
> >       /* cgroup control files */
> >       struct cftype cgroup_files_dfl[7];
> > diff --git a/mm/Makefile b/mm/Makefile
> > index ed4b88fa0f5e..056801d8daae 100644
> > --- a/mm/Makefile
> > +++ b/mm/Makefile
> > @@ -71,6 +71,7 @@ obj-$(CONFIG_FRONTSWAP)     += frontswap.o
> >  obj-$(CONFIG_ZSWAP)  += zswap.o
> >  obj-$(CONFIG_HAS_DMA)        += dmapool.o
> >  obj-$(CONFIG_HUGETLBFS)      += hugetlb.o
> > +obj-$(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP)      += hugetlb_vmemmap.o
> >  obj-$(CONFIG_NUMA)   += mempolicy.o
> >  obj-$(CONFIG_SPARSEMEM)      += sparse.o
> >  obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 1f3bf1710b66..25f9e8e9fc4a 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -42,6 +42,7 @@
> >  #include <linux/userfaultfd_k.h>
> >  #include <linux/page_owner.h>
> >  #include "internal.h"
> > +#include "hugetlb_vmemmap.h"
> >
> >  int hugetlb_max_hstate __read_mostly;
> >  unsigned int default_hstate_idx;
> > @@ -3206,6 +3207,8 @@ void __init hugetlb_add_hstate(unsigned int order)
> >       snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
> >                                       huge_page_size(h)/1024);
> >
> > +     hugetlb_vmemmap_init(h);
> > +
> >       parsed_hstate = h;
> >  }
> >
> > diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> > new file mode 100644
> > index 000000000000..51152e258f39
> > --- /dev/null
> > +++ b/mm/hugetlb_vmemmap.c
> > @@ -0,0 +1,129 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * Free some vmemmap pages of HugeTLB
> > + *
> > + * Copyright (c) 2020, Bytedance. All rights reserved.
> > + *
> > + *     Author: Muchun Song <songmuchun@bytedance.com>
> > + *
> > + * The struct page structures (page structs) are used to describe a physical
> > + * page frame. By default, there is a one-to-one mapping from a page frame to
> > + * it's corresponding page struct.
> > + *
> > + * The HugeTLB pages consist of multiple base page size pages and is supported
> > + * by many architectures. See hugetlbpage.rst in the Documentation directory
> > + * for more details. On the x86 architecture, HugeTLB pages of size 2MB and 1GB
> > + * are currently supported. Since the base page size on x86 is 4KB, a 2MB
> > + * HugeTLB page consists of 512 base pages and a 1GB HugeTLB page consists of
> > + * 4096 base pages. For each base page, there is a corresponding page struct.
> > + *
> > + * Within the HugeTLB subsystem, only the first 4 page structs are used to
> > + * contain unique information about a HugeTLB page. HUGETLB_CGROUP_MIN_ORDER
> > + * provides this upper limit. The only 'useful' information in the remaining
> > + * page structs is the compound_head field, and this field is the same for all
> > + * tail pages.
> > + *
> > + * By removing redundant page structs for HugeTLB pages, memory can returned to
> > + * the buddy allocator for other uses.
> > + *
> > + * When the system boot up, every 2M HugeTLB has 512 struct page structs which
> > + * size is 8 pages(sizeof(struct page) * 512 / PAGE_SIZE).
>
>
> You should try to generalize all descriptions regarding differing base
> page sizes. E.g., arm64 supports 4k, 16k, and 64k base pages.

Will do. Thanks.

>
> [...]
>
> > @@ -0,0 +1,20 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * Free some vmemmap pages of HugeTLB
> > + *
> > + * Copyright (c) 2020, Bytedance. All rights reserved.
> > + *
> > + *     Author: Muchun Song <songmuchun@bytedance.com>
> > + */
> > +#ifndef _LINUX_HUGETLB_VMEMMAP_H
> > +#define _LINUX_HUGETLB_VMEMMAP_H
> > +#include <linux/hugetlb.h>
> > +
> > +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
> > +void __init hugetlb_vmemmap_init(struct hstate *h);
> > +#else
> > +static inline void hugetlb_vmemmap_init(struct hstate *h)
> > +{
> > +}
> > +#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */
> > +#endif /* _LINUX_HUGETLB_VMEMMAP_H */
> >
>
> This patch as it stands is rather sub-optimal. I mean, all it does is
> add documentation and print what could be done.
>
> Can we instead introduce the basic infrastructure and enable it via this
> patch on top, where we glue all the pieces together? Or is there
> something I am missing?

Maybe we can make CONFIG_HUGETLB_PAGE_FREE_VMEMMAP default to n in the
Kconfig and, when everything is ready, change the default to y. Right?


>
> --
> Thanks,
>
> David / dhildenb
>


--
Yours,
Muchun
David Hildenbrand Dec. 9, 2020, 8:54 a.m. UTC | #3
On 07.12.20 14:11, Muchun Song wrote:
> On Mon, Dec 7, 2020 at 8:36 PM David Hildenbrand <david@redhat.com> wrote:
>>
>> On 30.11.20 16:18, Muchun Song wrote:
>>> Every HugeTLB has more than one struct page structure. The 2M HugeTLB
>>> has 512 struct page structure and 1G HugeTLB has 4096 struct page
>>> structures. We __know__ that we only use the first 4(HUGETLB_CGROUP_MIN_ORDER)
>>> struct page structures to store metadata associated with each HugeTLB.
>>>
>>> There are a lot of struct page structures(8 page frames for 2MB HugeTLB
>>> page and 4096 page frames for 1GB HugeTLB page) associated with each
>>> HugeTLB page. For tail pages, the value of compound_head is the same.
>>> So we can reuse first page of tail page structures. We map the virtual
>>> addresses of the remaining pages of tail page structures to the first
>>> tail page struct, and then free these page frames. Therefore, we need
>>> to reserve two pages as vmemmap areas.
>>>
>>> So we introduce a new nr_free_vmemmap_pages field in the hstate to
>>> indicate how many vmemmap pages associated with a HugeTLB page that we
>>> can free to buddy system.
>>>
>>> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
>>> Acked-by: Mike Kravetz <mike.kravetz@oracle.com>
>>> ---
>>>  include/linux/hugetlb.h |   3 ++
>>>  mm/Makefile             |   1 +
>>>  mm/hugetlb.c            |   3 ++
>>>  mm/hugetlb_vmemmap.c    | 129 ++++++++++++++++++++++++++++++++++++++++++++++++
>>>  mm/hugetlb_vmemmap.h    |  20 ++++++++
>>>  5 files changed, 156 insertions(+)
>>>  create mode 100644 mm/hugetlb_vmemmap.c
>>>  create mode 100644 mm/hugetlb_vmemmap.h
>>>
>>> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
>>> index ebca2ef02212..4efeccb7192c 100644
>>> --- a/include/linux/hugetlb.h
>>> +++ b/include/linux/hugetlb.h
>>> @@ -492,6 +492,9 @@ struct hstate {
>>>       unsigned int nr_huge_pages_node[MAX_NUMNODES];
>>>       unsigned int free_huge_pages_node[MAX_NUMNODES];
>>>       unsigned int surplus_huge_pages_node[MAX_NUMNODES];
>>> +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
>>> +     unsigned int nr_free_vmemmap_pages;
>>> +#endif
>>>  #ifdef CONFIG_CGROUP_HUGETLB
>>>       /* cgroup control files */
>>>       struct cftype cgroup_files_dfl[7];
>>> diff --git a/mm/Makefile b/mm/Makefile
>>> index ed4b88fa0f5e..056801d8daae 100644
>>> --- a/mm/Makefile
>>> +++ b/mm/Makefile
>>> @@ -71,6 +71,7 @@ obj-$(CONFIG_FRONTSWAP)     += frontswap.o
>>>  obj-$(CONFIG_ZSWAP)  += zswap.o
>>>  obj-$(CONFIG_HAS_DMA)        += dmapool.o
>>>  obj-$(CONFIG_HUGETLBFS)      += hugetlb.o
>>> +obj-$(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP)      += hugetlb_vmemmap.o
>>>  obj-$(CONFIG_NUMA)   += mempolicy.o
>>>  obj-$(CONFIG_SPARSEMEM)      += sparse.o
>>>  obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
>>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>>> index 1f3bf1710b66..25f9e8e9fc4a 100644
>>> --- a/mm/hugetlb.c
>>> +++ b/mm/hugetlb.c
>>> @@ -42,6 +42,7 @@
>>>  #include <linux/userfaultfd_k.h>
>>>  #include <linux/page_owner.h>
>>>  #include "internal.h"
>>> +#include "hugetlb_vmemmap.h"
>>>
>>>  int hugetlb_max_hstate __read_mostly;
>>>  unsigned int default_hstate_idx;
>>> @@ -3206,6 +3207,8 @@ void __init hugetlb_add_hstate(unsigned int order)
>>>       snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
>>>                                       huge_page_size(h)/1024);
>>>
>>> +     hugetlb_vmemmap_init(h);
>>> +
>>>       parsed_hstate = h;
>>>  }
>>>
>>> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
>>> new file mode 100644
>>> index 000000000000..51152e258f39
>>> --- /dev/null
>>> +++ b/mm/hugetlb_vmemmap.c
>>> @@ -0,0 +1,129 @@
>>> +// SPDX-License-Identifier: GPL-2.0
>>> +/*
>>> + * Free some vmemmap pages of HugeTLB
>>> + *
>>> + * Copyright (c) 2020, Bytedance. All rights reserved.
>>> + *
>>> + *     Author: Muchun Song <songmuchun@bytedance.com>
>>> + *
>>> + * The struct page structures (page structs) are used to describe a physical
>>> + * page frame. By default, there is a one-to-one mapping from a page frame to
>>> + * it's corresponding page struct.
>>> + *
>>> + * The HugeTLB pages consist of multiple base page size pages and is supported
>>> + * by many architectures. See hugetlbpage.rst in the Documentation directory
>>> + * for more details. On the x86 architecture, HugeTLB pages of size 2MB and 1GB
>>> + * are currently supported. Since the base page size on x86 is 4KB, a 2MB
>>> + * HugeTLB page consists of 512 base pages and a 1GB HugeTLB page consists of
>>> + * 4096 base pages. For each base page, there is a corresponding page struct.
>>> + *
>>> + * Within the HugeTLB subsystem, only the first 4 page structs are used to
>>> + * contain unique information about a HugeTLB page. HUGETLB_CGROUP_MIN_ORDER
>>> + * provides this upper limit. The only 'useful' information in the remaining
>>> + * page structs is the compound_head field, and this field is the same for all
>>> + * tail pages.
>>> + *
>>> + * By removing redundant page structs for HugeTLB pages, memory can returned to
>>> + * the buddy allocator for other uses.
>>> + *
>>> + * When the system boot up, every 2M HugeTLB has 512 struct page structs which
>>> + * size is 8 pages(sizeof(struct page) * 512 / PAGE_SIZE).
>>
>>
>> You should try to generalize all descriptions regarding differing base
>> page sizes. E.g., arm64 supports 4k, 16k, and 64k base pages.
> 
> Will do. Thanks.
> 
>>
>> [...]
>>
>>> @@ -0,0 +1,20 @@
>>> +// SPDX-License-Identifier: GPL-2.0
>>> +/*
>>> + * Free some vmemmap pages of HugeTLB
>>> + *
>>> + * Copyright (c) 2020, Bytedance. All rights reserved.
>>> + *
>>> + *     Author: Muchun Song <songmuchun@bytedance.com>
>>> + */
>>> +#ifndef _LINUX_HUGETLB_VMEMMAP_H
>>> +#define _LINUX_HUGETLB_VMEMMAP_H
>>> +#include <linux/hugetlb.h>
>>> +
>>> +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
>>> +void __init hugetlb_vmemmap_init(struct hstate *h);
>>> +#else
>>> +static inline void hugetlb_vmemmap_init(struct hstate *h)
>>> +{
>>> +}
>>> +#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */
>>> +#endif /* _LINUX_HUGETLB_VMEMMAP_H */
>>>
>>
>> This patch as it stands is rather sub-optimal. I mean, all it does is
>> add documentation and print what could be done.
>>
>> Can we instead introduce the basic infrastructure and enable it via this
>> patch on top, where we glue all the pieces together? Or is there
>> something I am missing?
> 
> Maybe we can make the config of CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
> default n in the Kconfig. When everything is ready, then make it
> default to y. Right?

I think it can make sense to introduce the
CONFIG_HUGETLB_PAGE_FREE_VMEMMAP option first if necessary for other
patches. But I think the documentation and the dummy call should
rather be moved to the end of the series where you glue everything you
introduced together and officially unlock the feature. Others might
disagree :)

BTW, I'm planning on reviewing the other parts of this series, I'm just
fairly busy, so it might take a while (I think we're targeting 5.12
either way as the 5.11 merge window will start fairly soon).
Muchun Song Dec. 9, 2020, 9:27 a.m. UTC | #4
On Wed, Dec 9, 2020 at 4:54 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 07.12.20 14:11, Muchun Song wrote:
> > On Mon, Dec 7, 2020 at 8:36 PM David Hildenbrand <david@redhat.com> wrote:
> >>
> >> On 30.11.20 16:18, Muchun Song wrote:
> >>> Every HugeTLB has more than one struct page structure. The 2M HugeTLB
> >>> has 512 struct page structure and 1G HugeTLB has 4096 struct page
> >>> structures. We __know__ that we only use the first 4(HUGETLB_CGROUP_MIN_ORDER)
> >>> struct page structures to store metadata associated with each HugeTLB.
> >>>
> >>> There are a lot of struct page structures(8 page frames for 2MB HugeTLB
> >>> page and 4096 page frames for 1GB HugeTLB page) associated with each
> >>> HugeTLB page. For tail pages, the value of compound_head is the same.
> >>> So we can reuse first page of tail page structures. We map the virtual
> >>> addresses of the remaining pages of tail page structures to the first
> >>> tail page struct, and then free these page frames. Therefore, we need
> >>> to reserve two pages as vmemmap areas.
> >>>
> >>> So we introduce a new nr_free_vmemmap_pages field in the hstate to
> >>> indicate how many vmemmap pages associated with a HugeTLB page that we
> >>> can free to buddy system.
> >>>
> >>> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> >>> Acked-by: Mike Kravetz <mike.kravetz@oracle.com>
> >>> ---
> >>>  include/linux/hugetlb.h |   3 ++
> >>>  mm/Makefile             |   1 +
> >>>  mm/hugetlb.c            |   3 ++
> >>>  mm/hugetlb_vmemmap.c    | 129 ++++++++++++++++++++++++++++++++++++++++++++++++
> >>>  mm/hugetlb_vmemmap.h    |  20 ++++++++
> >>>  5 files changed, 156 insertions(+)
> >>>  create mode 100644 mm/hugetlb_vmemmap.c
> >>>  create mode 100644 mm/hugetlb_vmemmap.h
> >>>
> >>> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> >>> index ebca2ef02212..4efeccb7192c 100644
> >>> --- a/include/linux/hugetlb.h
> >>> +++ b/include/linux/hugetlb.h
> >>> @@ -492,6 +492,9 @@ struct hstate {
> >>>       unsigned int nr_huge_pages_node[MAX_NUMNODES];
> >>>       unsigned int free_huge_pages_node[MAX_NUMNODES];
> >>>       unsigned int surplus_huge_pages_node[MAX_NUMNODES];
> >>> +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
> >>> +     unsigned int nr_free_vmemmap_pages;
> >>> +#endif
> >>>  #ifdef CONFIG_CGROUP_HUGETLB
> >>>       /* cgroup control files */
> >>>       struct cftype cgroup_files_dfl[7];
> >>> diff --git a/mm/Makefile b/mm/Makefile
> >>> index ed4b88fa0f5e..056801d8daae 100644
> >>> --- a/mm/Makefile
> >>> +++ b/mm/Makefile
> >>> @@ -71,6 +71,7 @@ obj-$(CONFIG_FRONTSWAP)     += frontswap.o
> >>>  obj-$(CONFIG_ZSWAP)  += zswap.o
> >>>  obj-$(CONFIG_HAS_DMA)        += dmapool.o
> >>>  obj-$(CONFIG_HUGETLBFS)      += hugetlb.o
> >>> +obj-$(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP)      += hugetlb_vmemmap.o
> >>>  obj-$(CONFIG_NUMA)   += mempolicy.o
> >>>  obj-$(CONFIG_SPARSEMEM)      += sparse.o
> >>>  obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
> >>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> >>> index 1f3bf1710b66..25f9e8e9fc4a 100644
> >>> --- a/mm/hugetlb.c
> >>> +++ b/mm/hugetlb.c
> >>> @@ -42,6 +42,7 @@
> >>>  #include <linux/userfaultfd_k.h>
> >>>  #include <linux/page_owner.h>
> >>>  #include "internal.h"
> >>> +#include "hugetlb_vmemmap.h"
> >>>
> >>>  int hugetlb_max_hstate __read_mostly;
> >>>  unsigned int default_hstate_idx;
> >>> @@ -3206,6 +3207,8 @@ void __init hugetlb_add_hstate(unsigned int order)
> >>>       snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
> >>>                                       huge_page_size(h)/1024);
> >>>
> >>> +     hugetlb_vmemmap_init(h);
> >>> +
> >>>       parsed_hstate = h;
> >>>  }
> >>>
> >>> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> >>> new file mode 100644
> >>> index 000000000000..51152e258f39
> >>> --- /dev/null
> >>> +++ b/mm/hugetlb_vmemmap.c
> >>> @@ -0,0 +1,129 @@
> >>> +// SPDX-License-Identifier: GPL-2.0
> >>> +/*
> >>> + * Free some vmemmap pages of HugeTLB
> >>> + *
> >>> + * Copyright (c) 2020, Bytedance. All rights reserved.
> >>> + *
> >>> + *     Author: Muchun Song <songmuchun@bytedance.com>
> >>> + *
> >>> + * The struct page structures (page structs) are used to describe a physical
> >>> + * page frame. By default, there is a one-to-one mapping from a page frame to
> >>> + * it's corresponding page struct.
> >>> + *
> >>> + * The HugeTLB pages consist of multiple base page size pages and is supported
> >>> + * by many architectures. See hugetlbpage.rst in the Documentation directory
> >>> + * for more details. On the x86 architecture, HugeTLB pages of size 2MB and 1GB
> >>> + * are currently supported. Since the base page size on x86 is 4KB, a 2MB
> >>> + * HugeTLB page consists of 512 base pages and a 1GB HugeTLB page consists of
> >>> + * 4096 base pages. For each base page, there is a corresponding page struct.
> >>> + *
> >>> + * Within the HugeTLB subsystem, only the first 4 page structs are used to
> >>> + * contain unique information about a HugeTLB page. HUGETLB_CGROUP_MIN_ORDER
> >>> + * provides this upper limit. The only 'useful' information in the remaining
> >>> + * page structs is the compound_head field, and this field is the same for all
> >>> + * tail pages.
> >>> + *
> >>> + * By removing redundant page structs for HugeTLB pages, memory can returned to
> >>> + * the buddy allocator for other uses.
> >>> + *
> >>> + * When the system boot up, every 2M HugeTLB has 512 struct page structs which
> >>> + * size is 8 pages(sizeof(struct page) * 512 / PAGE_SIZE).
> >>
> >>
> >> You should try to generalize all descriptions regarding differing base
> >> page sizes. E.g., arm64 supports 4k, 16k, and 64k base pages.
> >
> > Will do. Thanks.
> >
> >>
> >> [...]
> >>
> >>> @@ -0,0 +1,20 @@
> >>> +// SPDX-License-Identifier: GPL-2.0
> >>> +/*
> >>> + * Free some vmemmap pages of HugeTLB
> >>> + *
> >>> + * Copyright (c) 2020, Bytedance. All rights reserved.
> >>> + *
> >>> + *     Author: Muchun Song <songmuchun@bytedance.com>
> >>> + */
> >>> +#ifndef _LINUX_HUGETLB_VMEMMAP_H
> >>> +#define _LINUX_HUGETLB_VMEMMAP_H
> >>> +#include <linux/hugetlb.h>
> >>> +
> >>> +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
> >>> +void __init hugetlb_vmemmap_init(struct hstate *h);
> >>> +#else
> >>> +static inline void hugetlb_vmemmap_init(struct hstate *h)
> >>> +{
> >>> +}
> >>> +#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */
> >>> +#endif /* _LINUX_HUGETLB_VMEMMAP_H */
> >>>
> >>
> >> This patch as it stands is rather sub-optimal. I mean, all it does is
> >> add documentation and print what could be done.
> >>
> >> Can we instead introduce the basic infrastructure and enable it via this
> >> patch on top, where we glue all the pieces together? Or is there
> >> something I am missing?
> >
> > Maybe we can make the config of CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
> > default n in the Kconfig. When everything is ready, then make it
> > default to y. Right?
>
> I think it can make sense to introduce the
> CONFIG_HUGETLB_PAGE_FREE_VMEMMAP option first if necessary for other
> patches. But I think the the documentation and the dummy call should
> rather be moved to the end of the series where you glue everything you
> introduced together and officially unlock the feature. Others might
> disagree :)

I see. Thanks for your suggestions.

>
> BTW, I'm planning on reviewing the other parts of this series, I'm just
> fairly busy, so it might take a while (I think we're targeting 5.12
> either way as the 5.11 merge window will start fairly soon).
>

Thanks very much.

> --
> Thanks,
>
> David / dhildenb
>
diff mbox series

Patch

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index ebca2ef02212..4efeccb7192c 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -492,6 +492,9 @@  struct hstate {
 	unsigned int nr_huge_pages_node[MAX_NUMNODES];
 	unsigned int free_huge_pages_node[MAX_NUMNODES];
 	unsigned int surplus_huge_pages_node[MAX_NUMNODES];
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+	unsigned int nr_free_vmemmap_pages;
+#endif
 #ifdef CONFIG_CGROUP_HUGETLB
 	/* cgroup control files */
 	struct cftype cgroup_files_dfl[7];
diff --git a/mm/Makefile b/mm/Makefile
index ed4b88fa0f5e..056801d8daae 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -71,6 +71,7 @@  obj-$(CONFIG_FRONTSWAP)	+= frontswap.o
 obj-$(CONFIG_ZSWAP)	+= zswap.o
 obj-$(CONFIG_HAS_DMA)	+= dmapool.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
+obj-$(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP)	+= hugetlb_vmemmap.o
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
 obj-$(CONFIG_SPARSEMEM)	+= sparse.o
 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1f3bf1710b66..25f9e8e9fc4a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -42,6 +42,7 @@ 
 #include <linux/userfaultfd_k.h>
 #include <linux/page_owner.h>
 #include "internal.h"
+#include "hugetlb_vmemmap.h"
 
 int hugetlb_max_hstate __read_mostly;
 unsigned int default_hstate_idx;
@@ -3206,6 +3207,8 @@  void __init hugetlb_add_hstate(unsigned int order)
 	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
 					huge_page_size(h)/1024);
 
+	hugetlb_vmemmap_init(h);
+
 	parsed_hstate = h;
 }
 
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
new file mode 100644
index 000000000000..51152e258f39
--- /dev/null
+++ b/mm/hugetlb_vmemmap.c
@@ -0,0 +1,129 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Free some vmemmap pages of HugeTLB
+ *
+ * Copyright (c) 2020, Bytedance. All rights reserved.
+ *
+ *     Author: Muchun Song <songmuchun@bytedance.com>
+ *
+ * The struct page structures (page structs) are used to describe a physical
+ * page frame. By default, there is a one-to-one mapping from a page frame to
+ * its corresponding page struct.
+ *
+ * HugeTLB pages consist of multiple base-page-sized pages and are supported
+ * by many architectures. See hugetlbpage.rst in the Documentation directory
+ * for more details. On the x86 architecture, HugeTLB pages of size 2MB and 1GB
+ * are currently supported. Since the base page size on x86 is 4KB, a 2MB
+ * HugeTLB page consists of 512 base pages and a 1GB HugeTLB page consists of
+ * 262144 base pages. For each base page, there is a corresponding page struct.
+ *
+ * Within the HugeTLB subsystem, only the first 4 page structs are used to
+ * contain unique information about a HugeTLB page. HUGETLB_CGROUP_MIN_ORDER
+ * provides this upper limit. The only 'useful' information in the remaining
+ * page structs is the compound_head field, and this field is the same for all
+ * tail pages.
+ *
+ * By removing redundant page structs for HugeTLB pages, memory can be returned
+ * to the buddy allocator for other uses.
+ *
+ * When the system boots up, every 2MB HugeTLB page has 512 page structs, whose
+ * total size is 8 pages (sizeof(struct page) * 512 / PAGE_SIZE).
+ *
+ *    HugeTLB                  struct pages(8 pages)         page frame(8 pages)
+ * +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
+ * |           |                     |     0     | -------------> |     0     |
+ * |           |                     +-----------+                +-----------+
+ * |           |                     |     1     | -------------> |     1     |
+ * |           |                     +-----------+                +-----------+
+ * |           |                     |     2     | -------------> |     2     |
+ * |           |                     +-----------+                +-----------+
+ * |           |                     |     3     | -------------> |     3     |
+ * |           |                     +-----------+                +-----------+
+ * |           |                     |     4     | -------------> |     4     |
+ * |    2MB    |                     +-----------+                +-----------+
+ * |           |                     |     5     | -------------> |     5     |
+ * |           |                     +-----------+                +-----------+
+ * |           |                     |     6     | -------------> |     6     |
+ * |           |                     +-----------+                +-----------+
+ * |           |                     |     7     | -------------> |     7     |
+ * |           |                     +-----------+                +-----------+
+ * |           |
+ * |           |
+ * |           |
+ * +-----------+
+ *
+ * The value of page->compound_head is the same for all tail pages. The first
+ * page of page structs (page 0) associated with the HugeTLB page contains the 4
+ * page structs necessary to describe the HugeTLB. The only use of the remaining
+ * pages of page structs (page 1 to page 7) is to point to page->compound_head.
+ * Therefore, we can remap pages 2 to 7 to page 1. Only 2 pages of page structs
+ * will be used for each HugeTLB page. This will allow us to free the remaining
+ * 6 pages to the buddy allocator.
+ *
+ * Here is how things look after remapping.
+ *
+ *    HugeTLB                  struct pages(8 pages)         page frame(8 pages)
+ * +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
+ * |           |                     |     0     | -------------> |     0     |
+ * |           |                     +-----------+                +-----------+
+ * |           |                     |     1     | -------------> |     1     |
+ * |           |                     +-----------+                +-----------+
+ * |           |                     |     2     | ----------------^ ^ ^ ^ ^ ^
+ * |           |                     +-----------+                   | | | | |
+ * |           |                     |     3     | ------------------+ | | | |
+ * |           |                     +-----------+                     | | | |
+ * |           |                     |     4     | --------------------+ | | |
+ * |    2MB    |                     +-----------+                       | | |
+ * |           |                     |     5     | ----------------------+ | |
+ * |           |                     +-----------+                         | |
+ * |           |                     |     6     | ------------------------+ |
+ * |           |                     +-----------+                           |
+ * |           |                     |     7     | --------------------------+
+ * |           |                     +-----------+
+ * |           |
+ * |           |
+ * |           |
+ * +-----------+
+ *
+ * When a HugeTLB is freed to the buddy system, we should allocate 6 pages for
+ * vmemmap pages and restore the previous mapping relationship.
+ *
+ * Apart from 2MB HugeTLB page, we also have 1GB HugeTLB page. It is similar
+ * to the 2MB HugeTLB page. We also can use this approach to free the vmemmap
+ * pages.
+ */
+#define pr_fmt(fmt)	"HugeTLB vmemmap: " fmt
+
+#include "hugetlb_vmemmap.h"
+
+/*
+ * There are a lot of struct page structures(8 page frames for 2MB HugeTLB page
+ * and 4096 page frames for 1GB HugeTLB page) associated with each HugeTLB page.
+ * For tail pages, the value of compound_head is the same. So we can reuse first
+ * page of tail page structures. We map the virtual addresses of the remaining
+ * pages of tail page structures to the first tail page struct, and then free
+ * these page frames. Therefore, we need to reserve two pages as vmemmap areas.
+ */
+#define RESERVE_VMEMMAP_NR		2U
+
+void __init hugetlb_vmemmap_init(struct hstate *h)
+{
+	unsigned int nr_pages = pages_per_huge_page(h);
+	unsigned int vmemmap_pages;
+
+	vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT;
+	/*
+	 * The head page and the first tail page are not freed to the buddy
+	 * system; the other pages are remapped to the first tail page, so
+	 * only those remaining pages can be freed.
+	 *
+	 * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true
+	 * on some architectures (e.g. aarch64). See Documentation/arm64/
+	 * hugetlbpage.rst for more details.
+	 */
+	if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR))
+		h->nr_free_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR;
+
+	pr_debug("can free %d vmemmap pages for %s\n", h->nr_free_vmemmap_pages,
+		 h->name);
+}
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
new file mode 100644
index 000000000000..40c0c7dfb60d
--- /dev/null
+++ b/mm/hugetlb_vmemmap.h
@@ -0,0 +1,20 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Free some vmemmap pages of HugeTLB
+ *
+ * Copyright (c) 2020, Bytedance. All rights reserved.
+ *
+ *     Author: Muchun Song <songmuchun@bytedance.com>
+ */
+#ifndef _LINUX_HUGETLB_VMEMMAP_H
+#define _LINUX_HUGETLB_VMEMMAP_H
+#include <linux/hugetlb.h>
+
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+void __init hugetlb_vmemmap_init(struct hstate *h);
+#else
+static inline void hugetlb_vmemmap_init(struct hstate *h)
+{
+}
+#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */
+#endif /* _LINUX_HUGETLB_VMEMMAP_H */