diff mbox series

[RFC,v1,3/4] mm: Override mTHP "enabled" defaults at kernel cmdline

Message ID 20240717071257.4141363-4-ryan.roberts@arm.com (mailing list archive)
State New
Headers show
Series Control folio sizes used for page cache memory | expand

Commit Message

Ryan Roberts July 17, 2024, 7:12 a.m. UTC
Add thp_anon= cmdline parameter to allow specifying the default
enablement of each supported anon THP size. The parameter accepts the
following format and can be provided multiple times to configure each
size:

thp_anon=<size>[KMG]:<value>

See Documentation/admin-guide/mm/transhuge.rst for more details.

Configuring the defaults at boot time is useful to allow early user
space to take advantage of mTHP before it's been configured through
sysfs.

Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
---
 .../admin-guide/kernel-parameters.txt         |  8 +++
 Documentation/admin-guide/mm/transhuge.rst    | 26 +++++++--
 mm/huge_memory.c                              | 55 ++++++++++++++++++-
 3 files changed, 82 insertions(+), 7 deletions(-)

Comments

Barry Song July 19, 2024, 12:46 a.m. UTC | #1
On Wed, Jul 17, 2024 at 7:13 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> Add thp_anon= cmdline parameter to allow specifying the default
> enablement of each supported anon THP size. The parameter accepts the
> following format and can be provided multiple times to configure each
> size:
>
> thp_anon=<size>[KMG]:<value>
>
> See Documentation/admin-guide/mm/transhuge.rst for more details.
>
> Configuring the defaults at boot time is useful to allow early user
> space to take advantage of mTHP before its been configured through
> sysfs.

This is exactly what I need and want to implement, as the current behavior
is problematic. We need to boot up the system and reach the point where
we can set up the sysfs interfaces to enable mTHP. By then, many processes
have already missed the opportunity to use mTHP.

On the other hand, userspace might have been tuned to detect that mTHP
is enabled, such as a .so library. However, it turns out we have had
inconsistent settings between the two stages - before and after mTHP
is enabled through the sysfs interfaces.

>
> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
> ---
>  .../admin-guide/kernel-parameters.txt         |  8 +++
>  Documentation/admin-guide/mm/transhuge.rst    | 26 +++++++--
>  mm/huge_memory.c                              | 55 ++++++++++++++++++-
>  3 files changed, 82 insertions(+), 7 deletions(-)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index bc55fb55cd26..48443ad12e3f 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -6592,6 +6592,14 @@
>                         <deci-seconds>: poll all this frequency
>                         0: no polling (default)
>
> +       thp_anon=       [KNL]
> +                       Format: <size>[KMG]:always|madvise|never|inherit
> +                       Can be used to control the default behavior of the
> +                       system with respect to anonymous transparent hugepages.
> +                       Can be used multiple times for multiple anon THP sizes.
> +                       See Documentation/admin-guide/mm/transhuge.rst for more
> +                       details.
> +
>         threadirqs      [KNL,EARLY]
>                         Force threading of all interrupt handlers except those
>                         marked explicitly IRQF_NO_THREAD.
> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> index 1aaf8e3a0b5a..f53d43d986e2 100644
> --- a/Documentation/admin-guide/mm/transhuge.rst
> +++ b/Documentation/admin-guide/mm/transhuge.rst
> @@ -311,13 +311,27 @@ performance.
>  Note that any changes to the allowed set of sizes only applies to future
>  file-backed THP allocations.
>
> -Boot parameter
> -==============
> +Boot parameters
> +===============
>
> -You can change the sysfs boot time defaults of Transparent Hugepage
> -Support by passing the parameter ``transparent_hugepage=always`` or
> -``transparent_hugepage=madvise`` or ``transparent_hugepage=never``
> -to the kernel command line.
> +You can change the sysfs boot time default for the top-level "enabled"
> +control by passing the parameter ``transparent_hugepage=always`` or
> +``transparent_hugepage=madvise`` or ``transparent_hugepage=never`` to the
> +kernel command line.
> +
> +Alternatively, each supported anonymous THP size can be controlled by
> +passing ``thp_anon=<size>[KMG]:<state>``, where ``<size>`` is the THP size
> +and ``<state>`` is one of ``always``, ``madvise``, ``never`` or
> +``inherit``.
> +
> +For example, the following will set 64K THP to ``always``::
> +
> +       thp_anon=64K:always
> +
> +``thp_anon=`` may be specified multiple times to configure all THP sizes as
> +required. If ``thp_anon=`` is specified at least once, any anon THP sizes
> +not explicitly configured on the command line are implicitly set to
> +``never``.
>
>  Hugepages in tmpfs/shmem
>  ========================
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 4249c0bc9388..794d2790d90d 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -82,6 +82,7 @@ unsigned long huge_anon_orders_madvise __read_mostly;
>  unsigned long huge_anon_orders_inherit __read_mostly;
>  unsigned long huge_file_orders_always __read_mostly;
>  int huge_file_exec_order __read_mostly = -1;
> +static bool anon_orders_configured;
>
>  unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
>                                          unsigned long vm_flags,
> @@ -763,7 +764,10 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
>          * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time
>          * constant so we have to do this here.
>          */
> -       huge_anon_orders_inherit = BIT(PMD_ORDER);
> +       if (!anon_orders_configured) {
> +               huge_anon_orders_inherit = BIT(PMD_ORDER);
> +               anon_orders_configured = true;
> +       }
>
>         /*
>          * For pagecache, default to enabling all orders. powerpc's PMD_ORDER
> @@ -955,6 +959,55 @@ static int __init setup_transparent_hugepage(char *str)
>  }
>  __setup("transparent_hugepage=", setup_transparent_hugepage);
>
> +static int __init setup_thp_anon(char *str)
> +{
> +       unsigned long size;
> +       char *state;
> +       int order;
> +       int ret = 0;
> +
> +       if (!str)
> +               goto out;
> +
> +       size = (unsigned long)memparse(str, &state);
> +       order = ilog2(size >> PAGE_SHIFT);
> +       if (*state != ':' || !is_power_of_2(size) || size <= PAGE_SIZE ||
> +           !(BIT(order) & THP_ORDERS_ALL_ANON))
> +               goto out;
> +
> +       state++;
> +
> +       if (!strcmp(state, "always")) {
> +               clear_bit(order, &huge_anon_orders_inherit);
> +               clear_bit(order, &huge_anon_orders_madvise);
> +               set_bit(order, &huge_anon_orders_always);
> +               ret = 1;
> +       } else if (!strcmp(state, "inherit")) {
> +               clear_bit(order, &huge_anon_orders_always);
> +               clear_bit(order, &huge_anon_orders_madvise);
> +               set_bit(order, &huge_anon_orders_inherit);
> +               ret = 1;
> +       } else if (!strcmp(state, "madvise")) {
> +               clear_bit(order, &huge_anon_orders_always);
> +               clear_bit(order, &huge_anon_orders_inherit);
> +               set_bit(order, &huge_anon_orders_madvise);
> +               ret = 1;
> +       } else if (!strcmp(state, "never")) {
> +               clear_bit(order, &huge_anon_orders_always);
> +               clear_bit(order, &huge_anon_orders_inherit);
> +               clear_bit(order, &huge_anon_orders_madvise);
> +               ret = 1;
> +       }
> +
> +       if (ret)
> +               anon_orders_configured = true;
> +out:
> +       if (!ret)
> +               pr_warn("thp_anon=%s: cannot parse, ignored\n", str);
> +       return ret;
> +}
> +__setup("thp_anon=", setup_thp_anon);
> +
>  pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
>  {
>         if (likely(vma->vm_flags & VM_WRITE))
> --
> 2.43.0
>

Thanks
Barry
Ryan Roberts July 19, 2024, 7:47 a.m. UTC | #2
On 19/07/2024 01:46, Barry Song wrote:
> On Wed, Jul 17, 2024 at 7:13 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>
>> Add thp_anon= cmdline parameter to allow specifying the default
>> enablement of each supported anon THP size. The parameter accepts the
>> following format and can be provided multiple times to configure each
>> size:
>>
>> thp_anon=<size>[KMG]:<value>
>>
>> See Documentation/admin-guide/mm/transhuge.rst for more details.
>>
>> Configuring the defaults at boot time is useful to allow early user
>> space to take advantage of mTHP before its been configured through
>> sysfs.
> 
> This is exactly what I need and want to implement, as the current behavior
> is problematic. We need to boot up the system and reach the point where
> we can set up the sys interfaces to enable mTHP. Many processes miss the
> opportunity to use mTHP.
> 
> On the other hand, userspace might have been tuned to detect that mTHP
> is enabled, such as a .so library. However, it turns out we have had
> inconsistent settings between the two stages - before and after setting
> mTHP enabled by sys interfaces.

Good feedback - sounds like I should separate out this patch from the rest of
the series to get it reviewed and merged faster?

> 
>>
>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>> ---
>>  .../admin-guide/kernel-parameters.txt         |  8 +++
>>  Documentation/admin-guide/mm/transhuge.rst    | 26 +++++++--
>>  mm/huge_memory.c                              | 55 ++++++++++++++++++-
>>  3 files changed, 82 insertions(+), 7 deletions(-)
>>
>> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
>> index bc55fb55cd26..48443ad12e3f 100644
>> --- a/Documentation/admin-guide/kernel-parameters.txt
>> +++ b/Documentation/admin-guide/kernel-parameters.txt
>> @@ -6592,6 +6592,14 @@
>>                         <deci-seconds>: poll all this frequency
>>                         0: no polling (default)
>>
>> +       thp_anon=       [KNL]
>> +                       Format: <size>[KMG]:always|madvise|never|inherit
>> +                       Can be used to control the default behavior of the
>> +                       system with respect to anonymous transparent hugepages.
>> +                       Can be used multiple times for multiple anon THP sizes.
>> +                       See Documentation/admin-guide/mm/transhuge.rst for more
>> +                       details.
>> +
>>         threadirqs      [KNL,EARLY]
>>                         Force threading of all interrupt handlers except those
>>                         marked explicitly IRQF_NO_THREAD.
>> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
>> index 1aaf8e3a0b5a..f53d43d986e2 100644
>> --- a/Documentation/admin-guide/mm/transhuge.rst
>> +++ b/Documentation/admin-guide/mm/transhuge.rst
>> @@ -311,13 +311,27 @@ performance.
>>  Note that any changes to the allowed set of sizes only applies to future
>>  file-backed THP allocations.
>>
>> -Boot parameter
>> -==============
>> +Boot parameters
>> +===============
>>
>> -You can change the sysfs boot time defaults of Transparent Hugepage
>> -Support by passing the parameter ``transparent_hugepage=always`` or
>> -``transparent_hugepage=madvise`` or ``transparent_hugepage=never``
>> -to the kernel command line.
>> +You can change the sysfs boot time default for the top-level "enabled"
>> +control by passing the parameter ``transparent_hugepage=always`` or
>> +``transparent_hugepage=madvise`` or ``transparent_hugepage=never`` to the
>> +kernel command line.
>> +
>> +Alternatively, each supported anonymous THP size can be controlled by
>> +passing ``thp_anon=<size>[KMG]:<state>``, where ``<size>`` is the THP size
>> +and ``<state>`` is one of ``always``, ``madvise``, ``never`` or
>> +``inherit``.
>> +
>> +For example, the following will set 64K THP to ``always``::
>> +
>> +       thp_anon=64K:always
>> +
>> +``thp_anon=`` may be specified multiple times to configure all THP sizes as
>> +required. If ``thp_anon=`` is specified at least once, any anon THP sizes
>> +not explicitly configured on the command line are implicitly set to
>> +``never``.
>>
>>  Hugepages in tmpfs/shmem
>>  ========================
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 4249c0bc9388..794d2790d90d 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -82,6 +82,7 @@ unsigned long huge_anon_orders_madvise __read_mostly;
>>  unsigned long huge_anon_orders_inherit __read_mostly;
>>  unsigned long huge_file_orders_always __read_mostly;
>>  int huge_file_exec_order __read_mostly = -1;
>> +static bool anon_orders_configured;
>>
>>  unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
>>                                          unsigned long vm_flags,
>> @@ -763,7 +764,10 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
>>          * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time
>>          * constant so we have to do this here.
>>          */
>> -       huge_anon_orders_inherit = BIT(PMD_ORDER);
>> +       if (!anon_orders_configured) {
>> +               huge_anon_orders_inherit = BIT(PMD_ORDER);
>> +               anon_orders_configured = true;
>> +       }
>>
>>         /*
>>          * For pagecache, default to enabling all orders. powerpc's PMD_ORDER
>> @@ -955,6 +959,55 @@ static int __init setup_transparent_hugepage(char *str)
>>  }
>>  __setup("transparent_hugepage=", setup_transparent_hugepage);
>>
>> +static int __init setup_thp_anon(char *str)
>> +{
>> +       unsigned long size;
>> +       char *state;
>> +       int order;
>> +       int ret = 0;
>> +
>> +       if (!str)
>> +               goto out;
>> +
>> +       size = (unsigned long)memparse(str, &state);
>> +       order = ilog2(size >> PAGE_SHIFT);
>> +       if (*state != ':' || !is_power_of_2(size) || size <= PAGE_SIZE ||
>> +           !(BIT(order) & THP_ORDERS_ALL_ANON))
>> +               goto out;
>> +
>> +       state++;
>> +
>> +       if (!strcmp(state, "always")) {
>> +               clear_bit(order, &huge_anon_orders_inherit);
>> +               clear_bit(order, &huge_anon_orders_madvise);
>> +               set_bit(order, &huge_anon_orders_always);
>> +               ret = 1;
>> +       } else if (!strcmp(state, "inherit")) {
>> +               clear_bit(order, &huge_anon_orders_always);
>> +               clear_bit(order, &huge_anon_orders_madvise);
>> +               set_bit(order, &huge_anon_orders_inherit);
>> +               ret = 1;
>> +       } else if (!strcmp(state, "madvise")) {
>> +               clear_bit(order, &huge_anon_orders_always);
>> +               clear_bit(order, &huge_anon_orders_inherit);
>> +               set_bit(order, &huge_anon_orders_madvise);
>> +               ret = 1;
>> +       } else if (!strcmp(state, "never")) {
>> +               clear_bit(order, &huge_anon_orders_always);
>> +               clear_bit(order, &huge_anon_orders_inherit);
>> +               clear_bit(order, &huge_anon_orders_madvise);
>> +               ret = 1;
>> +       }
>> +
>> +       if (ret)
>> +               anon_orders_configured = true;
>> +out:
>> +       if (!ret)
>> +               pr_warn("thp_anon=%s: cannot parse, ignored\n", str);
>> +       return ret;
>> +}
>> +__setup("thp_anon=", setup_thp_anon);
>> +
>>  pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
>>  {
>>         if (likely(vma->vm_flags & VM_WRITE))
>> --
>> 2.43.0
>>
> 
> Thanks
> Barry
Barry Song July 19, 2024, 7:52 a.m. UTC | #3
On Fri, Jul 19, 2024 at 7:48 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> On 19/07/2024 01:46, Barry Song wrote:
> > On Wed, Jul 17, 2024 at 7:13 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>
> >> Add thp_anon= cmdline parameter to allow specifying the default
> >> enablement of each supported anon THP size. The parameter accepts the
> >> following format and can be provided multiple times to configure each
> >> size:
> >>
> >> thp_anon=<size>[KMG]:<value>
> >>
> >> See Documentation/admin-guide/mm/transhuge.rst for more details.
> >>
> >> Configuring the defaults at boot time is useful to allow early user
> >> space to take advantage of mTHP before its been configured through
> >> sysfs.
> >
> > This is exactly what I need and want to implement, as the current behavior
> > is problematic. We need to boot up the system and reach the point where
> > we can set up the sys interfaces to enable mTHP. Many processes miss the
> > opportunity to use mTHP.
> >
> > On the other hand, userspace might have been tuned to detect that mTHP
> > is enabled, such as a .so library. However, it turns out we have had
> > inconsistent settings between the two stages - before and after setting
> > mTHP enabled by sys interfaces.
>
> Good feedback - sounds like I should separate out this patch from the rest of
> the series to get it reviewed and merged faster?

+1

>
> >
> >>
> >> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
> >> ---
> >>  .../admin-guide/kernel-parameters.txt         |  8 +++
> >>  Documentation/admin-guide/mm/transhuge.rst    | 26 +++++++--
> >>  mm/huge_memory.c                              | 55 ++++++++++++++++++-
> >>  3 files changed, 82 insertions(+), 7 deletions(-)
> >>
> >> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> >> index bc55fb55cd26..48443ad12e3f 100644
> >> --- a/Documentation/admin-guide/kernel-parameters.txt
> >> +++ b/Documentation/admin-guide/kernel-parameters.txt
> >> @@ -6592,6 +6592,14 @@
> >>                         <deci-seconds>: poll all this frequency
> >>                         0: no polling (default)
> >>
> >> +       thp_anon=       [KNL]
> >> +                       Format: <size>[KMG]:always|madvise|never|inherit
> >> +                       Can be used to control the default behavior of the
> >> +                       system with respect to anonymous transparent hugepages.
> >> +                       Can be used multiple times for multiple anon THP sizes.
> >> +                       See Documentation/admin-guide/mm/transhuge.rst for more
> >> +                       details.
> >> +
> >>         threadirqs      [KNL,EARLY]
> >>                         Force threading of all interrupt handlers except those
> >>                         marked explicitly IRQF_NO_THREAD.
> >> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> >> index 1aaf8e3a0b5a..f53d43d986e2 100644
> >> --- a/Documentation/admin-guide/mm/transhuge.rst
> >> +++ b/Documentation/admin-guide/mm/transhuge.rst
> >> @@ -311,13 +311,27 @@ performance.
> >>  Note that any changes to the allowed set of sizes only applies to future
> >>  file-backed THP allocations.
> >>
> >> -Boot parameter
> >> -==============
> >> +Boot parameters
> >> +===============
> >>
> >> -You can change the sysfs boot time defaults of Transparent Hugepage
> >> -Support by passing the parameter ``transparent_hugepage=always`` or
> >> -``transparent_hugepage=madvise`` or ``transparent_hugepage=never``
> >> -to the kernel command line.
> >> +You can change the sysfs boot time default for the top-level "enabled"
> >> +control by passing the parameter ``transparent_hugepage=always`` or
> >> +``transparent_hugepage=madvise`` or ``transparent_hugepage=never`` to the
> >> +kernel command line.
> >> +
> >> +Alternatively, each supported anonymous THP size can be controlled by
> >> +passing ``thp_anon=<size>[KMG]:<state>``, where ``<size>`` is the THP size
> >> +and ``<state>`` is one of ``always``, ``madvise``, ``never`` or
> >> +``inherit``.
> >> +
> >> +For example, the following will set 64K THP to ``always``::
> >> +
> >> +       thp_anon=64K:always
> >> +
> >> +``thp_anon=`` may be specified multiple times to configure all THP sizes as
> >> +required. If ``thp_anon=`` is specified at least once, any anon THP sizes
> >> +not explicitly configured on the command line are implicitly set to
> >> +``never``.
> >>
> >>  Hugepages in tmpfs/shmem
> >>  ========================
> >> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >> index 4249c0bc9388..794d2790d90d 100644
> >> --- a/mm/huge_memory.c
> >> +++ b/mm/huge_memory.c
> >> @@ -82,6 +82,7 @@ unsigned long huge_anon_orders_madvise __read_mostly;
> >>  unsigned long huge_anon_orders_inherit __read_mostly;
> >>  unsigned long huge_file_orders_always __read_mostly;
> >>  int huge_file_exec_order __read_mostly = -1;
> >> +static bool anon_orders_configured;
> >>
> >>  unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
> >>                                          unsigned long vm_flags,
> >> @@ -763,7 +764,10 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
> >>          * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time
> >>          * constant so we have to do this here.
> >>          */
> >> -       huge_anon_orders_inherit = BIT(PMD_ORDER);
> >> +       if (!anon_orders_configured) {
> >> +               huge_anon_orders_inherit = BIT(PMD_ORDER);
> >> +               anon_orders_configured = true;
> >> +       }
> >>
> >>         /*
> >>          * For pagecache, default to enabling all orders. powerpc's PMD_ORDER
> >> @@ -955,6 +959,55 @@ static int __init setup_transparent_hugepage(char *str)
> >>  }
> >>  __setup("transparent_hugepage=", setup_transparent_hugepage);
> >>
> >> +static int __init setup_thp_anon(char *str)
> >> +{
> >> +       unsigned long size;
> >> +       char *state;
> >> +       int order;
> >> +       int ret = 0;
> >> +
> >> +       if (!str)
> >> +               goto out;
> >> +
> >> +       size = (unsigned long)memparse(str, &state);
> >> +       order = ilog2(size >> PAGE_SHIFT);
> >> +       if (*state != ':' || !is_power_of_2(size) || size <= PAGE_SIZE ||
> >> +           !(BIT(order) & THP_ORDERS_ALL_ANON))
> >> +               goto out;
> >> +
> >> +       state++;
> >> +
> >> +       if (!strcmp(state, "always")) {
> >> +               clear_bit(order, &huge_anon_orders_inherit);
> >> +               clear_bit(order, &huge_anon_orders_madvise);
> >> +               set_bit(order, &huge_anon_orders_always);
> >> +               ret = 1;
> >> +       } else if (!strcmp(state, "inherit")) {
> >> +               clear_bit(order, &huge_anon_orders_always);
> >> +               clear_bit(order, &huge_anon_orders_madvise);
> >> +               set_bit(order, &huge_anon_orders_inherit);
> >> +               ret = 1;
> >> +       } else if (!strcmp(state, "madvise")) {
> >> +               clear_bit(order, &huge_anon_orders_always);
> >> +               clear_bit(order, &huge_anon_orders_inherit);
> >> +               set_bit(order, &huge_anon_orders_madvise);
> >> +               ret = 1;
> >> +       } else if (!strcmp(state, "never")) {
> >> +               clear_bit(order, &huge_anon_orders_always);
> >> +               clear_bit(order, &huge_anon_orders_inherit);
> >> +               clear_bit(order, &huge_anon_orders_madvise);
> >> +               ret = 1;
> >> +       }
> >> +
> >> +       if (ret)
> >> +               anon_orders_configured = true;
> >> +out:
> >> +       if (!ret)
> >> +               pr_warn("thp_anon=%s: cannot parse, ignored\n", str);
> >> +       return ret;
> >> +}
> >> +__setup("thp_anon=", setup_thp_anon);
> >> +
> >>  pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
> >>  {
> >>         if (likely(vma->vm_flags & VM_WRITE))
> >> --
> >> 2.43.0
> >>
> >
> > Thanks
> > Barry
>
>
Ryan Roberts July 19, 2024, 8:18 a.m. UTC | #4
On 19/07/2024 08:52, Barry Song wrote:
> On Fri, Jul 19, 2024 at 7:48 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>
>> On 19/07/2024 01:46, Barry Song wrote:
>>> On Wed, Jul 17, 2024 at 7:13 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>>
>>>> Add thp_anon= cmdline parameter to allow specifying the default
>>>> enablement of each supported anon THP size. The parameter accepts the
>>>> following format and can be provided multiple times to configure each
>>>> size:
>>>>
>>>> thp_anon=<size>[KMG]:<value>
>>>>
>>>> See Documentation/admin-guide/mm/transhuge.rst for more details.
>>>>
>>>> Configuring the defaults at boot time is useful to allow early user
>>>> space to take advantage of mTHP before its been configured through
>>>> sysfs.
>>>
>>> This is exactly what I need and want to implement, as the current behavior
>>> is problematic. We need to boot up the system and reach the point where
>>> we can set up the sys interfaces to enable mTHP. Many processes miss the
>>> opportunity to use mTHP.
>>>
>>> On the other hand, userspace might have been tuned to detect that mTHP
>>> is enabled, such as a .so library. However, it turns out we have had
>>> inconsistent settings between the two stages - before and after setting
>>> mTHP enabled by sys interfaces.
>>
>> Good feedback - sounds like I should separate out this patch from the rest of
>> the series to get it reviewed and merged faster?
> 
> +1

OK I'll wait a couple of days to see if anyone has any feedback against this
version, then I'll re-post this on its own.

> 
>>
>>>
>>>>
>>>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>>>> ---
>>>>  .../admin-guide/kernel-parameters.txt         |  8 +++
>>>>  Documentation/admin-guide/mm/transhuge.rst    | 26 +++++++--
>>>>  mm/huge_memory.c                              | 55 ++++++++++++++++++-
>>>>  3 files changed, 82 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
>>>> index bc55fb55cd26..48443ad12e3f 100644
>>>> --- a/Documentation/admin-guide/kernel-parameters.txt
>>>> +++ b/Documentation/admin-guide/kernel-parameters.txt
>>>> @@ -6592,6 +6592,14 @@
>>>>                         <deci-seconds>: poll all this frequency
>>>>                         0: no polling (default)
>>>>
>>>> +       thp_anon=       [KNL]
>>>> +                       Format: <size>[KMG]:always|madvise|never|inherit
>>>> +                       Can be used to control the default behavior of the
>>>> +                       system with respect to anonymous transparent hugepages.
>>>> +                       Can be used multiple times for multiple anon THP sizes.
>>>> +                       See Documentation/admin-guide/mm/transhuge.rst for more
>>>> +                       details.
>>>> +
>>>>         threadirqs      [KNL,EARLY]
>>>>                         Force threading of all interrupt handlers except those
>>>>                         marked explicitly IRQF_NO_THREAD.
>>>> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
>>>> index 1aaf8e3a0b5a..f53d43d986e2 100644
>>>> --- a/Documentation/admin-guide/mm/transhuge.rst
>>>> +++ b/Documentation/admin-guide/mm/transhuge.rst
>>>> @@ -311,13 +311,27 @@ performance.
>>>>  Note that any changes to the allowed set of sizes only applies to future
>>>>  file-backed THP allocations.
>>>>
>>>> -Boot parameter
>>>> -==============
>>>> +Boot parameters
>>>> +===============
>>>>
>>>> -You can change the sysfs boot time defaults of Transparent Hugepage
>>>> -Support by passing the parameter ``transparent_hugepage=always`` or
>>>> -``transparent_hugepage=madvise`` or ``transparent_hugepage=never``
>>>> -to the kernel command line.
>>>> +You can change the sysfs boot time default for the top-level "enabled"
>>>> +control by passing the parameter ``transparent_hugepage=always`` or
>>>> +``transparent_hugepage=madvise`` or ``transparent_hugepage=never`` to the
>>>> +kernel command line.
>>>> +
>>>> +Alternatively, each supported anonymous THP size can be controlled by
>>>> +passing ``thp_anon=<size>[KMG]:<state>``, where ``<size>`` is the THP size
>>>> +and ``<state>`` is one of ``always``, ``madvise``, ``never`` or
>>>> +``inherit``.
>>>> +
>>>> +For example, the following will set 64K THP to ``always``::
>>>> +
>>>> +       thp_anon=64K:always
>>>> +
>>>> +``thp_anon=`` may be specified multiple times to configure all THP sizes as
>>>> +required. If ``thp_anon=`` is specified at least once, any anon THP sizes
>>>> +not explicitly configured on the command line are implicitly set to
>>>> +``never``.
>>>>
>>>>  Hugepages in tmpfs/shmem
>>>>  ========================
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index 4249c0bc9388..794d2790d90d 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -82,6 +82,7 @@ unsigned long huge_anon_orders_madvise __read_mostly;
>>>>  unsigned long huge_anon_orders_inherit __read_mostly;
>>>>  unsigned long huge_file_orders_always __read_mostly;
>>>>  int huge_file_exec_order __read_mostly = -1;
>>>> +static bool anon_orders_configured;
>>>>
>>>>  unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
>>>>                                          unsigned long vm_flags,
>>>> @@ -763,7 +764,10 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
>>>>          * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time
>>>>          * constant so we have to do this here.
>>>>          */
>>>> -       huge_anon_orders_inherit = BIT(PMD_ORDER);
>>>> +       if (!anon_orders_configured) {
>>>> +               huge_anon_orders_inherit = BIT(PMD_ORDER);
>>>> +               anon_orders_configured = true;
>>>> +       }
>>>>
>>>>         /*
>>>>          * For pagecache, default to enabling all orders. powerpc's PMD_ORDER
>>>> @@ -955,6 +959,55 @@ static int __init setup_transparent_hugepage(char *str)
>>>>  }
>>>>  __setup("transparent_hugepage=", setup_transparent_hugepage);
>>>>
>>>> +static int __init setup_thp_anon(char *str)
>>>> +{
>>>> +       unsigned long size;
>>>> +       char *state;
>>>> +       int order;
>>>> +       int ret = 0;
>>>> +
>>>> +       if (!str)
>>>> +               goto out;
>>>> +
>>>> +       size = (unsigned long)memparse(str, &state);
>>>> +       order = ilog2(size >> PAGE_SHIFT);
>>>> +       if (*state != ':' || !is_power_of_2(size) || size <= PAGE_SIZE ||
>>>> +           !(BIT(order) & THP_ORDERS_ALL_ANON))
>>>> +               goto out;
>>>> +
>>>> +       state++;
>>>> +
>>>> +       if (!strcmp(state, "always")) {
>>>> +               clear_bit(order, &huge_anon_orders_inherit);
>>>> +               clear_bit(order, &huge_anon_orders_madvise);
>>>> +               set_bit(order, &huge_anon_orders_always);
>>>> +               ret = 1;
>>>> +       } else if (!strcmp(state, "inherit")) {
>>>> +               clear_bit(order, &huge_anon_orders_always);
>>>> +               clear_bit(order, &huge_anon_orders_madvise);
>>>> +               set_bit(order, &huge_anon_orders_inherit);
>>>> +               ret = 1;
>>>> +       } else if (!strcmp(state, "madvise")) {
>>>> +               clear_bit(order, &huge_anon_orders_always);
>>>> +               clear_bit(order, &huge_anon_orders_inherit);
>>>> +               set_bit(order, &huge_anon_orders_madvise);
>>>> +               ret = 1;
>>>> +       } else if (!strcmp(state, "never")) {
>>>> +               clear_bit(order, &huge_anon_orders_always);
>>>> +               clear_bit(order, &huge_anon_orders_inherit);
>>>> +               clear_bit(order, &huge_anon_orders_madvise);
>>>> +               ret = 1;
>>>> +       }
>>>> +
>>>> +       if (ret)
>>>> +               anon_orders_configured = true;
>>>> +out:
>>>> +       if (!ret)
>>>> +               pr_warn("thp_anon=%s: cannot parse, ignored\n", str);
>>>> +       return ret;
>>>> +}
>>>> +__setup("thp_anon=", setup_thp_anon);
>>>> +
>>>>  pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
>>>>  {
>>>>         if (likely(vma->vm_flags & VM_WRITE))
>>>> --
>>>> 2.43.0
>>>>
>>>
>>> Thanks
>>> Barry
>>
>>
David Hildenbrand July 19, 2024, 8:29 a.m. UTC | #5
On 19.07.24 09:52, Barry Song wrote:
> On Fri, Jul 19, 2024 at 7:48 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>
>> On 19/07/2024 01:46, Barry Song wrote:
>>> On Wed, Jul 17, 2024 at 7:13 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>>
>>>> Add thp_anon= cmdline parameter to allow specifying the default
>>>> enablement of each supported anon THP size. The parameter accepts the
>>>> following format and can be provided multiple times to configure each
>>>> size:
>>>>
>>>> thp_anon=<size>[KMG]:<value>
>>>>
>>>> See Documentation/admin-guide/mm/transhuge.rst for more details.
>>>>
>>>> Configuring the defaults at boot time is useful to allow early user
>>>> space to take advantage of mTHP before its been configured through
>>>> sysfs.
>>>
>>> This is exactly what I need and want to implement, as the current behavior
>>> is problematic. We need to boot up the system and reach the point where
>>> we can set up the sys interfaces to enable mTHP. Many processes miss the
>>> opportunity to use mTHP.
>>>
>>> On the other hand, userspace might have been tuned to detect that mTHP
>>> is enabled, such as a .so library. However, it turns out we have had
>>> inconsistent settings between the two stages - before and after setting
>>> mTHP enabled by sys interfaces.
>>
>> Good feedback - sounds like I should separate out this patch from the rest of
>> the series to get it reviewed and merged faster?
> 
> +1

Agreed, this is reasonable to have.
Daniel Gomez July 22, 2024, 9:13 a.m. UTC | #6
On Wed, Jul 17, 2024 at 08:12:55AM GMT, Ryan Roberts wrote:
> Add thp_anon= cmdline parameter to allow specifying the default
> enablement of each supported anon THP size. The parameter accepts the
> following format and can be provided multiple times to configure each
> size:
> 
> thp_anon=<size>[KMG]:<value>

Minor suggestion. Should this be renamed to hp_anon= or hugepages_anon= instead?
This would align with the values under /sys/kernel/mm/transparent_hugepage/
hugepages-*kB.

> 
> See Documentation/admin-guide/mm/transhuge.rst for more details.
> 
> Configuring the defaults at boot time is useful to allow early user
> space to take advantage of mTHP before its been configured through
> sysfs.
> 
> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
> ---
>  .../admin-guide/kernel-parameters.txt         |  8 +++
>  Documentation/admin-guide/mm/transhuge.rst    | 26 +++++++--
>  mm/huge_memory.c                              | 55 ++++++++++++++++++-
>  3 files changed, 82 insertions(+), 7 deletions(-)
> 
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index bc55fb55cd26..48443ad12e3f 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -6592,6 +6592,14 @@
>  			<deci-seconds>: poll all this frequency
>  			0: no polling (default)
>  
> +	thp_anon=	[KNL]
> +			Format: <size>[KMG]:always|madvise|never|inherit
> +			Can be used to control the default behavior of the
> +			system with respect to anonymous transparent hugepages.
> +			Can be used multiple times for multiple anon THP sizes.
> +			See Documentation/admin-guide/mm/transhuge.rst for more
> +			details.
> +
>  	threadirqs	[KNL,EARLY]
>  			Force threading of all interrupt handlers except those
>  			marked explicitly IRQF_NO_THREAD.
> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> index 1aaf8e3a0b5a..f53d43d986e2 100644
> --- a/Documentation/admin-guide/mm/transhuge.rst
> +++ b/Documentation/admin-guide/mm/transhuge.rst
> @@ -311,13 +311,27 @@ performance.
>  Note that any changes to the allowed set of sizes only applies to future
>  file-backed THP allocations.
>  
> -Boot parameter
> -==============
> +Boot parameters
> +===============
>  
> -You can change the sysfs boot time defaults of Transparent Hugepage
> -Support by passing the parameter ``transparent_hugepage=always`` or
> -``transparent_hugepage=madvise`` or ``transparent_hugepage=never``
> -to the kernel command line.
> +You can change the sysfs boot time default for the top-level "enabled"
> +control by passing the parameter ``transparent_hugepage=always`` or
> +``transparent_hugepage=madvise`` or ``transparent_hugepage=never`` to the
> +kernel command line.
> +
> +Alternatively, each supported anonymous THP size can be controlled by
> +passing ``thp_anon=<size>[KMG]:<state>``, where ``<size>`` is the THP size
> +and ``<state>`` is one of ``always``, ``madvise``, ``never`` or
> +``inherit``.
> +
> +For example, the following will set 64K THP to ``always``::
> +
> +	thp_anon=64K:always
> +
> +``thp_anon=`` may be specified multiple times to configure all THP sizes as
> +required. If ``thp_anon=`` is specified at least once, any anon THP sizes
> +not explicitly configured on the command line are implicitly set to
> +``never``.
>  
>  Hugepages in tmpfs/shmem
>  ========================
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 4249c0bc9388..794d2790d90d 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -82,6 +82,7 @@ unsigned long huge_anon_orders_madvise __read_mostly;
>  unsigned long huge_anon_orders_inherit __read_mostly;
>  unsigned long huge_file_orders_always __read_mostly;
>  int huge_file_exec_order __read_mostly = -1;
> +static bool anon_orders_configured;
>  
>  unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
>  					 unsigned long vm_flags,
> @@ -763,7 +764,10 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
>  	 * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time
>  	 * constant so we have to do this here.
>  	 */
> -	huge_anon_orders_inherit = BIT(PMD_ORDER);
> +	if (!anon_orders_configured) {
> +		huge_anon_orders_inherit = BIT(PMD_ORDER);

On systems with a 64K base page size, PMD_ORDER corresponds to a 512M THP, which
exceeds the xarray limit [1]. Therefore, I think we need to avoid PMD-size orders
by checking if PMD_ORDER > MAX_PAGECACHE_ORDER.

[1] https://lore.kernel.org/all/20240627003953.1262512-1-gshan@redhat.com/

> +		anon_orders_configured = true;
> +	}
>  
>  	/*
>  	 * For pagecache, default to enabling all orders. powerpc's PMD_ORDER
> @@ -955,6 +959,55 @@ static int __init setup_transparent_hugepage(char *str)
>  }
>  __setup("transparent_hugepage=", setup_transparent_hugepage);
>  
> +static int __init setup_thp_anon(char *str)
> +{
> +	unsigned long size;
> +	char *state;
> +	int order;
> +	int ret = 0;
> +
> +	if (!str)
> +		goto out;
> +
> +	size = (unsigned long)memparse(str, &state);
> +	order = ilog2(size >> PAGE_SHIFT);
> +	if (*state != ':' || !is_power_of_2(size) || size <= PAGE_SIZE ||
> +	    !(BIT(order) & THP_ORDERS_ALL_ANON))
> +		goto out;
> +
> +	state++;
> +
> +	if (!strcmp(state, "always")) {
> +		clear_bit(order, &huge_anon_orders_inherit);
> +		clear_bit(order, &huge_anon_orders_madvise);
> +		set_bit(order, &huge_anon_orders_always);
> +		ret = 1;
> +	} else if (!strcmp(state, "inherit")) {
> +		clear_bit(order, &huge_anon_orders_always);
> +		clear_bit(order, &huge_anon_orders_madvise);
> +		set_bit(order, &huge_anon_orders_inherit);
> +		ret = 1;
> +	} else if (!strcmp(state, "madvise")) {
> +		clear_bit(order, &huge_anon_orders_always);
> +		clear_bit(order, &huge_anon_orders_inherit);
> +		set_bit(order, &huge_anon_orders_madvise);
> +		ret = 1;
> +	} else if (!strcmp(state, "never")) {
> +		clear_bit(order, &huge_anon_orders_always);
> +		clear_bit(order, &huge_anon_orders_inherit);
> +		clear_bit(order, &huge_anon_orders_madvise);
> +		ret = 1;
> +	}
> +
> +	if (ret)
> +		anon_orders_configured = true;
> +out:
> +	if (!ret)
> +		pr_warn("thp_anon=%s: cannot parse, ignored\n", str);
> +	return ret;
> +}
> +__setup("thp_anon=", setup_thp_anon);
> +
>  pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
>  {
>  	if (likely(vma->vm_flags & VM_WRITE))
> -- 
> 2.43.0
>
Ryan Roberts July 22, 2024, 9:36 a.m. UTC | #7
On 22/07/2024 10:13, Daniel Gomez wrote:
> On Wed, Jul 17, 2024 at 08:12:55AM GMT, Ryan Roberts wrote:
>> Add thp_anon= cmdline parameter to allow specifying the default
>> enablement of each supported anon THP size. The parameter accepts the
>> following format and can be provided multiple times to configure each
>> size:
>>
>> thp_anon=<size>[KMG]:<value>
> 
> Minor suggestion. Should this be renamed to hp_anon= or hugepages_anon= instead?
> This would align with the values under /sys/kernel/mm/transparent_hugepage/
> hugepages-*kB.

"hp" doesn't feel right; that's not an abbreviation we use today to my knowledge.
But I'd be happy to change it to "hugepages_anon", if that's the consensus.

> 
>>
>> See Documentation/admin-guide/mm/transhuge.rst for more details.
>>
>> Configuring the defaults at boot time is useful to allow early user
>> space to take advantage of mTHP before its been configured through
>> sysfs.
>>
>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>> ---
>>  .../admin-guide/kernel-parameters.txt         |  8 +++
>>  Documentation/admin-guide/mm/transhuge.rst    | 26 +++++++--
>>  mm/huge_memory.c                              | 55 ++++++++++++++++++-
>>  3 files changed, 82 insertions(+), 7 deletions(-)
>>
>> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
>> index bc55fb55cd26..48443ad12e3f 100644
>> --- a/Documentation/admin-guide/kernel-parameters.txt
>> +++ b/Documentation/admin-guide/kernel-parameters.txt
>> @@ -6592,6 +6592,14 @@
>>  			<deci-seconds>: poll all this frequency
>>  			0: no polling (default)
>>  
>> +	thp_anon=	[KNL]
>> +			Format: <size>[KMG]:always|madvise|never|inherit
>> +			Can be used to control the default behavior of the
>> +			system with respect to anonymous transparent hugepages.
>> +			Can be used multiple times for multiple anon THP sizes.
>> +			See Documentation/admin-guide/mm/transhuge.rst for more
>> +			details.
>> +
>>  	threadirqs	[KNL,EARLY]
>>  			Force threading of all interrupt handlers except those
>>  			marked explicitly IRQF_NO_THREAD.
>> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
>> index 1aaf8e3a0b5a..f53d43d986e2 100644
>> --- a/Documentation/admin-guide/mm/transhuge.rst
>> +++ b/Documentation/admin-guide/mm/transhuge.rst
>> @@ -311,13 +311,27 @@ performance.
>>  Note that any changes to the allowed set of sizes only applies to future
>>  file-backed THP allocations.
>>  
>> -Boot parameter
>> -==============
>> +Boot parameters
>> +===============
>>  
>> -You can change the sysfs boot time defaults of Transparent Hugepage
>> -Support by passing the parameter ``transparent_hugepage=always`` or
>> -``transparent_hugepage=madvise`` or ``transparent_hugepage=never``
>> -to the kernel command line.
>> +You can change the sysfs boot time default for the top-level "enabled"
>> +control by passing the parameter ``transparent_hugepage=always`` or
>> +``transparent_hugepage=madvise`` or ``transparent_hugepage=never`` to the
>> +kernel command line.
>> +
>> +Alternatively, each supported anonymous THP size can be controlled by
>> +passing ``thp_anon=<size>[KMG]:<state>``, where ``<size>`` is the THP size
>> +and ``<state>`` is one of ``always``, ``madvise``, ``never`` or
>> +``inherit``.
>> +
>> +For example, the following will set 64K THP to ``always``::
>> +
>> +	thp_anon=64K:always
>> +
>> +``thp_anon=`` may be specified multiple times to configure all THP sizes as
>> +required. If ``thp_anon=`` is specified at least once, any anon THP sizes
>> +not explicitly configured on the command line are implicitly set to
>> +``never``.
>>  
>>  Hugepages in tmpfs/shmem
>>  ========================
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 4249c0bc9388..794d2790d90d 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -82,6 +82,7 @@ unsigned long huge_anon_orders_madvise __read_mostly;
>>  unsigned long huge_anon_orders_inherit __read_mostly;
>>  unsigned long huge_file_orders_always __read_mostly;
>>  int huge_file_exec_order __read_mostly = -1;
>> +static bool anon_orders_configured;
>>  
>>  unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
>>  					 unsigned long vm_flags,
>> @@ -763,7 +764,10 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
>>  	 * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time
>>  	 * constant so we have to do this here.
>>  	 */
>> -	huge_anon_orders_inherit = BIT(PMD_ORDER);
>> +	if (!anon_orders_configured) {
>> +		huge_anon_orders_inherit = BIT(PMD_ORDER);
> 
> PMD_ORDER for 64k base PS systems would result in a 512M value, which exceeds
> the xarray limit [1]. Therefore, I think we need to avoid PMD-size orders by
> checking if PMD_ORDER > MAX_PAGECACHE_ORDER.

This is for anon memory, which isn't installed in the page cache, so it's
independent of MAX_PAGECACHE_ORDER. I don't believe there is a problem here.

> 
> [1] https://lore.kernel.org/all/20240627003953.1262512-1-gshan@redhat.com/
> 
>> +		anon_orders_configured = true;
>> +	}
>>  
>>  	/*
>>  	 * For pagecache, default to enabling all orders. powerpc's PMD_ORDER
>> @@ -955,6 +959,55 @@ static int __init setup_transparent_hugepage(char *str)
>>  }
>>  __setup("transparent_hugepage=", setup_transparent_hugepage);
>>  
>> +static int __init setup_thp_anon(char *str)
>> +{
>> +	unsigned long size;
>> +	char *state;
>> +	int order;
>> +	int ret = 0;
>> +
>> +	if (!str)
>> +		goto out;
>> +
>> +	size = (unsigned long)memparse(str, &state);
>> +	order = ilog2(size >> PAGE_SHIFT);
>> +	if (*state != ':' || !is_power_of_2(size) || size <= PAGE_SIZE ||
>> +	    !(BIT(order) & THP_ORDERS_ALL_ANON))
>> +		goto out;
>> +
>> +	state++;
>> +
>> +	if (!strcmp(state, "always")) {
>> +		clear_bit(order, &huge_anon_orders_inherit);
>> +		clear_bit(order, &huge_anon_orders_madvise);
>> +		set_bit(order, &huge_anon_orders_always);
>> +		ret = 1;
>> +	} else if (!strcmp(state, "inherit")) {
>> +		clear_bit(order, &huge_anon_orders_always);
>> +		clear_bit(order, &huge_anon_orders_madvise);
>> +		set_bit(order, &huge_anon_orders_inherit);
>> +		ret = 1;
>> +	} else if (!strcmp(state, "madvise")) {
>> +		clear_bit(order, &huge_anon_orders_always);
>> +		clear_bit(order, &huge_anon_orders_inherit);
>> +		set_bit(order, &huge_anon_orders_madvise);
>> +		ret = 1;
>> +	} else if (!strcmp(state, "never")) {
>> +		clear_bit(order, &huge_anon_orders_always);
>> +		clear_bit(order, &huge_anon_orders_inherit);
>> +		clear_bit(order, &huge_anon_orders_madvise);
>> +		ret = 1;
>> +	}
>> +
>> +	if (ret)
>> +		anon_orders_configured = true;
>> +out:
>> +	if (!ret)
>> +		pr_warn("thp_anon=%s: cannot parse, ignored\n", str);
>> +	return ret;
>> +}
>> +__setup("thp_anon=", setup_thp_anon);
>> +
>>  pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
>>  {
>>  	if (likely(vma->vm_flags & VM_WRITE))
>> -- 
>> 2.43.0
Ryan Roberts July 22, 2024, 2:10 p.m. UTC | #8
On 22/07/2024 10:36, Ryan Roberts wrote:
> On 22/07/2024 10:13, Daniel Gomez wrote:
>> On Wed, Jul 17, 2024 at 08:12:55AM GMT, Ryan Roberts wrote:
>>> Add thp_anon= cmdline parameter to allow specifying the default
>>> enablement of each supported anon THP size. The parameter accepts the
>>> following format and can be provided multiple times to configure each
>>> size:
>>>
>>> thp_anon=<size>[KMG]:<value>
>>
>> Minor suggestion. Should this be renamed to hp_anon= or hugepages_anon= instead?
>> This would align with the values under /sys/kernel/mm/transparent_hugepage/
>> hugepages-*kB.
> 
> "hp" doesn't feel right; that's not an abreviation we use today to my knowledge.
> But I'd be happy to change it to "hugepages_anon", if that's the concensus.

Thinking about this a bit more, "hugepages=" is already a cmdline parameter used
to reserve hugepages for use with HugeTLB. So I think that could get confusing.

transparent_hugepage= is the existing cmdline parameter for the top-level (anon)
control. I considered "transparent_hugepage_anon=" or even just extending to use
the same parameter for both the top level and the per-size controls (with
optional size):

  transparent_hugepage=[<size>[KMG]:]<value>

But given they likely need to be provided multiple times, both of those options
seem too long. Which is how I settled on thp_anon= (and in the next patch,
thp_file=).

> 
>>
>>>
>>> See Documentation/admin-guide/mm/transhuge.rst for more details.
>>>
>>> Configuring the defaults at boot time is useful to allow early user
>>> space to take advantage of mTHP before its been configured through
>>> sysfs.
>>>
>>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>>> ---
>>>  .../admin-guide/kernel-parameters.txt         |  8 +++
>>>  Documentation/admin-guide/mm/transhuge.rst    | 26 +++++++--
>>>  mm/huge_memory.c                              | 55 ++++++++++++++++++-
>>>  3 files changed, 82 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
>>> index bc55fb55cd26..48443ad12e3f 100644
>>> --- a/Documentation/admin-guide/kernel-parameters.txt
>>> +++ b/Documentation/admin-guide/kernel-parameters.txt
>>> @@ -6592,6 +6592,14 @@
>>>  			<deci-seconds>: poll all this frequency
>>>  			0: no polling (default)
>>>  
>>> +	thp_anon=	[KNL]
>>> +			Format: <size>[KMG]:always|madvise|never|inherit
>>> +			Can be used to control the default behavior of the
>>> +			system with respect to anonymous transparent hugepages.
>>> +			Can be used multiple times for multiple anon THP sizes.
>>> +			See Documentation/admin-guide/mm/transhuge.rst for more
>>> +			details.
>>> +
>>>  	threadirqs	[KNL,EARLY]
>>>  			Force threading of all interrupt handlers except those
>>>  			marked explicitly IRQF_NO_THREAD.
>>> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
>>> index 1aaf8e3a0b5a..f53d43d986e2 100644
>>> --- a/Documentation/admin-guide/mm/transhuge.rst
>>> +++ b/Documentation/admin-guide/mm/transhuge.rst
>>> @@ -311,13 +311,27 @@ performance.
>>>  Note that any changes to the allowed set of sizes only applies to future
>>>  file-backed THP allocations.
>>>  
>>> -Boot parameter
>>> -==============
>>> +Boot parameters
>>> +===============
>>>  
>>> -You can change the sysfs boot time defaults of Transparent Hugepage
>>> -Support by passing the parameter ``transparent_hugepage=always`` or
>>> -``transparent_hugepage=madvise`` or ``transparent_hugepage=never``
>>> -to the kernel command line.
>>> +You can change the sysfs boot time default for the top-level "enabled"
>>> +control by passing the parameter ``transparent_hugepage=always`` or
>>> +``transparent_hugepage=madvise`` or ``transparent_hugepage=never`` to the
>>> +kernel command line.
>>> +
>>> +Alternatively, each supported anonymous THP size can be controlled by
>>> +passing ``thp_anon=<size>[KMG]:<state>``, where ``<size>`` is the THP size
>>> +and ``<state>`` is one of ``always``, ``madvise``, ``never`` or
>>> +``inherit``.
>>> +
>>> +For example, the following will set 64K THP to ``always``::
>>> +
>>> +	thp_anon=64K:always
>>> +
>>> +``thp_anon=`` may be specified multiple times to configure all THP sizes as
>>> +required. If ``thp_anon=`` is specified at least once, any anon THP sizes
>>> +not explicitly configured on the command line are implicitly set to
>>> +``never``.
>>>  
>>>  Hugepages in tmpfs/shmem
>>>  ========================
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index 4249c0bc9388..794d2790d90d 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -82,6 +82,7 @@ unsigned long huge_anon_orders_madvise __read_mostly;
>>>  unsigned long huge_anon_orders_inherit __read_mostly;
>>>  unsigned long huge_file_orders_always __read_mostly;
>>>  int huge_file_exec_order __read_mostly = -1;
>>> +static bool anon_orders_configured;
>>>  
>>>  unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
>>>  					 unsigned long vm_flags,
>>> @@ -763,7 +764,10 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
>>>  	 * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time
>>>  	 * constant so we have to do this here.
>>>  	 */
>>> -	huge_anon_orders_inherit = BIT(PMD_ORDER);
>>> +	if (!anon_orders_configured) {
>>> +		huge_anon_orders_inherit = BIT(PMD_ORDER);
>>
>> PMD_ORDER for 64k base PS systems would result in a 512M value, which exceeds
>> the xarray limit [1]. Therefore, I think we need to avoid PMD-size orders by
>> checking if PMD_ORDER > MAX_PAGECACHE_ORDER.
> 
> This is for anon memory, which isn't installed in the page cache so its
> independent of MAX_PAGECACHE_ORDER. I don't believe there is a problem here.
> 
>>
>> [1] https://lore.kernel.org/all/20240627003953.1262512-1-gshan@redhat.com/
>>
>>> +		anon_orders_configured = true;
>>> +	}
>>>  
>>>  	/*
>>>  	 * For pagecache, default to enabling all orders. powerpc's PMD_ORDER
>>> @@ -955,6 +959,55 @@ static int __init setup_transparent_hugepage(char *str)
>>>  }
>>>  __setup("transparent_hugepage=", setup_transparent_hugepage);
>>>  
>>> +static int __init setup_thp_anon(char *str)
>>> +{
>>> +	unsigned long size;
>>> +	char *state;
>>> +	int order;
>>> +	int ret = 0;
>>> +
>>> +	if (!str)
>>> +		goto out;
>>> +
>>> +	size = (unsigned long)memparse(str, &state);
>>> +	order = ilog2(size >> PAGE_SHIFT);
>>> +	if (*state != ':' || !is_power_of_2(size) || size <= PAGE_SIZE ||
>>> +	    !(BIT(order) & THP_ORDERS_ALL_ANON))
>>> +		goto out;
>>> +
>>> +	state++;
>>> +
>>> +	if (!strcmp(state, "always")) {
>>> +		clear_bit(order, &huge_anon_orders_inherit);
>>> +		clear_bit(order, &huge_anon_orders_madvise);
>>> +		set_bit(order, &huge_anon_orders_always);
>>> +		ret = 1;
>>> +	} else if (!strcmp(state, "inherit")) {
>>> +		clear_bit(order, &huge_anon_orders_always);
>>> +		clear_bit(order, &huge_anon_orders_madvise);
>>> +		set_bit(order, &huge_anon_orders_inherit);
>>> +		ret = 1;
>>> +	} else if (!strcmp(state, "madvise")) {
>>> +		clear_bit(order, &huge_anon_orders_always);
>>> +		clear_bit(order, &huge_anon_orders_inherit);
>>> +		set_bit(order, &huge_anon_orders_madvise);
>>> +		ret = 1;
>>> +	} else if (!strcmp(state, "never")) {
>>> +		clear_bit(order, &huge_anon_orders_always);
>>> +		clear_bit(order, &huge_anon_orders_inherit);
>>> +		clear_bit(order, &huge_anon_orders_madvise);
>>> +		ret = 1;
>>> +	}
>>> +
>>> +	if (ret)
>>> +		anon_orders_configured = true;
>>> +out:
>>> +	if (!ret)
>>> +		pr_warn("thp_anon=%s: cannot parse, ignored\n", str);
>>> +	return ret;
>>> +}
>>> +__setup("thp_anon=", setup_thp_anon);
>>> +
>>>  pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
>>>  {
>>>  	if (likely(vma->vm_flags & VM_WRITE))
>>> -- 
>>> 2.43.0
>
diff mbox series

Patch

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index bc55fb55cd26..48443ad12e3f 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6592,6 +6592,14 @@ 
 			<deci-seconds>: poll all this frequency
 			0: no polling (default)
 
+	thp_anon=	[KNL]
+			Format: <size>[KMG]:always|madvise|never|inherit
+			Can be used to control the default behavior of the
+			system with respect to anonymous transparent hugepages.
+			Can be used multiple times for multiple anon THP sizes.
+			See Documentation/admin-guide/mm/transhuge.rst for more
+			details.
+
 	threadirqs	[KNL,EARLY]
 			Force threading of all interrupt handlers except those
 			marked explicitly IRQF_NO_THREAD.
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 1aaf8e3a0b5a..f53d43d986e2 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -311,13 +311,27 @@  performance.
 Note that any changes to the allowed set of sizes only applies to future
 file-backed THP allocations.
 
-Boot parameter
-==============
+Boot parameters
+===============
 
-You can change the sysfs boot time defaults of Transparent Hugepage
-Support by passing the parameter ``transparent_hugepage=always`` or
-``transparent_hugepage=madvise`` or ``transparent_hugepage=never``
-to the kernel command line.
+You can change the sysfs boot time default for the top-level "enabled"
+control by passing the parameter ``transparent_hugepage=always`` or
+``transparent_hugepage=madvise`` or ``transparent_hugepage=never`` to the
+kernel command line.
+
+Alternatively, each supported anonymous THP size can be controlled by
+passing ``thp_anon=<size>[KMG]:<state>``, where ``<size>`` is the THP size
+and ``<state>`` is one of ``always``, ``madvise``, ``never`` or
+``inherit``.
+
+For example, the following will set 64K THP to ``always``::
+
+	thp_anon=64K:always
+
+``thp_anon=`` may be specified multiple times to configure all THP sizes as
+required. If ``thp_anon=`` is specified at least once, any anon THP sizes
+not explicitly configured on the command line are implicitly set to
+``never``.
 
 Hugepages in tmpfs/shmem
 ========================
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 4249c0bc9388..794d2790d90d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -82,6 +82,7 @@  unsigned long huge_anon_orders_madvise __read_mostly;
 unsigned long huge_anon_orders_inherit __read_mostly;
 unsigned long huge_file_orders_always __read_mostly;
 int huge_file_exec_order __read_mostly = -1;
+static bool anon_orders_configured;
 
 unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
 					 unsigned long vm_flags,
@@ -763,7 +764,10 @@  static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
 	 * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time
 	 * constant so we have to do this here.
 	 */
-	huge_anon_orders_inherit = BIT(PMD_ORDER);
+	if (!anon_orders_configured) {
+		huge_anon_orders_inherit = BIT(PMD_ORDER);
+		anon_orders_configured = true;
+	}
 
 	/*
 	 * For pagecache, default to enabling all orders. powerpc's PMD_ORDER
@@ -955,6 +959,55 @@  static int __init setup_transparent_hugepage(char *str)
 }
 __setup("transparent_hugepage=", setup_transparent_hugepage);
 
+static int __init setup_thp_anon(char *str)
+{
+	unsigned long size;
+	char *state;
+	int order;
+	int ret = 0;
+
+	if (!str)
+		goto out;
+
+	size = (unsigned long)memparse(str, &state);
+	order = ilog2(size >> PAGE_SHIFT);
+	if (*state != ':' || !is_power_of_2(size) || size <= PAGE_SIZE ||
+	    !(BIT(order) & THP_ORDERS_ALL_ANON))
+		goto out;
+
+	state++;
+
+	if (!strcmp(state, "always")) {
+		clear_bit(order, &huge_anon_orders_inherit);
+		clear_bit(order, &huge_anon_orders_madvise);
+		set_bit(order, &huge_anon_orders_always);
+		ret = 1;
+	} else if (!strcmp(state, "inherit")) {
+		clear_bit(order, &huge_anon_orders_always);
+		clear_bit(order, &huge_anon_orders_madvise);
+		set_bit(order, &huge_anon_orders_inherit);
+		ret = 1;
+	} else if (!strcmp(state, "madvise")) {
+		clear_bit(order, &huge_anon_orders_always);
+		clear_bit(order, &huge_anon_orders_inherit);
+		set_bit(order, &huge_anon_orders_madvise);
+		ret = 1;
+	} else if (!strcmp(state, "never")) {
+		clear_bit(order, &huge_anon_orders_always);
+		clear_bit(order, &huge_anon_orders_inherit);
+		clear_bit(order, &huge_anon_orders_madvise);
+		ret = 1;
+	}
+
+	if (ret)
+		anon_orders_configured = true;
+out:
+	if (!ret)
+		pr_warn("thp_anon=%s: cannot parse, ignored\n", str);
+	return ret;
+}
+__setup("thp_anon=", setup_thp_anon);
+
 pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 {
 	if (likely(vma->vm_flags & VM_WRITE))