diff mbox series

[v3] mm: add per-order mTHP alloc_success and alloc_fail counters

Message ID 20240403035502.71356-1-21cnbao@gmail.com (mailing list archive)
State New
Headers show
Series [v3] mm: add per-order mTHP alloc_success and alloc_fail counters | expand

Commit Message

Barry Song April 3, 2024, 3:55 a.m. UTC
From: Barry Song <v-songbaohua@oppo.com>

Profiling a system blindly with mTHP has become challenging due
to the lack of visibility into its operations. Presenting the
success rate of mTHP allocations appears to be pressing need.

Recently, I've been experiencing significant difficulty debugging
performance improvements and regressions without these figures.
It's crucial for us to understand the true effectiveness of
mTHP in real-world scenarios, especially in systems with
fragmented memory.

This patch sets up the framework for per-order mTHP counters,
starting with the introduction of anon_alloc_success and
anon_alloc_fail counters.  Incorporating additional counters
should now be straightforward as well.

Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
 -v3:
 * save some memory as order-0 and order-1 can't be THP, Ryan;
 * rename to anon_alloc as right now we only support anon to address
   David's comment;
 * drop a redundant "else", Ryan

 include/linux/huge_mm.h | 18 ++++++++++++++
 mm/huge_memory.c        | 54 +++++++++++++++++++++++++++++++++++++++++
 mm/memory.c             |  2 ++
 3 files changed, 74 insertions(+)

Comments

David Hildenbrand April 3, 2024, 8:22 a.m. UTC | #1
On 03.04.24 05:55, Barry Song wrote:
> From: Barry Song <v-songbaohua@oppo.com>
> 
> Profiling a system blindly with mTHP has become challenging due
> to the lack of visibility into its operations. Presenting the
> success rate of mTHP allocations appears to be pressing need.
> 
> Recently, I've been experiencing significant difficulty debugging
> performance improvements and regressions without these figures.
> It's crucial for us to understand the true effectiveness of
> mTHP in real-world scenarios, especially in systems with
> fragmented memory.
> 
> This patch sets up the framework for per-order mTHP counters,
> starting with the introduction of anon_alloc_success and
> anon_alloc_fail counters.  Incorporating additional counters
> should now be straightforward as well.
> 
> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> ---
>   -v3:
>   * save some memory as order-0 and order-1 can't be THP, Ryan;
>   * rename to anon_alloc as right now we only support anon to address
>     David's comment;
>   * drop a redundant "else", Ryan
> 
>   include/linux/huge_mm.h | 18 ++++++++++++++
>   mm/huge_memory.c        | 54 +++++++++++++++++++++++++++++++++++++++++
>   mm/memory.c             |  2 ++
>   3 files changed, 74 insertions(+)
> 
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index e896ca4760f6..5e9af6be9537 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -70,6 +70,7 @@ extern struct kobj_attribute shmem_enabled_attr;
>    * (which is a limitation of the THP implementation).
>    */
>   #define THP_ORDERS_ALL_ANON	((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
> +#define THP_MIN_ORDER		2
>   
>   /*
>    * Mask of all large folio orders supported for file THP.
> @@ -264,6 +265,23 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
>   					  enforce_sysfs, orders);
>   }
>   
> +enum thp_event_item {
> +	THP_ANON_ALLOC_SUCCESS,
> +	THP_ANON_ALLOC_FAIL,
> +	NR_THP_EVENT_ITEMS
> +};

Maybe use a prefix that resembles matches the enum name and is 
"obviously" different to the ones in vm_event_item.h, like

enum thp_event {
	THP_EVENT_ANON_ALLOC_SUCCESS,
	THP_EVENT_ANON_ALLOC_FAIL,
	__THP_EVENT_COUNT,
};
Ryan Roberts April 3, 2024, 11:48 a.m. UTC | #2
On 03/04/2024 09:22, David Hildenbrand wrote:
> On 03.04.24 05:55, Barry Song wrote:
>> From: Barry Song <v-songbaohua@oppo.com>
>>
>> Profiling a system blindly with mTHP has become challenging due
>> to the lack of visibility into its operations. Presenting the
>> success rate of mTHP allocations appears to be pressing need.
>>
>> Recently, I've been experiencing significant difficulty debugging
>> performance improvements and regressions without these figures.
>> It's crucial for us to understand the true effectiveness of
>> mTHP in real-world scenarios, especially in systems with
>> fragmented memory.
>>
>> This patch sets up the framework for per-order mTHP counters,
>> starting with the introduction of anon_alloc_success and
>> anon_alloc_fail counters.  Incorporating additional counters
>> should now be straightforward as well.
>>
>> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
>> ---
>>   -v3:
>>   * save some memory as order-0 and order-1 can't be THP, Ryan;
>>   * rename to anon_alloc as right now we only support anon to address
>>     David's comment;
>>   * drop a redundant "else", Ryan
>>
>>   include/linux/huge_mm.h | 18 ++++++++++++++
>>   mm/huge_memory.c        | 54 +++++++++++++++++++++++++++++++++++++++++
>>   mm/memory.c             |  2 ++
>>   3 files changed, 74 insertions(+)
>>
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index e896ca4760f6..5e9af6be9537 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -70,6 +70,7 @@ extern struct kobj_attribute shmem_enabled_attr;
>>    * (which is a limitation of the THP implementation).
>>    */
>>   #define THP_ORDERS_ALL_ANON    ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
>> +#define THP_MIN_ORDER        2
>>     /*
>>    * Mask of all large folio orders supported for file THP.
>> @@ -264,6 +265,23 @@ unsigned long thp_vma_allowable_orders(struct
>> vm_area_struct *vma,
>>                         enforce_sysfs, orders);
>>   }
>>   +enum thp_event_item {
>> +    THP_ANON_ALLOC_SUCCESS,
>> +    THP_ANON_ALLOC_FAIL,
>> +    NR_THP_EVENT_ITEMS
>> +};
> 
> Maybe use a prefix that resembles matches the enum name and is "obviously"
> different to the ones in vm_event_item.h, like
> 
> enum thp_event {
>     THP_EVENT_ANON_ALLOC_SUCCESS,
>     THP_EVENT_ANON_ALLOC_FAIL,
>     __THP_EVENT_COUNT,
> };

FWIW, I'd personally replace "event" with "stat". For me "event" only ever
increments, but "stat" can increment and decrement. An event is a type of stat.

You are only adding events for now, but we have identified a need for inc/dec
stats that will be added in future.
David Hildenbrand April 3, 2024, noon UTC | #3
On 03.04.24 13:48, Ryan Roberts wrote:
> On 03/04/2024 09:22, David Hildenbrand wrote:
>> On 03.04.24 05:55, Barry Song wrote:
>>> From: Barry Song <v-songbaohua@oppo.com>
>>>
>>> Profiling a system blindly with mTHP has become challenging due
>>> to the lack of visibility into its operations. Presenting the
>>> success rate of mTHP allocations appears to be pressing need.
>>>
>>> Recently, I've been experiencing significant difficulty debugging
>>> performance improvements and regressions without these figures.
>>> It's crucial for us to understand the true effectiveness of
>>> mTHP in real-world scenarios, especially in systems with
>>> fragmented memory.
>>>
>>> This patch sets up the framework for per-order mTHP counters,
>>> starting with the introduction of anon_alloc_success and
>>> anon_alloc_fail counters.  Incorporating additional counters
>>> should now be straightforward as well.
>>>
>>> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
>>> ---
>>>    -v3:
>>>    * save some memory as order-0 and order-1 can't be THP, Ryan;
>>>    * rename to anon_alloc as right now we only support anon to address
>>>      David's comment;
>>>    * drop a redundant "else", Ryan
>>>
>>>    include/linux/huge_mm.h | 18 ++++++++++++++
>>>    mm/huge_memory.c        | 54 +++++++++++++++++++++++++++++++++++++++++
>>>    mm/memory.c             |  2 ++
>>>    3 files changed, 74 insertions(+)
>>>
>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>> index e896ca4760f6..5e9af6be9537 100644
>>> --- a/include/linux/huge_mm.h
>>> +++ b/include/linux/huge_mm.h
>>> @@ -70,6 +70,7 @@ extern struct kobj_attribute shmem_enabled_attr;
>>>     * (which is a limitation of the THP implementation).
>>>     */
>>>    #define THP_ORDERS_ALL_ANON    ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
>>> +#define THP_MIN_ORDER        2
>>>      /*
>>>     * Mask of all large folio orders supported for file THP.
>>> @@ -264,6 +265,23 @@ unsigned long thp_vma_allowable_orders(struct
>>> vm_area_struct *vma,
>>>                          enforce_sysfs, orders);
>>>    }
>>>    +enum thp_event_item {
>>> +    THP_ANON_ALLOC_SUCCESS,
>>> +    THP_ANON_ALLOC_FAIL,
>>> +    NR_THP_EVENT_ITEMS
>>> +};
>>
>> Maybe use a prefix that resembles matches the enum name and is "obviously"
>> different to the ones in vm_event_item.h, like
>>
>> enum thp_event {
>>      THP_EVENT_ANON_ALLOC_SUCCESS,
>>      THP_EVENT_ANON_ALLOC_FAIL,
>>      __THP_EVENT_COUNT,
>> };
> 
> FWIW, I'd personally replace "event" with "stat". For me "event" only ever
> increments, but "stat" can increment and decrement. An event is a type of stat.
> 
> You are only adding events for now, but we have identified a need for inc/dec
> stats that will be added in future.
> 

Fully agree!
Barry Song April 3, 2024, 9 p.m. UTC | #4
On Thu, Apr 4, 2024 at 12:48 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> On 03/04/2024 09:22, David Hildenbrand wrote:
> > On 03.04.24 05:55, Barry Song wrote:
> >> From: Barry Song <v-songbaohua@oppo.com>
> >>
> >> Profiling a system blindly with mTHP has become challenging due
> >> to the lack of visibility into its operations. Presenting the
> >> success rate of mTHP allocations appears to be pressing need.
> >>
> >> Recently, I've been experiencing significant difficulty debugging
> >> performance improvements and regressions without these figures.
> >> It's crucial for us to understand the true effectiveness of
> >> mTHP in real-world scenarios, especially in systems with
> >> fragmented memory.
> >>
> >> This patch sets up the framework for per-order mTHP counters,
> >> starting with the introduction of anon_alloc_success and
> >> anon_alloc_fail counters.  Incorporating additional counters
> >> should now be straightforward as well.
> >>
> >> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> >> ---
> >>   -v3:
> >>   * save some memory as order-0 and order-1 can't be THP, Ryan;
> >>   * rename to anon_alloc as right now we only support anon to address
> >>     David's comment;
> >>   * drop a redundant "else", Ryan
> >>
> >>   include/linux/huge_mm.h | 18 ++++++++++++++
> >>   mm/huge_memory.c        | 54 +++++++++++++++++++++++++++++++++++++++++
> >>   mm/memory.c             |  2 ++
> >>   3 files changed, 74 insertions(+)
> >>
> >> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >> index e896ca4760f6..5e9af6be9537 100644
> >> --- a/include/linux/huge_mm.h
> >> +++ b/include/linux/huge_mm.h
> >> @@ -70,6 +70,7 @@ extern struct kobj_attribute shmem_enabled_attr;
> >>    * (which is a limitation of the THP implementation).
> >>    */
> >>   #define THP_ORDERS_ALL_ANON    ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
> >> +#define THP_MIN_ORDER        2
> >>     /*
> >>    * Mask of all large folio orders supported for file THP.
> >> @@ -264,6 +265,23 @@ unsigned long thp_vma_allowable_orders(struct
> >> vm_area_struct *vma,
> >>                         enforce_sysfs, orders);
> >>   }
> >>   +enum thp_event_item {
> >> +    THP_ANON_ALLOC_SUCCESS,
> >> +    THP_ANON_ALLOC_FAIL,
> >> +    NR_THP_EVENT_ITEMS
> >> +};
> >
> > Maybe use a prefix that resembles matches the enum name and is "obviously"
> > different to the ones in vm_event_item.h, like
> >
> > enum thp_event {
> >     THP_EVENT_ANON_ALLOC_SUCCESS,
> >     THP_EVENT_ANON_ALLOC_FAIL,
> >     __THP_EVENT_COUNT,
> > };
>
> FWIW, I'd personally replace "event" with "stat". For me "event" only ever
> increments, but "stat" can increment and decrement. An event is a type of stat.
>
> You are only adding events for now, but we have identified a need for inc/dec
> stats that will be added in future.

What about the below?

enum thp_stat {
   THP_EVENT_ANON_ALLOC,
   THP_EVENT_ANON_ALLOC_FALLBACK,
   THP_EVENT_SWPOUT,
   THP_EVENT_SWPOUT_FALLBACK,
   ...
   THP_NR_ANON_PAGES,
   THP_NR_FILE_PAGES,
   ...
   __THP_STAT_COUNT,
};

THP_EVENT can only increase; THP_NR_ANON/FILE_PAGES can increase or
decrease. Thus, their names have no "EVENT".

>
Ryan Roberts April 4, 2024, 7:21 a.m. UTC | #5
On 03/04/2024 22:00, Barry Song wrote:
> On Thu, Apr 4, 2024 at 12:48 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>
>> On 03/04/2024 09:22, David Hildenbrand wrote:
>>> On 03.04.24 05:55, Barry Song wrote:
>>>> From: Barry Song <v-songbaohua@oppo.com>
>>>>
>>>> Profiling a system blindly with mTHP has become challenging due
>>>> to the lack of visibility into its operations. Presenting the
>>>> success rate of mTHP allocations appears to be pressing need.
>>>>
>>>> Recently, I've been experiencing significant difficulty debugging
>>>> performance improvements and regressions without these figures.
>>>> It's crucial for us to understand the true effectiveness of
>>>> mTHP in real-world scenarios, especially in systems with
>>>> fragmented memory.
>>>>
>>>> This patch sets up the framework for per-order mTHP counters,
>>>> starting with the introduction of anon_alloc_success and
>>>> anon_alloc_fail counters.  Incorporating additional counters
>>>> should now be straightforward as well.
>>>>
>>>> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
>>>> ---
>>>>   -v3:
>>>>   * save some memory as order-0 and order-1 can't be THP, Ryan;
>>>>   * rename to anon_alloc as right now we only support anon to address
>>>>     David's comment;
>>>>   * drop a redundant "else", Ryan
>>>>
>>>>   include/linux/huge_mm.h | 18 ++++++++++++++
>>>>   mm/huge_memory.c        | 54 +++++++++++++++++++++++++++++++++++++++++
>>>>   mm/memory.c             |  2 ++
>>>>   3 files changed, 74 insertions(+)
>>>>
>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>> index e896ca4760f6..5e9af6be9537 100644
>>>> --- a/include/linux/huge_mm.h
>>>> +++ b/include/linux/huge_mm.h
>>>> @@ -70,6 +70,7 @@ extern struct kobj_attribute shmem_enabled_attr;
>>>>    * (which is a limitation of the THP implementation).
>>>>    */
>>>>   #define THP_ORDERS_ALL_ANON    ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
>>>> +#define THP_MIN_ORDER        2
>>>>     /*
>>>>    * Mask of all large folio orders supported for file THP.
>>>> @@ -264,6 +265,23 @@ unsigned long thp_vma_allowable_orders(struct
>>>> vm_area_struct *vma,
>>>>                         enforce_sysfs, orders);
>>>>   }
>>>>   +enum thp_event_item {
>>>> +    THP_ANON_ALLOC_SUCCESS,
>>>> +    THP_ANON_ALLOC_FAIL,
>>>> +    NR_THP_EVENT_ITEMS
>>>> +};
>>>
>>> Maybe use a prefix that resembles matches the enum name and is "obviously"
>>> different to the ones in vm_event_item.h, like
>>>
>>> enum thp_event {
>>>     THP_EVENT_ANON_ALLOC_SUCCESS,
>>>     THP_EVENT_ANON_ALLOC_FAIL,
>>>     __THP_EVENT_COUNT,
>>> };
>>
>> FWIW, I'd personally replace "event" with "stat". For me "event" only ever
>> increments, but "stat" can increment and decrement. An event is a type of stat.
>>
>> You are only adding events for now, but we have identified a need for inc/dec
>> stats that will be added in future.
> 
> What about the below?
> 
> enum thp_stat {
>    THP_EVENT_ANON_ALLOC,
>    THP_EVENT_ANON_ALLOC_FALLBACK,
>    THP_EVENT_SWPOUT,
>    THP_EVENT_SWPOUT_FALLBACK,
>    ...
>    THP_NR_ANON_PAGES,
>    THP_NR_FILE_PAGES,

I find this ambiguous; Is it the number of THPs or the number of base pages?

I think David made the point about incorporating the enum name into the labels
too, so that there can be no namespace confusion. How about:

<enum>_<type>_<name>

So:

THP_STAT_EV_ANON_ALLOC
THP_STAT_EV_ANON_ALLOC_FALLBACK
THP_STAT_EV_ANON_PARTIAL
THP_STAT_EV_SWPOUT
THP_STAT_EV_SWPOUT_FALLBACK
...
THP_STAT_NR_ANON
THP_STAT_NR_FILE
...
__THP_STAT_COUNT

I'm not sure if we need to prefix SWPOUT with ANON? (might we additionally want
SHMEM in future?

I'm aware I'm bikeshedding at this point... sorry :)


>    ...
>    __THP_STAT_COUNT,
> };
> 
> THP_EVENT can only increase; THP_NR_ANON/FILE_PAGES can increase or
> decrease. Thus, their names have no "EVENT".
> 
>>
Barry Song April 4, 2024, 10:52 a.m. UTC | #6
On Thu, Apr 4, 2024 at 8:21 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> On 03/04/2024 22:00, Barry Song wrote:
> > On Thu, Apr 4, 2024 at 12:48 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>
> >> On 03/04/2024 09:22, David Hildenbrand wrote:
> >>> On 03.04.24 05:55, Barry Song wrote:
> >>>> From: Barry Song <v-songbaohua@oppo.com>
> >>>>
> >>>> Profiling a system blindly with mTHP has become challenging due
> >>>> to the lack of visibility into its operations. Presenting the
> >>>> success rate of mTHP allocations appears to be pressing need.
> >>>>
> >>>> Recently, I've been experiencing significant difficulty debugging
> >>>> performance improvements and regressions without these figures.
> >>>> It's crucial for us to understand the true effectiveness of
> >>>> mTHP in real-world scenarios, especially in systems with
> >>>> fragmented memory.
> >>>>
> >>>> This patch sets up the framework for per-order mTHP counters,
> >>>> starting with the introduction of anon_alloc_success and
> >>>> anon_alloc_fail counters.  Incorporating additional counters
> >>>> should now be straightforward as well.
> >>>>
> >>>> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> >>>> ---
> >>>>   -v3:
> >>>>   * save some memory as order-0 and order-1 can't be THP, Ryan;
> >>>>   * rename to anon_alloc as right now we only support anon to address
> >>>>     David's comment;
> >>>>   * drop a redundant "else", Ryan
> >>>>
> >>>>   include/linux/huge_mm.h | 18 ++++++++++++++
> >>>>   mm/huge_memory.c        | 54 +++++++++++++++++++++++++++++++++++++++++
> >>>>   mm/memory.c             |  2 ++
> >>>>   3 files changed, 74 insertions(+)
> >>>>
> >>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >>>> index e896ca4760f6..5e9af6be9537 100644
> >>>> --- a/include/linux/huge_mm.h
> >>>> +++ b/include/linux/huge_mm.h
> >>>> @@ -70,6 +70,7 @@ extern struct kobj_attribute shmem_enabled_attr;
> >>>>    * (which is a limitation of the THP implementation).
> >>>>    */
> >>>>   #define THP_ORDERS_ALL_ANON    ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
> >>>> +#define THP_MIN_ORDER        2
> >>>>     /*
> >>>>    * Mask of all large folio orders supported for file THP.
> >>>> @@ -264,6 +265,23 @@ unsigned long thp_vma_allowable_orders(struct
> >>>> vm_area_struct *vma,
> >>>>                         enforce_sysfs, orders);
> >>>>   }
> >>>>   +enum thp_event_item {
> >>>> +    THP_ANON_ALLOC_SUCCESS,
> >>>> +    THP_ANON_ALLOC_FAIL,
> >>>> +    NR_THP_EVENT_ITEMS
> >>>> +};
> >>>
> >>> Maybe use a prefix that resembles matches the enum name and is "obviously"
> >>> different to the ones in vm_event_item.h, like
> >>>
> >>> enum thp_event {
> >>>     THP_EVENT_ANON_ALLOC_SUCCESS,
> >>>     THP_EVENT_ANON_ALLOC_FAIL,
> >>>     __THP_EVENT_COUNT,
> >>> };
> >>
> >> FWIW, I'd personally replace "event" with "stat". For me "event" only ever
> >> increments, but "stat" can increment and decrement. An event is a type of stat.
> >>
> >> You are only adding events for now, but we have identified a need for inc/dec
> >> stats that will be added in future.
> >
> > What about the below?
> >
> > enum thp_stat {
> >    THP_EVENT_ANON_ALLOC,
> >    THP_EVENT_ANON_ALLOC_FALLBACK,
> >    THP_EVENT_SWPOUT,
> >    THP_EVENT_SWPOUT_FALLBACK,
> >    ...
> >    THP_NR_ANON_PAGES,
> >    THP_NR_FILE_PAGES,
>
> I find this ambiguous; Is it the number of THPs or the number of base pages?

Indeed. I've noticed that the kernel seems rather unpredictable in terms of both
the number of THPs and the number of base pages.

for example:

__node_stat_mod_folio(folio, NR_FILE_PAGES, nr);
__node_stat_mod_folio(folio, NR_FILE_PAGES, -nr);

add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);

__lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr);
 __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr);

The previous statements pertain to base pages encountered when
dealing with a large folio.;

but the below means THP number:
__lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr_pmdmapped);

My feeling is that when assessing the true memory size, base pages are
the primary
consideration.
However, I haven't addressed this aspect yet, and we can defer this
discussion until
I delve into it further upon arrival :-)

Currently, I'm exclusively employing it as a demonstration to
differentiate between
"event" and "stat", aiming to provide a glimpse into the future perspective.

>
> I think David made the point about incorporating the enum name into the labels
> too, so that there can be no namespace confusion. How about:
>
> <enum>_<type>_<name>
>
> So:
>
> THP_STAT_EV_ANON_ALLOC
> THP_STAT_EV_ANON_ALLOC_FALLBACK
> THP_STAT_EV_ANON_PARTIAL
> THP_STAT_EV_SWPOUT
> THP_STAT_EV_SWPOUT_FALLBACK
> ...
> THP_STAT_NR_ANON
> THP_STAT_NR_FILE
> ...
> __THP_STAT_COUNT
>
> I'm not sure if we need to prefix SWPOUT with ANON? (might we additionally want
> SHMEM in future?
>
> I'm aware I'm bikeshedding at this point... sorry :)

No problem at all. I'm open to discussing it further so that I can better
prepare for version 4.

>
>
> >    ...
> >    __THP_STAT_COUNT,
> > };
> >
> > THP_EVENT can only increase; THP_NR_ANON/FILE_PAGES can increase or
> > decrease. Thus, their names have no "EVENT".
> >

Thanks
Barry
Barry Song April 5, 2024, 2:57 a.m. UTC | #7
On Fri, Apr 5, 2024 at 4:31 AM David Hildenbrand <david@redhat.com> wrote:
>
> On 04.04.24 09:21, Ryan Roberts wrote:
> > On 03/04/2024 22:00, Barry Song wrote:
> >> On Thu, Apr 4, 2024 at 12:48 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>>
> >>> On 03/04/2024 09:22, David Hildenbrand wrote:
> >>>> On 03.04.24 05:55, Barry Song wrote:
> >>>>> From: Barry Song <v-songbaohua@oppo.com>
> >>>>>
> >>>>> Profiling a system blindly with mTHP has become challenging due
> >>>>> to the lack of visibility into its operations. Presenting the
> >>>>> success rate of mTHP allocations appears to be pressing need.
> >>>>>
> >>>>> Recently, I've been experiencing significant difficulty debugging
> >>>>> performance improvements and regressions without these figures.
> >>>>> It's crucial for us to understand the true effectiveness of
> >>>>> mTHP in real-world scenarios, especially in systems with
> >>>>> fragmented memory.
> >>>>>
> >>>>> This patch sets up the framework for per-order mTHP counters,
> >>>>> starting with the introduction of anon_alloc_success and
> >>>>> anon_alloc_fail counters.  Incorporating additional counters
> >>>>> should now be straightforward as well.
> >>>>>
> >>>>> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> >>>>> ---
> >>>>>    -v3:
> >>>>>    * save some memory as order-0 and order-1 can't be THP, Ryan;
> >>>>>    * rename to anon_alloc as right now we only support anon to address
> >>>>>      David's comment;
> >>>>>    * drop a redundant "else", Ryan
> >>>>>
> >>>>>    include/linux/huge_mm.h | 18 ++++++++++++++
> >>>>>    mm/huge_memory.c        | 54 +++++++++++++++++++++++++++++++++++++++++
> >>>>>    mm/memory.c             |  2 ++
> >>>>>    3 files changed, 74 insertions(+)
> >>>>>
> >>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >>>>> index e896ca4760f6..5e9af6be9537 100644
> >>>>> --- a/include/linux/huge_mm.h
> >>>>> +++ b/include/linux/huge_mm.h
> >>>>> @@ -70,6 +70,7 @@ extern struct kobj_attribute shmem_enabled_attr;
> >>>>>     * (which is a limitation of the THP implementation).
> >>>>>     */
> >>>>>    #define THP_ORDERS_ALL_ANON    ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
> >>>>> +#define THP_MIN_ORDER        2
> >>>>>      /*
> >>>>>     * Mask of all large folio orders supported for file THP.
> >>>>> @@ -264,6 +265,23 @@ unsigned long thp_vma_allowable_orders(struct
> >>>>> vm_area_struct *vma,
> >>>>>                          enforce_sysfs, orders);
> >>>>>    }
> >>>>>    +enum thp_event_item {
> >>>>> +    THP_ANON_ALLOC_SUCCESS,
> >>>>> +    THP_ANON_ALLOC_FAIL,
> >>>>> +    NR_THP_EVENT_ITEMS
> >>>>> +};
> >>>>
> >>>> Maybe use a prefix that resembles matches the enum name and is "obviously"
> >>>> different to the ones in vm_event_item.h, like
> >>>>
> >>>> enum thp_event {
> >>>>      THP_EVENT_ANON_ALLOC_SUCCESS,
> >>>>      THP_EVENT_ANON_ALLOC_FAIL,
> >>>>      __THP_EVENT_COUNT,
> >>>> };
> >>>
> >>> FWIW, I'd personally replace "event" with "stat". For me "event" only ever
> >>> increments, but "stat" can increment and decrement. An event is a type of stat.
> >>>
> >>> You are only adding events for now, but we have identified a need for inc/dec
> >>> stats that will be added in future.
> >>
> >> What about the below?
> >>
> >> enum thp_stat {

It seems we still need to use enum thp_stat_item rather than thp_stat.
This follows
enum zone_stat_item
enum numa_stat_item
enum node_stat_item

And most importantly, the below looks much better

       enum thp_stat_item {
              THP_STAT_ANON_ALLOC,
              THP_STAT_ANON_ALLOC_FALLBACK,
              __THP_STAT_COUNT
       };

       struct thp_state {
              unsigned long state[PMD_ORDER + 1][__THP_STAT_COUNT];
       };

       DECLARE_PER_CPU(struct thp_state, thp_states);

than

       enum thp_stat {
              THP_STAT_ANON_ALLOC,
              THP_STAT_ANON_ALLOC_FALLBACK,
              __THP_STAT_COUNT
       };

       struct thp_state {
              unsigned long state[PMD_ORDER + 1][__THP_STAT_COUNT];
       };

> >>     THP_EVENT_ANON_ALLOC,
> >>     THP_EVENT_ANON_ALLOC_FALLBACK,
> >>     THP_EVENT_SWPOUT,
> >>     THP_EVENT_SWPOUT_FALLBACK,
> >>     ...
> >>     THP_NR_ANON_PAGES,
> >>     THP_NR_FILE_PAGES,
> >
> > I find this ambiguous; Is it the number of THPs or the number of base pages?
> >
> > I think David made the point about incorporating the enum name into the labels
> > too, so that there can be no namespace confusion. How about:
> >
> > <enum>_<type>_<name>
> >
> > So:
> >
> > THP_STAT_EV_ANON_ALLOC
> > THP_STAT_EV_ANON_ALLOC_FALLBACK
> > THP_STAT_EV_ANON_PARTIAL
> > THP_STAT_EV_SWPOUT
> > THP_STAT_EV_SWPOUT_FALLBACK
> > ...
> > THP_STAT_NR_ANON
> > THP_STAT_NR_FILE
> > ...
> > __THP_STAT_COUNT
>
> I'd even drop the "EV". "NR_ANON" vs "ANON_ALLOC" etc. is expressive enough.

ok.

>
> --
> Cheers,
>
> David / dhildenb

Thanks
Barry
Barry Song April 5, 2024, 4:01 a.m. UTC | #8
On Fri, Apr 5, 2024 at 3:57 PM Barry Song <21cnbao@gmail.com> wrote:
>
> On Fri, Apr 5, 2024 at 4:31 AM David Hildenbrand <david@redhat.com> wrote:
> >
> > On 04.04.24 09:21, Ryan Roberts wrote:
> > > On 03/04/2024 22:00, Barry Song wrote:
> > >> On Thu, Apr 4, 2024 at 12:48 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
> > >>>
> > >>> On 03/04/2024 09:22, David Hildenbrand wrote:
> > >>>> On 03.04.24 05:55, Barry Song wrote:
> > >>>>> From: Barry Song <v-songbaohua@oppo.com>
> > >>>>>
> > >>>>> Profiling a system blindly with mTHP has become challenging due
> > >>>>> to the lack of visibility into its operations. Presenting the
> > >>>>> success rate of mTHP allocations appears to be pressing need.
> > >>>>>
> > >>>>> Recently, I've been experiencing significant difficulty debugging
> > >>>>> performance improvements and regressions without these figures.
> > >>>>> It's crucial for us to understand the true effectiveness of
> > >>>>> mTHP in real-world scenarios, especially in systems with
> > >>>>> fragmented memory.
> > >>>>>
> > >>>>> This patch sets up the framework for per-order mTHP counters,
> > >>>>> starting with the introduction of anon_alloc_success and
> > >>>>> anon_alloc_fail counters.  Incorporating additional counters
> > >>>>> should now be straightforward as well.
> > >>>>>
> > >>>>> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> > >>>>> ---
> > >>>>>    -v3:
> > >>>>>    * save some memory as order-0 and order-1 can't be THP, Ryan;
> > >>>>>    * rename to anon_alloc as right now we only support anon to address
> > >>>>>      David's comment;
> > >>>>>    * drop a redundant "else", Ryan
> > >>>>>
> > >>>>>    include/linux/huge_mm.h | 18 ++++++++++++++
> > >>>>>    mm/huge_memory.c        | 54 +++++++++++++++++++++++++++++++++++++++++
> > >>>>>    mm/memory.c             |  2 ++
> > >>>>>    3 files changed, 74 insertions(+)
> > >>>>>
> > >>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > >>>>> index e896ca4760f6..5e9af6be9537 100644
> > >>>>> --- a/include/linux/huge_mm.h
> > >>>>> +++ b/include/linux/huge_mm.h
> > >>>>> @@ -70,6 +70,7 @@ extern struct kobj_attribute shmem_enabled_attr;
> > >>>>>     * (which is a limitation of the THP implementation).
> > >>>>>     */
> > >>>>>    #define THP_ORDERS_ALL_ANON    ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
> > >>>>> +#define THP_MIN_ORDER        2
> > >>>>>      /*
> > >>>>>     * Mask of all large folio orders supported for file THP.
> > >>>>> @@ -264,6 +265,23 @@ unsigned long thp_vma_allowable_orders(struct
> > >>>>> vm_area_struct *vma,
> > >>>>>                          enforce_sysfs, orders);
> > >>>>>    }
> > >>>>>    +enum thp_event_item {
> > >>>>> +    THP_ANON_ALLOC_SUCCESS,
> > >>>>> +    THP_ANON_ALLOC_FAIL,
> > >>>>> +    NR_THP_EVENT_ITEMS
> > >>>>> +};
> > >>>>
> > >>>> Maybe use a prefix that resembles matches the enum name and is "obviously"
> > >>>> different to the ones in vm_event_item.h, like
> > >>>>
> > >>>> enum thp_event {
> > >>>>      THP_EVENT_ANON_ALLOC_SUCCESS,
> > >>>>      THP_EVENT_ANON_ALLOC_FAIL,
> > >>>>      __THP_EVENT_COUNT,
> > >>>> };
> > >>>
> > >>> FWIW, I'd personally replace "event" with "stat". For me "event" only ever
> > >>> increments, but "stat" can increment and decrement. An event is a type of stat.
> > >>>
> > >>> You are only adding events for now, but we have identified a need for inc/dec
> > >>> stats that will be added in future.
> > >>
> > >> What about the below?
> > >>
> > >> enum thp_stat {
>
> It seems we still need to use enum thp_stat_item rather than thp_stat.
> This follows
> enum zone_stat_item
> enum numa_stat_item
> enum node_stat_item
>
> And most importantly, the below looks much better
>
>        enum thp_stat_item {
>               THP_STAT_ANON_ALLOC,
>               THP_STAT_ANON_ALLOC_FALLBACK,
>               __THP_STAT_COUNT
>        };
>
>        struct thp_state {
>               unsigned long state[PMD_ORDER + 1][__THP_STAT_COUNT];
>        };
>
>        DECLARE_PER_CPU(struct thp_state, thp_states);
>
> than
>
>        enum thp_stat {
>               THP_STAT_ANON_ALLOC,
>               THP_STAT_ANON_ALLOC_FALLBACK,
>               __THP_STAT_COUNT
>        };
>
>        struct thp_state {
>               unsigned long state[PMD_ORDER + 1][__THP_STAT_COUNT];
>        };
>
> > >>     THP_EVENT_ANON_ALLOC,
> > >>     THP_EVENT_ANON_ALLOC_FALLBACK,
> > >>     THP_EVENT_SWPOUT,
> > >>     THP_EVENT_SWPOUT_FALLBACK,
> > >>     ...
> > >>     THP_NR_ANON_PAGES,
> > >>     THP_NR_FILE_PAGES,
> > >
> > > I find this ambiguous; Is it the number of THPs or the number of base pages?
> > >
> > > I think David made the point about incorporating the enum name into the labels
> > > too, so that there can be no namespace confusion. How about:
> > >
> > > <enum>_<type>_<name>
> > >
> > > So:
> > >
> > > THP_STAT_EV_ANON_ALLOC
> > > THP_STAT_EV_ANON_ALLOC_FALLBACK
> > > THP_STAT_EV_ANON_PARTIAL
> > > THP_STAT_EV_SWPOUT
> > > THP_STAT_EV_SWPOUT_FALLBACK
> > > ...
> > > THP_STAT_NR_ANON
> > > THP_STAT_NR_FILE
> > > ...
> > > __THP_STAT_COUNT
> >
> > I'd even drop the "EV". "NR_ANON" vs "ANON_ALLOC" etc. is expressive enough.
>
> ok.

Hi David, Ryan,

I've named everything as follows. Please let me know if you have any further
suggestions before I send the updated version :-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e896ca4760f6..cc13fa14aa32 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -264,6 +264,23 @@ unsigned long thp_vma_allowable_orders(struct
vm_area_struct *vma,
                                          enforce_sysfs, orders);
 }

+enum thp_stat_item {
+       THP_STAT_ANON_ALLOC,
+       THP_STAT_ANON_ALLOC_FALLBACK,
+       __THP_STAT_COUNT
+};
+
+struct thp_state {
+       unsigned long state[PMD_ORDER + 1][__THP_STAT_COUNT];
+};
+
+DECLARE_PER_CPU(struct thp_state, thp_states);
+
+static inline void count_thp_state(int order, enum thp_stat_item item)
+{
+       this_cpu_inc(thp_states.state[order][item]);
+}
+
 #define transparent_hugepage_use_zero_page()                           \
        (transparent_hugepage_flags &                                   \
         (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9d4b2fbf6872..e704b4408181 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -526,6 +526,46 @@ static const struct kobj_type thpsize_ktype = {
        .sysfs_ops = &kobj_sysfs_ops,
 };

+DEFINE_PER_CPU(struct thp_state, thp_states) = {{{0}}};
+
+static unsigned long sum_thp_states(int order, enum thp_stat_item item)
+{
+       unsigned long sum = 0;
+       int cpu;
+
+       for_each_online_cpu(cpu) {
+               struct thp_state *this = &per_cpu(thp_states, cpu);
+
+               sum += this->state[order][item];
+       }
+
+       return sum;
+}
+
+#define THP_STATE_ATTR(_name, _index)                                  \
+static ssize_t _name##_show(struct kobject *kobj,                      \
+                       struct kobj_attribute *attr, char *buf)         \
+{                                                                      \
+       int order = to_thpsize(kobj)->order;                            \
+                                                                       \
+       return sysfs_emit(buf, "%lu\n", sum_thp_states(order, _index)); \
+}                                                                      \
+static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+THP_STATE_ATTR(anon_alloc, THP_STAT_ANON_ALLOC);
+THP_STATE_ATTR(anon_alloc_fallback, THP_STAT_ANON_ALLOC_FALLBACK);

>
> >
> > --
> > Cheers,
> >
> > David / dhildenb

Thanks
Barry
David Hildenbrand April 5, 2024, 6:29 a.m. UTC | #9
On 05.04.24 06:01, Barry Song wrote:
> On Fri, Apr 5, 2024 at 3:57 PM Barry Song <21cnbao@gmail.com> wrote:
>>
>> On Fri, Apr 5, 2024 at 4:31 AM David Hildenbrand <david@redhat.com> wrote:
>>>
>>> On 04.04.24 09:21, Ryan Roberts wrote:
>>>> On 03/04/2024 22:00, Barry Song wrote:
>>>>> On Thu, Apr 4, 2024 at 12:48 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>>>>
>>>>>> On 03/04/2024 09:22, David Hildenbrand wrote:
>>>>>>> On 03.04.24 05:55, Barry Song wrote:
>>>>>>>> From: Barry Song <v-songbaohua@oppo.com>
>>>>>>>>
>>>>>>>> Profiling a system blindly with mTHP has become challenging due
>>>>>>>> to the lack of visibility into its operations. Presenting the
>>>>>>>> success rate of mTHP allocations appears to be pressing need.
>>>>>>>>
>>>>>>>> Recently, I've been experiencing significant difficulty debugging
>>>>>>>> performance improvements and regressions without these figures.
>>>>>>>> It's crucial for us to understand the true effectiveness of
>>>>>>>> mTHP in real-world scenarios, especially in systems with
>>>>>>>> fragmented memory.
>>>>>>>>
>>>>>>>> This patch sets up the framework for per-order mTHP counters,
>>>>>>>> starting with the introduction of anon_alloc_success and
>>>>>>>> anon_alloc_fail counters.  Incorporating additional counters
>>>>>>>> should now be straightforward as well.
>>>>>>>>
>>>>>>>> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
>>>>>>>> ---
>>>>>>>>     -v3:
>>>>>>>>     * save some memory as order-0 and order-1 can't be THP, Ryan;
>>>>>>>>     * rename to anon_alloc as right now we only support anon to address
>>>>>>>>       David's comment;
>>>>>>>>     * drop a redundant "else", Ryan
>>>>>>>>
>>>>>>>>     include/linux/huge_mm.h | 18 ++++++++++++++
>>>>>>>>     mm/huge_memory.c        | 54 +++++++++++++++++++++++++++++++++++++++++
>>>>>>>>     mm/memory.c             |  2 ++
>>>>>>>>     3 files changed, 74 insertions(+)
>>>>>>>>
>>>>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>>>>>> index e896ca4760f6..5e9af6be9537 100644
>>>>>>>> --- a/include/linux/huge_mm.h
>>>>>>>> +++ b/include/linux/huge_mm.h
>>>>>>>> @@ -70,6 +70,7 @@ extern struct kobj_attribute shmem_enabled_attr;
>>>>>>>>      * (which is a limitation of the THP implementation).
>>>>>>>>      */
>>>>>>>>     #define THP_ORDERS_ALL_ANON    ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
>>>>>>>> +#define THP_MIN_ORDER        2
>>>>>>>>       /*
>>>>>>>>      * Mask of all large folio orders supported for file THP.
>>>>>>>> @@ -264,6 +265,23 @@ unsigned long thp_vma_allowable_orders(struct
>>>>>>>> vm_area_struct *vma,
>>>>>>>>                           enforce_sysfs, orders);
>>>>>>>>     }
>>>>>>>>     +enum thp_event_item {
>>>>>>>> +    THP_ANON_ALLOC_SUCCESS,
>>>>>>>> +    THP_ANON_ALLOC_FAIL,
>>>>>>>> +    NR_THP_EVENT_ITEMS
>>>>>>>> +};
>>>>>>>
>>>>>>> Maybe use a prefix that resembles matches the enum name and is "obviously"
>>>>>>> different to the ones in vm_event_item.h, like
>>>>>>>
>>>>>>> enum thp_event {
>>>>>>>       THP_EVENT_ANON_ALLOC_SUCCESS,
>>>>>>>       THP_EVENT_ANON_ALLOC_FAIL,
>>>>>>>       __THP_EVENT_COUNT,
>>>>>>> };
>>>>>>
>>>>>> FWIW, I'd personally replace "event" with "stat". For me "event" only ever
>>>>>> increments, but "stat" can increment and decrement. An event is a type of stat.
>>>>>>
>>>>>> You are only adding events for now, but we have identified a need for inc/dec
>>>>>> stats that will be added in future.
>>>>>
>>>>> What about the below?
>>>>>
>>>>> enum thp_stat {
>>
>> It seems we still need to use enum thp_stat_item rather than thp_stat.
>> This follows
>> enum zone_stat_item
>> enum numa_stat_item
>> enum node_stat_item


Right, we can do that.

[...]

>> ok.
> 
> Hi David, Ryan,
> 
> I've named everything as follows. Please let me know if you have any further
> suggestions before I send the updated version :-)
> 
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index e896ca4760f6..cc13fa14aa32 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -264,6 +264,23 @@ unsigned long thp_vma_allowable_orders(struct
> vm_area_struct *vma,
>                                            enforce_sysfs, orders);
>   }
> 
> +enum thp_stat_item {
> +       THP_STAT_ANON_ALLOC,
> +       THP_STAT_ANON_ALLOC_FALLBACK,
> +       __THP_STAT_COUNT
> +};
> +


Final bikeshedding: so we want to call all of that MTHP_, mthp_ etc, to 
make it clearer that this is per mTHP size?
Ryan Roberts April 5, 2024, 7:18 a.m. UTC | #10
On 05/04/2024 05:01, Barry Song wrote:
> On Fri, Apr 5, 2024 at 3:57 PM Barry Song <21cnbao@gmail.com> wrote:
>>
>> On Fri, Apr 5, 2024 at 4:31 AM David Hildenbrand <david@redhat.com> wrote:
>>>
>>> On 04.04.24 09:21, Ryan Roberts wrote:
>>>> On 03/04/2024 22:00, Barry Song wrote:
>>>>> On Thu, Apr 4, 2024 at 12:48 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>>>>
>>>>>> On 03/04/2024 09:22, David Hildenbrand wrote:
>>>>>>> On 03.04.24 05:55, Barry Song wrote:
>>>>>>>> From: Barry Song <v-songbaohua@oppo.com>
>>>>>>>>
>>>>>>>> Profiling a system blindly with mTHP has become challenging due
>>>>>>>> to the lack of visibility into its operations. Presenting the
>>>>>>>> success rate of mTHP allocations appears to be pressing need.
>>>>>>>>
>>>>>>>> Recently, I've been experiencing significant difficulty debugging
>>>>>>>> performance improvements and regressions without these figures.
>>>>>>>> It's crucial for us to understand the true effectiveness of
>>>>>>>> mTHP in real-world scenarios, especially in systems with
>>>>>>>> fragmented memory.
>>>>>>>>
>>>>>>>> This patch sets up the framework for per-order mTHP counters,
>>>>>>>> starting with the introduction of anon_alloc_success and
>>>>>>>> anon_alloc_fail counters.  Incorporating additional counters
>>>>>>>> should now be straightforward as well.
>>>>>>>>
>>>>>>>> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
>>>>>>>> ---
>>>>>>>>    -v3:
>>>>>>>>    * save some memory as order-0 and order-1 can't be THP, Ryan;
>>>>>>>>    * rename to anon_alloc as right now we only support anon to address
>>>>>>>>      David's comment;
>>>>>>>>    * drop a redundant "else", Ryan
>>>>>>>>
>>>>>>>>    include/linux/huge_mm.h | 18 ++++++++++++++
>>>>>>>>    mm/huge_memory.c        | 54 +++++++++++++++++++++++++++++++++++++++++
>>>>>>>>    mm/memory.c             |  2 ++
>>>>>>>>    3 files changed, 74 insertions(+)
>>>>>>>>
>>>>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>>>>>> index e896ca4760f6..5e9af6be9537 100644
>>>>>>>> --- a/include/linux/huge_mm.h
>>>>>>>> +++ b/include/linux/huge_mm.h
>>>>>>>> @@ -70,6 +70,7 @@ extern struct kobj_attribute shmem_enabled_attr;
>>>>>>>>     * (which is a limitation of the THP implementation).
>>>>>>>>     */
>>>>>>>>    #define THP_ORDERS_ALL_ANON    ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
>>>>>>>> +#define THP_MIN_ORDER        2
>>>>>>>>      /*
>>>>>>>>     * Mask of all large folio orders supported for file THP.
>>>>>>>> @@ -264,6 +265,23 @@ unsigned long thp_vma_allowable_orders(struct
>>>>>>>> vm_area_struct *vma,
>>>>>>>>                          enforce_sysfs, orders);
>>>>>>>>    }
>>>>>>>>    +enum thp_event_item {
>>>>>>>> +    THP_ANON_ALLOC_SUCCESS,
>>>>>>>> +    THP_ANON_ALLOC_FAIL,
>>>>>>>> +    NR_THP_EVENT_ITEMS
>>>>>>>> +};
>>>>>>>
>>>>>>> Maybe use a prefix that resembles matches the enum name and is "obviously"
>>>>>>> different to the ones in vm_event_item.h, like
>>>>>>>
>>>>>>> enum thp_event {
>>>>>>>      THP_EVENT_ANON_ALLOC_SUCCESS,
>>>>>>>      THP_EVENT_ANON_ALLOC_FAIL,
>>>>>>>      __THP_EVENT_COUNT,
>>>>>>> };
>>>>>>
>>>>>> FWIW, I'd personally replace "event" with "stat". For me "event" only ever
>>>>>> increments, but "stat" can increment and decrement. An event is a type of stat.
>>>>>>
>>>>>> You are only adding events for now, but we have identified a need for inc/dec
>>>>>> stats that will be added in future.
>>>>>
>>>>> What about the below?
>>>>>
>>>>> enum thp_stat {
>>
>> It seems we still need to use enum thp_stat_item rather than thp_stat.
>> This follows
>> enum zone_stat_item
>> enum numa_stat_item
>> enum node_stat_item
>>
>> And most importantly, the below looks much better
>>
>>        enum thp_stat_item {
>>               THP_STAT_ANON_ALLOC,
>>               THP_STAT_ANON_ALLOC_FALLBACK,
>>               __THP_STAT_COUNT
>>        };
>>
>>        struct thp_state {
>>               unsigned long state[PMD_ORDER + 1][__THP_STAT_COUNT];
>>        };
>>
>>        DECLARE_PER_CPU(struct thp_state, thp_states);
>>
>> than
>>
>>        enum thp_stat {
>>               THP_STAT_ANON_ALLOC,
>>               THP_STAT_ANON_ALLOC_FALLBACK,
>>               __THP_STAT_COUNT
>>        };
>>
>>        struct thp_state {
>>               unsigned long state[PMD_ORDER + 1][__THP_STAT_COUNT];
>>        };
>>
>>>>>     THP_EVENT_ANON_ALLOC,
>>>>>     THP_EVENT_ANON_ALLOC_FALLBACK,
>>>>>     THP_EVENT_SWPOUT,
>>>>>     THP_EVENT_SWPOUT_FALLBACK,
>>>>>     ...
>>>>>     THP_NR_ANON_PAGES,
>>>>>     THP_NR_FILE_PAGES,
>>>>
>>>> I find this ambiguous; Is it the number of THPs or the number of base pages?
>>>>
>>>> I think David made the point about incorporating the enum name into the labels
>>>> too, so that there can be no namespace confusion. How about:
>>>>
>>>> <enum>_<type>_<name>
>>>>
>>>> So:
>>>>
>>>> THP_STAT_EV_ANON_ALLOC
>>>> THP_STAT_EV_ANON_ALLOC_FALLBACK
>>>> THP_STAT_EV_ANON_PARTIAL
>>>> THP_STAT_EV_SWPOUT
>>>> THP_STAT_EV_SWPOUT_FALLBACK
>>>> ...
>>>> THP_STAT_NR_ANON
>>>> THP_STAT_NR_FILE
>>>> ...
>>>> __THP_STAT_COUNT
>>>
>>> I'd even drop the "EV". "NR_ANON" vs "ANON_ALLOC" etc. is expressive enough.
>>
>> ok.
> 
> Hi David, Ryan,
> 
> I've named everything as follows. Please let me know if you have any further
> suggestions before I send the updated version :-)

Please treat all my comments below as optional - I'm just stating my preference!

> 
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index e896ca4760f6..cc13fa14aa32 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -264,6 +264,23 @@ unsigned long thp_vma_allowable_orders(struct
> vm_area_struct *vma,
>                                           enforce_sysfs, orders);
>  }
> 
> +enum thp_stat_item {
> +       THP_STAT_ANON_ALLOC,
> +       THP_STAT_ANON_ALLOC_FALLBACK,
> +       __THP_STAT_COUNT
> +};
> +
> +struct thp_state {
> +       unsigned long state[PMD_ORDER + 1][__THP_STAT_COUNT];

Why using "state" here? I think "stats" is more appropriate? (as in short for
"statistics") i.e. `struct thp_stats` and `unsigned long stats[][]`.

> +};
> +
> +DECLARE_PER_CPU(struct thp_state, thp_states);
> +
> +static inline void count_thp_state(int order, enum thp_stat_item item)
> +{
> +       this_cpu_inc(thp_states.state[order][item]);
> +}
> +
>  #define transparent_hugepage_use_zero_page()                           \
>         (transparent_hugepage_flags &                                   \
>          (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 9d4b2fbf6872..e704b4408181 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -526,6 +526,46 @@ static const struct kobj_type thpsize_ktype = {
>         .sysfs_ops = &kobj_sysfs_ops,
>  };
> 
> +DEFINE_PER_CPU(struct thp_state, thp_states) = {{{0}}};

Does this need to be explicitly zeroed? Won't that be the default initial state,
just like for regular globals? Perhaps PER_CPU is special?

> +
> +static unsigned long sum_thp_states(int order, enum thp_stat_item item)

Again, I'd call it sum_thp_stats().

> +{
> +       unsigned long sum = 0;
> +       int cpu;
> +
> +       for_each_online_cpu(cpu) {
> +               struct thp_state *this = &per_cpu(thp_states, cpu);
> +
> +               sum += this->state[order][item];
> +       }
> +
> +       return sum;
> +}
> +
> +#define THP_STATE_ATTR(_name, _index)                                  \

And THP_STATS_ATTR(); they are going to live in the "stats" directory after all.

> +static ssize_t _name##_show(struct kobject *kobj,                      \
> +                       struct kobj_attribute *attr, char *buf)         \
> +{                                                                      \
> +       int order = to_thpsize(kobj)->order;                            \
> +                                                                       \
> +       return sysfs_emit(buf, "%lu\n", sum_thp_states(order, _index)); \
> +}                                                                      \
> +static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
> +
> +THP_STATE_ATTR(anon_alloc, THP_STAT_ANON_ALLOC);
> +THP_STATE_ATTR(anon_alloc_fallback, THP_STAT_ANON_ALLOC_FALLBACK);
> 
>>
>>>
>>> --
>>> Cheers,
>>>
>>> David / dhildenb
> 
> Thanks
> Barry
Ryan Roberts April 5, 2024, 7:21 a.m. UTC | #11
On 05/04/2024 07:29, David Hildenbrand wrote:
> On 05.04.24 06:01, Barry Song wrote:
>> On Fri, Apr 5, 2024 at 3:57 PM Barry Song <21cnbao@gmail.com> wrote:
>>>
>>> On Fri, Apr 5, 2024 at 4:31 AM David Hildenbrand <david@redhat.com> wrote:
>>>>
>>>> On 04.04.24 09:21, Ryan Roberts wrote:
>>>>> On 03/04/2024 22:00, Barry Song wrote:
>>>>>> On Thu, Apr 4, 2024 at 12:48 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>>>>>
>>>>>>> On 03/04/2024 09:22, David Hildenbrand wrote:
>>>>>>>> On 03.04.24 05:55, Barry Song wrote:
>>>>>>>>> From: Barry Song <v-songbaohua@oppo.com>
>>>>>>>>>
>>>>>>>>> Profiling a system blindly with mTHP has become challenging due
>>>>>>>>> to the lack of visibility into its operations. Presenting the
>>>>>>>>> success rate of mTHP allocations appears to be pressing need.
>>>>>>>>>
>>>>>>>>> Recently, I've been experiencing significant difficulty debugging
>>>>>>>>> performance improvements and regressions without these figures.
>>>>>>>>> It's crucial for us to understand the true effectiveness of
>>>>>>>>> mTHP in real-world scenarios, especially in systems with
>>>>>>>>> fragmented memory.
>>>>>>>>>
>>>>>>>>> This patch sets up the framework for per-order mTHP counters,
>>>>>>>>> starting with the introduction of anon_alloc_success and
>>>>>>>>> anon_alloc_fail counters.  Incorporating additional counters
>>>>>>>>> should now be straightforward as well.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
>>>>>>>>> ---
>>>>>>>>>     -v3:
>>>>>>>>>     * save some memory as order-0 and order-1 can't be THP, Ryan;
>>>>>>>>>     * rename to anon_alloc as right now we only support anon to address
>>>>>>>>>       David's comment;
>>>>>>>>>     * drop a redundant "else", Ryan
>>>>>>>>>
>>>>>>>>>     include/linux/huge_mm.h | 18 ++++++++++++++
>>>>>>>>>     mm/huge_memory.c        | 54 +++++++++++++++++++++++++++++++++++++++++
>>>>>>>>>     mm/memory.c             |  2 ++
>>>>>>>>>     3 files changed, 74 insertions(+)
>>>>>>>>>
>>>>>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>>>>>>> index e896ca4760f6..5e9af6be9537 100644
>>>>>>>>> --- a/include/linux/huge_mm.h
>>>>>>>>> +++ b/include/linux/huge_mm.h
>>>>>>>>> @@ -70,6 +70,7 @@ extern struct kobj_attribute shmem_enabled_attr;
>>>>>>>>>      * (which is a limitation of the THP implementation).
>>>>>>>>>      */
>>>>>>>>>     #define THP_ORDERS_ALL_ANON    ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0)
>>>>>>>>> | BIT(1)))
>>>>>>>>> +#define THP_MIN_ORDER        2
>>>>>>>>>       /*
>>>>>>>>>      * Mask of all large folio orders supported for file THP.
>>>>>>>>> @@ -264,6 +265,23 @@ unsigned long thp_vma_allowable_orders(struct
>>>>>>>>> vm_area_struct *vma,
>>>>>>>>>                           enforce_sysfs, orders);
>>>>>>>>>     }
>>>>>>>>>     +enum thp_event_item {
>>>>>>>>> +    THP_ANON_ALLOC_SUCCESS,
>>>>>>>>> +    THP_ANON_ALLOC_FAIL,
>>>>>>>>> +    NR_THP_EVENT_ITEMS
>>>>>>>>> +};
>>>>>>>>
>>>>>>>> Maybe use a prefix that resembles matches the enum name and is "obviously"
>>>>>>>> different to the ones in vm_event_item.h, like
>>>>>>>>
>>>>>>>> enum thp_event {
>>>>>>>>       THP_EVENT_ANON_ALLOC_SUCCESS,
>>>>>>>>       THP_EVENT_ANON_ALLOC_FAIL,
>>>>>>>>       __THP_EVENT_COUNT,
>>>>>>>> };
>>>>>>>
>>>>>>> FWIW, I'd personally replace "event" with "stat". For me "event" only ever
>>>>>>> increments, but "stat" can increment and decrement. An event is a type of
>>>>>>> stat.
>>>>>>>
>>>>>>> You are only adding events for now, but we have identified a need for
>>>>>>> inc/dec
>>>>>>> stats that will be added in future.
>>>>>>
>>>>>> What about the below?
>>>>>>
>>>>>> enum thp_stat {
>>>
>>> It seems we still need to use enum thp_stat_item rather than thp_stat.
>>> This follows
>>> enum zone_stat_item
>>> enum numa_stat_item
>>> enum node_stat_item
> 
> 
> Right, we can do that.
> 
> [...]
> 
>>> ok.
>>
>> Hi David, Ryan,
>>
>> I've named everything as follows. Please let me know if you have any further
>> suggestions before I send the updated version :-)
>>
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index e896ca4760f6..cc13fa14aa32 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -264,6 +264,23 @@ unsigned long thp_vma_allowable_orders(struct
>> vm_area_struct *vma,
>>                                            enforce_sysfs, orders);
>>   }
>>
>> +enum thp_stat_item {
>> +       THP_STAT_ANON_ALLOC,
>> +       THP_STAT_ANON_ALLOC_FALLBACK,
>> +       __THP_STAT_COUNT
>> +};
>> +
> 
> 
> Final bikeshedding: so we want to call all of that MTHP_, mthp_ etc, to make it
> clearer that this is per mTHP size?

Perviously we concluded that mTHP was only for commit logs, and in the longer
term we wanted THP to mean the full set of sizes like it does for hugetlb.
Neither "mTHP" nor "multi-size THP" currently exist in the code base.

Personally I'd vote for leaving it as THP.
David Hildenbrand April 5, 2024, 9:04 a.m. UTC | #12
On 05.04.24 09:21, Ryan Roberts wrote:
> On 05/04/2024 07:29, David Hildenbrand wrote:
>> On 05.04.24 06:01, Barry Song wrote:
>>> On Fri, Apr 5, 2024 at 3:57 PM Barry Song <21cnbao@gmail.com> wrote:
>>>>
>>>> On Fri, Apr 5, 2024 at 4:31 AM David Hildenbrand <david@redhat.com> wrote:
>>>>>
>>>>> On 04.04.24 09:21, Ryan Roberts wrote:
>>>>>> On 03/04/2024 22:00, Barry Song wrote:
>>>>>>> On Thu, Apr 4, 2024 at 12:48 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>>>>>>
>>>>>>>> On 03/04/2024 09:22, David Hildenbrand wrote:
>>>>>>>>> On 03.04.24 05:55, Barry Song wrote:
>>>>>>>>>> From: Barry Song <v-songbaohua@oppo.com>
>>>>>>>>>>
>>>>>>>>>> Profiling a system blindly with mTHP has become challenging due
>>>>>>>>>> to the lack of visibility into its operations. Presenting the
>>>>>>>>>> success rate of mTHP allocations appears to be pressing need.
>>>>>>>>>>
>>>>>>>>>> Recently, I've been experiencing significant difficulty debugging
>>>>>>>>>> performance improvements and regressions without these figures.
>>>>>>>>>> It's crucial for us to understand the true effectiveness of
>>>>>>>>>> mTHP in real-world scenarios, especially in systems with
>>>>>>>>>> fragmented memory.
>>>>>>>>>>
>>>>>>>>>> This patch sets up the framework for per-order mTHP counters,
>>>>>>>>>> starting with the introduction of anon_alloc_success and
>>>>>>>>>> anon_alloc_fail counters.  Incorporating additional counters
>>>>>>>>>> should now be straightforward as well.
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
>>>>>>>>>> ---
>>>>>>>>>>      -v3:
>>>>>>>>>>      * save some memory as order-0 and order-1 can't be THP, Ryan;
>>>>>>>>>>      * rename to anon_alloc as right now we only support anon to address
>>>>>>>>>>        David's comment;
>>>>>>>>>>      * drop a redundant "else", Ryan
>>>>>>>>>>
>>>>>>>>>>      include/linux/huge_mm.h | 18 ++++++++++++++
>>>>>>>>>>      mm/huge_memory.c        | 54 +++++++++++++++++++++++++++++++++++++++++
>>>>>>>>>>      mm/memory.c             |  2 ++
>>>>>>>>>>      3 files changed, 74 insertions(+)
>>>>>>>>>>
>>>>>>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>>>>>>>> index e896ca4760f6..5e9af6be9537 100644
>>>>>>>>>> --- a/include/linux/huge_mm.h
>>>>>>>>>> +++ b/include/linux/huge_mm.h
>>>>>>>>>> @@ -70,6 +70,7 @@ extern struct kobj_attribute shmem_enabled_attr;
>>>>>>>>>>       * (which is a limitation of the THP implementation).
>>>>>>>>>>       */
>>>>>>>>>>      #define THP_ORDERS_ALL_ANON    ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0)
>>>>>>>>>> | BIT(1)))
>>>>>>>>>> +#define THP_MIN_ORDER        2
>>>>>>>>>>        /*
>>>>>>>>>>       * Mask of all large folio orders supported for file THP.
>>>>>>>>>> @@ -264,6 +265,23 @@ unsigned long thp_vma_allowable_orders(struct
>>>>>>>>>> vm_area_struct *vma,
>>>>>>>>>>                            enforce_sysfs, orders);
>>>>>>>>>>      }
>>>>>>>>>>      +enum thp_event_item {
>>>>>>>>>> +    THP_ANON_ALLOC_SUCCESS,
>>>>>>>>>> +    THP_ANON_ALLOC_FAIL,
>>>>>>>>>> +    NR_THP_EVENT_ITEMS
>>>>>>>>>> +};
>>>>>>>>>
>>>>>>>>> Maybe use a prefix that resembles matches the enum name and is "obviously"
>>>>>>>>> different to the ones in vm_event_item.h, like
>>>>>>>>>
>>>>>>>>> enum thp_event {
>>>>>>>>>        THP_EVENT_ANON_ALLOC_SUCCESS,
>>>>>>>>>        THP_EVENT_ANON_ALLOC_FAIL,
>>>>>>>>>        __THP_EVENT_COUNT,
>>>>>>>>> };
>>>>>>>>
>>>>>>>> FWIW, I'd personally replace "event" with "stat". For me "event" only ever
>>>>>>>> increments, but "stat" can increment and decrement. An event is a type of
>>>>>>>> stat.
>>>>>>>>
>>>>>>>> You are only adding events for now, but we have identified a need for
>>>>>>>> inc/dec
>>>>>>>> stats that will be added in future.
>>>>>>>
>>>>>>> What about the below?
>>>>>>>
>>>>>>> enum thp_stat {
>>>>
>>>> It seems we still need to use enum thp_stat_item rather than thp_stat.
>>>> This follows
>>>> enum zone_stat_item
>>>> enum numa_stat_item
>>>> enum node_stat_item
>>
>>
>> Right, we can do that.
>>
>> [...]
>>
>>>> ok.
>>>
>>> Hi David, Ryan,
>>>
>>> I've named everything as follows. Please let me know if you have any further
>>> suggestions before I send the updated version :-)
>>>
>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>> index e896ca4760f6..cc13fa14aa32 100644
>>> --- a/include/linux/huge_mm.h
>>> +++ b/include/linux/huge_mm.h
>>> @@ -264,6 +264,23 @@ unsigned long thp_vma_allowable_orders(struct
>>> vm_area_struct *vma,
>>>                                             enforce_sysfs, orders);
>>>    }
>>>
>>> +enum thp_stat_item {
>>> +       THP_STAT_ANON_ALLOC,
>>> +       THP_STAT_ANON_ALLOC_FALLBACK,
>>> +       __THP_STAT_COUNT
>>> +};
>>> +
>>
>>
>> Final bikeshedding: so we want to call all of that MTHP_, mthp_ etc, to make it
>> clearer that this is per mTHP size?
> 
> Perviously we concluded that mTHP was only for commit logs, and in the longer
> term we wanted THP to mean the full set of sizes like it does for hugetlb.
> Neither "mTHP" nor "multi-size THP" currently exist in the code base.

True, but some indication that these are "per THP size" might be reasonable.
Barry Song April 5, 2024, 9:08 a.m. UTC | #13
On Fri, Apr 5, 2024 at 8:18 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> On 05/04/2024 05:01, Barry Song wrote:
> > On Fri, Apr 5, 2024 at 3:57 PM Barry Song <21cnbao@gmail.com> wrote:
> >>
> >> On Fri, Apr 5, 2024 at 4:31 AM David Hildenbrand <david@redhat.com> wrote:
> >>>
> >>> On 04.04.24 09:21, Ryan Roberts wrote:
> >>>> On 03/04/2024 22:00, Barry Song wrote:
> >>>>> On Thu, Apr 4, 2024 at 12:48 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>>>>>
> >>>>>> On 03/04/2024 09:22, David Hildenbrand wrote:
> >>>>>>> On 03.04.24 05:55, Barry Song wrote:
> >>>>>>>> From: Barry Song <v-songbaohua@oppo.com>
> >>>>>>>>
> >>>>>>>> Profiling a system blindly with mTHP has become challenging due
> >>>>>>>> to the lack of visibility into its operations. Presenting the
> >>>>>>>> success rate of mTHP allocations appears to be pressing need.
> >>>>>>>>
> >>>>>>>> Recently, I've been experiencing significant difficulty debugging
> >>>>>>>> performance improvements and regressions without these figures.
> >>>>>>>> It's crucial for us to understand the true effectiveness of
> >>>>>>>> mTHP in real-world scenarios, especially in systems with
> >>>>>>>> fragmented memory.
> >>>>>>>>
> >>>>>>>> This patch sets up the framework for per-order mTHP counters,
> >>>>>>>> starting with the introduction of anon_alloc_success and
> >>>>>>>> anon_alloc_fail counters.  Incorporating additional counters
> >>>>>>>> should now be straightforward as well.
> >>>>>>>>
> >>>>>>>> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> >>>>>>>> ---
> >>>>>>>>    -v3:
> >>>>>>>>    * save some memory as order-0 and order-1 can't be THP, Ryan;
> >>>>>>>>    * rename to anon_alloc as right now we only support anon to address
> >>>>>>>>      David's comment;
> >>>>>>>>    * drop a redundant "else", Ryan
> >>>>>>>>
> >>>>>>>>    include/linux/huge_mm.h | 18 ++++++++++++++
> >>>>>>>>    mm/huge_memory.c        | 54 +++++++++++++++++++++++++++++++++++++++++
> >>>>>>>>    mm/memory.c             |  2 ++
> >>>>>>>>    3 files changed, 74 insertions(+)
> >>>>>>>>
> >>>>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >>>>>>>> index e896ca4760f6..5e9af6be9537 100644
> >>>>>>>> --- a/include/linux/huge_mm.h
> >>>>>>>> +++ b/include/linux/huge_mm.h
> >>>>>>>> @@ -70,6 +70,7 @@ extern struct kobj_attribute shmem_enabled_attr;
> >>>>>>>>     * (which is a limitation of the THP implementation).
> >>>>>>>>     */
> >>>>>>>>    #define THP_ORDERS_ALL_ANON    ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
> >>>>>>>> +#define THP_MIN_ORDER        2
> >>>>>>>>      /*
> >>>>>>>>     * Mask of all large folio orders supported for file THP.
> >>>>>>>> @@ -264,6 +265,23 @@ unsigned long thp_vma_allowable_orders(struct
> >>>>>>>> vm_area_struct *vma,
> >>>>>>>>                          enforce_sysfs, orders);
> >>>>>>>>    }
> >>>>>>>>    +enum thp_event_item {
> >>>>>>>> +    THP_ANON_ALLOC_SUCCESS,
> >>>>>>>> +    THP_ANON_ALLOC_FAIL,
> >>>>>>>> +    NR_THP_EVENT_ITEMS
> >>>>>>>> +};
> >>>>>>>
> >>>>>>> Maybe use a prefix that resembles matches the enum name and is "obviously"
> >>>>>>> different to the ones in vm_event_item.h, like
> >>>>>>>
> >>>>>>> enum thp_event {
> >>>>>>>      THP_EVENT_ANON_ALLOC_SUCCESS,
> >>>>>>>      THP_EVENT_ANON_ALLOC_FAIL,
> >>>>>>>      __THP_EVENT_COUNT,
> >>>>>>> };
> >>>>>>
> >>>>>> FWIW, I'd personally replace "event" with "stat". For me "event" only ever
> >>>>>> increments, but "stat" can increment and decrement. An event is a type of stat.
> >>>>>>
> >>>>>> You are only adding events for now, but we have identified a need for inc/dec
> >>>>>> stats that will be added in future.
> >>>>>
> >>>>> What about the below?
> >>>>>
> >>>>> enum thp_stat {
> >>
> >> It seems we still need to use enum thp_stat_item rather than thp_stat.
> >> This follows
> >> enum zone_stat_item
> >> enum numa_stat_item
> >> enum node_stat_item
> >>
> >> And most importantly, the below looks much better
> >>
> >>        enum thp_stat_item {
> >>               THP_STAT_ANON_ALLOC,
> >>               THP_STAT_ANON_ALLOC_FALLBACK,
> >>               __THP_STAT_COUNT
> >>        };
> >>
> >>        struct thp_state {
> >>               unsigned long state[PMD_ORDER + 1][__THP_STAT_COUNT];
> >>        };
> >>
> >>        DECLARE_PER_CPU(struct thp_state, thp_states);
> >>
> >> than
> >>
> >>        enum thp_stat {
> >>               THP_STAT_ANON_ALLOC,
> >>               THP_STAT_ANON_ALLOC_FALLBACK,
> >>               __THP_STAT_COUNT
> >>        };
> >>
> >>        struct thp_state {
> >>               unsigned long state[PMD_ORDER + 1][__THP_STAT_COUNT];
> >>        };
> >>
> >>>>>     THP_EVENT_ANON_ALLOC,
> >>>>>     THP_EVENT_ANON_ALLOC_FALLBACK,
> >>>>>     THP_EVENT_SWPOUT,
> >>>>>     THP_EVENT_SWPOUT_FALLBACK,
> >>>>>     ...
> >>>>>     THP_NR_ANON_PAGES,
> >>>>>     THP_NR_FILE_PAGES,
> >>>>
> >>>> I find this ambiguous; Is it the number of THPs or the number of base pages?
> >>>>
> >>>> I think David made the point about incorporating the enum name into the labels
> >>>> too, so that there can be no namespace confusion. How about:
> >>>>
> >>>> <enum>_<type>_<name>
> >>>>
> >>>> So:
> >>>>
> >>>> THP_STAT_EV_ANON_ALLOC
> >>>> THP_STAT_EV_ANON_ALLOC_FALLBACK
> >>>> THP_STAT_EV_ANON_PARTIAL
> >>>> THP_STAT_EV_SWPOUT
> >>>> THP_STAT_EV_SWPOUT_FALLBACK
> >>>> ...
> >>>> THP_STAT_NR_ANON
> >>>> THP_STAT_NR_FILE
> >>>> ...
> >>>> __THP_STAT_COUNT
> >>>
> >>> I'd even drop the "EV". "NR_ANON" vs "ANON_ALLOC" etc. is expressive enough.
> >>
> >> ok.
> >
> > Hi David, Ryan,
> >
> > I've named everything as follows. Please let me know if you have any further
> > suggestions before I send the updated version :-)
>
> Please treat all my comments below as optional - I'm just stating my preference!
>
> >
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index e896ca4760f6..cc13fa14aa32 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -264,6 +264,23 @@ unsigned long thp_vma_allowable_orders(struct
> > vm_area_struct *vma,
> >                                           enforce_sysfs, orders);
> >  }
> >
> > +enum thp_stat_item {
> > +       THP_STAT_ANON_ALLOC,
> > +       THP_STAT_ANON_ALLOC_FALLBACK,
> > +       __THP_STAT_COUNT
> > +};
> > +
> > +struct thp_state {
> > +       unsigned long state[PMD_ORDER + 1][__THP_STAT_COUNT];
>
> Why using "state" here? I think "stats" is more appropriate? (as in short for
> "statistics") i.e. `struct thp_stats` and `unsigned long stats[][]`.

Sounds good.

>
> > +};
> > +
> > +DECLARE_PER_CPU(struct thp_state, thp_states);
> > +
> > +static inline void count_thp_state(int order, enum thp_stat_item item)
> > +{
> > +       this_cpu_inc(thp_states.state[order][item]);
> > +}
> > +
> >  #define transparent_hugepage_use_zero_page()                           \
> >         (transparent_hugepage_flags &                                   \
> >          (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 9d4b2fbf6872..e704b4408181 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -526,6 +526,46 @@ static const struct kobj_type thpsize_ktype = {
> >         .sysfs_ops = &kobj_sysfs_ops,
> >  };
> >
> > +DEFINE_PER_CPU(struct thp_state, thp_states) = {{{0}}};
>
> Does this need to be explicitly zeroed? Won't that be the default initial state,
> just like for regular globals? Perhaps PER_CPU is special?

I really don't know. uninitialized data is usually put in bss section so kernel
will memset them to 0. but uninitialized per-cpu data is still in .data..percpu
section and I can't find where memset is done in this area as this section
can also contain initialized per-cpu data. Will compilers fill zero for
uninitialized per-cpu data automatically so kernel just thinks they are
initialized 0 data?

anyway, I copied the code from vmstat
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};

>
> > +
> > +static unsigned long sum_thp_states(int order, enum thp_stat_item item)
>
> Again, I'd call it sum_thp_stats().

sounds good.

>
> > +{
> > +       unsigned long sum = 0;
> > +       int cpu;
> > +
> > +       for_each_online_cpu(cpu) {
> > +               struct thp_state *this = &per_cpu(thp_states, cpu);
> > +
> > +               sum += this->state[order][item];
> > +       }
> > +
> > +       return sum;
> > +}
> > +
> > +#define THP_STATE_ATTR(_name, _index)                                  \
>
> And THP_STATS_ATTR(); they are going to live in the "stats" directory after all.

sounds good.

>
> > +static ssize_t _name##_show(struct kobject *kobj,                      \
> > +                       struct kobj_attribute *attr, char *buf)         \
> > +{                                                                      \
> > +       int order = to_thpsize(kobj)->order;                            \
> > +                                                                       \
> > +       return sysfs_emit(buf, "%lu\n", sum_thp_states(order, _index)); \
> > +}                                                                      \
> > +static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
> > +
> > +THP_STATE_ATTR(anon_alloc, THP_STAT_ANON_ALLOC);
> > +THP_STATE_ATTR(anon_alloc_fallback, THP_STAT_ANON_ALLOC_FALLBACK);
> >
> >>
> >>>
> >>> --
> >>> Cheers,
> >>>
> >>> David / dhildenb
> >
> > Thanks
> > Barry
>
Barry Song April 5, 2024, 9:24 a.m. UTC | #14
On Fri, Apr 5, 2024 at 10:04 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 05.04.24 09:21, Ryan Roberts wrote:
> > On 05/04/2024 07:29, David Hildenbrand wrote:
> >> On 05.04.24 06:01, Barry Song wrote:
> >>> On Fri, Apr 5, 2024 at 3:57 PM Barry Song <21cnbao@gmail.com> wrote:
> >>>>
> >>>> On Fri, Apr 5, 2024 at 4:31 AM David Hildenbrand <david@redhat.com> wrote:
> >>>>>
> >>>>> On 04.04.24 09:21, Ryan Roberts wrote:
> >>>>>> On 03/04/2024 22:00, Barry Song wrote:
> >>>>>>> On Thu, Apr 4, 2024 at 12:48 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>>>>>>>
> >>>>>>>> On 03/04/2024 09:22, David Hildenbrand wrote:
> >>>>>>>>> On 03.04.24 05:55, Barry Song wrote:
> >>>>>>>>>> From: Barry Song <v-songbaohua@oppo.com>
> >>>>>>>>>>
> >>>>>>>>>> Profiling a system blindly with mTHP has become challenging due
> >>>>>>>>>> to the lack of visibility into its operations. Presenting the
> >>>>>>>>>> success rate of mTHP allocations appears to be pressing need.
> >>>>>>>>>>
> >>>>>>>>>> Recently, I've been experiencing significant difficulty debugging
> >>>>>>>>>> performance improvements and regressions without these figures.
> >>>>>>>>>> It's crucial for us to understand the true effectiveness of
> >>>>>>>>>> mTHP in real-world scenarios, especially in systems with
> >>>>>>>>>> fragmented memory.
> >>>>>>>>>>
> >>>>>>>>>> This patch sets up the framework for per-order mTHP counters,
> >>>>>>>>>> starting with the introduction of anon_alloc_success and
> >>>>>>>>>> anon_alloc_fail counters.  Incorporating additional counters
> >>>>>>>>>> should now be straightforward as well.
> >>>>>>>>>>
> >>>>>>>>>> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> >>>>>>>>>> ---
> >>>>>>>>>>      -v3:
> >>>>>>>>>>      * save some memory as order-0 and order-1 can't be THP, Ryan;
> >>>>>>>>>>      * rename to anon_alloc as right now we only support anon to address
> >>>>>>>>>>        David's comment;
> >>>>>>>>>>      * drop a redundant "else", Ryan
> >>>>>>>>>>
> >>>>>>>>>>      include/linux/huge_mm.h | 18 ++++++++++++++
> >>>>>>>>>>      mm/huge_memory.c        | 54 +++++++++++++++++++++++++++++++++++++++++
> >>>>>>>>>>      mm/memory.c             |  2 ++
> >>>>>>>>>>      3 files changed, 74 insertions(+)
> >>>>>>>>>>
> >>>>>>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >>>>>>>>>> index e896ca4760f6..5e9af6be9537 100644
> >>>>>>>>>> --- a/include/linux/huge_mm.h
> >>>>>>>>>> +++ b/include/linux/huge_mm.h
> >>>>>>>>>> @@ -70,6 +70,7 @@ extern struct kobj_attribute shmem_enabled_attr;
> >>>>>>>>>>       * (which is a limitation of the THP implementation).
> >>>>>>>>>>       */
> >>>>>>>>>>      #define THP_ORDERS_ALL_ANON    ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0)
> >>>>>>>>>> | BIT(1)))
> >>>>>>>>>> +#define THP_MIN_ORDER        2
> >>>>>>>>>>        /*
> >>>>>>>>>>       * Mask of all large folio orders supported for file THP.
> >>>>>>>>>> @@ -264,6 +265,23 @@ unsigned long thp_vma_allowable_orders(struct
> >>>>>>>>>> vm_area_struct *vma,
> >>>>>>>>>>                            enforce_sysfs, orders);
> >>>>>>>>>>      }
> >>>>>>>>>>      +enum thp_event_item {
> >>>>>>>>>> +    THP_ANON_ALLOC_SUCCESS,
> >>>>>>>>>> +    THP_ANON_ALLOC_FAIL,
> >>>>>>>>>> +    NR_THP_EVENT_ITEMS
> >>>>>>>>>> +};
> >>>>>>>>>
> >>>>>>>>> Maybe use a prefix that resembles matches the enum name and is "obviously"
> >>>>>>>>> different to the ones in vm_event_item.h, like
> >>>>>>>>>
> >>>>>>>>> enum thp_event {
> >>>>>>>>>        THP_EVENT_ANON_ALLOC_SUCCESS,
> >>>>>>>>>        THP_EVENT_ANON_ALLOC_FAIL,
> >>>>>>>>>        __THP_EVENT_COUNT,
> >>>>>>>>> };
> >>>>>>>>
> >>>>>>>> FWIW, I'd personally replace "event" with "stat". For me "event" only ever
> >>>>>>>> increments, but "stat" can increment and decrement. An event is a type of
> >>>>>>>> stat.
> >>>>>>>>
> >>>>>>>> You are only adding events for now, but we have identified a need for
> >>>>>>>> inc/dec
> >>>>>>>> stats that will be added in future.
> >>>>>>>
> >>>>>>> What about the below?
> >>>>>>>
> >>>>>>> enum thp_stat {
> >>>>
> >>>> It seems we still need to use enum thp_stat_item rather than thp_stat.
> >>>> This follows
> >>>> enum zone_stat_item
> >>>> enum numa_stat_item
> >>>> enum node_stat_item
> >>
> >>
> >> Right, we can do that.
> >>
> >> [...]
> >>
> >>>> ok.
> >>>
> >>> Hi David, Ryan,
> >>>
> >>> I've named everything as follows. Please let me know if you have any further
> >>> suggestions before I send the updated version :-)
> >>>
> >>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >>> index e896ca4760f6..cc13fa14aa32 100644
> >>> --- a/include/linux/huge_mm.h
> >>> +++ b/include/linux/huge_mm.h
> >>> @@ -264,6 +264,23 @@ unsigned long thp_vma_allowable_orders(struct
> >>> vm_area_struct *vma,
> >>>                                             enforce_sysfs, orders);
> >>>    }
> >>>
> >>> +enum thp_stat_item {
> >>> +       THP_STAT_ANON_ALLOC,
> >>> +       THP_STAT_ANON_ALLOC_FALLBACK,
> >>> +       __THP_STAT_COUNT
> >>> +};
> >>> +
> >>
> >>
> >> Final bikeshedding: so we want to call all of that MTHP_, mthp_ etc, to make it
> >> clearer that this is per mTHP size?
> >
> > Perviously we concluded that mTHP was only for commit logs, and in the longer
> > term we wanted THP to mean the full set of sizes like it does for hugetlb.
> > Neither "mTHP" nor "multi-size THP" currently exist in the code base.
>
> True, but some indication that these are "per THP size" might be reasonable.

It seems quite reasonable, especially considering the existence of THP counters.
Additionally, there's even an overlap between THP counters and mTHP counters,
specifically for PMD_ORDER. This complexity could potentially confuse
readers of the code.

I'm perfectly fine with using "M/m" as a prefix, as long as sysfs
hasn't exposed it.
Moreover, "M/m" remains a preferable prefix given that "PER_SIZE" is
overly long,
and people are already familiar with the concept of mTHP.

>
> --
> Cheers,
>
> David / dhildenb
>
David Hildenbrand April 5, 2024, 10:15 a.m. UTC | #15
On 05.04.24 11:24, Barry Song wrote:
> On Fri, Apr 5, 2024 at 10:04 PM David Hildenbrand <david@redhat.com> wrote:
>>
>> On 05.04.24 09:21, Ryan Roberts wrote:
>>> On 05/04/2024 07:29, David Hildenbrand wrote:
>>>> On 05.04.24 06:01, Barry Song wrote:
>>>>> On Fri, Apr 5, 2024 at 3:57 PM Barry Song <21cnbao@gmail.com> wrote:
>>>>>>
>>>>>> On Fri, Apr 5, 2024 at 4:31 AM David Hildenbrand <david@redhat.com> wrote:
>>>>>>>
>>>>>>> On 04.04.24 09:21, Ryan Roberts wrote:
>>>>>>>> On 03/04/2024 22:00, Barry Song wrote:
>>>>>>>>> On Thu, Apr 4, 2024 at 12:48 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>>>>>>>>
>>>>>>>>>> On 03/04/2024 09:22, David Hildenbrand wrote:
>>>>>>>>>>> On 03.04.24 05:55, Barry Song wrote:
>>>>>>>>>>>> From: Barry Song <v-songbaohua@oppo.com>
>>>>>>>>>>>>
>>>>>>>>>>>> Profiling a system blindly with mTHP has become challenging due
>>>>>>>>>>>> to the lack of visibility into its operations. Presenting the
>>>>>>>>>>>> success rate of mTHP allocations appears to be pressing need.
>>>>>>>>>>>>
>>>>>>>>>>>> Recently, I've been experiencing significant difficulty debugging
>>>>>>>>>>>> performance improvements and regressions without these figures.
>>>>>>>>>>>> It's crucial for us to understand the true effectiveness of
>>>>>>>>>>>> mTHP in real-world scenarios, especially in systems with
>>>>>>>>>>>> fragmented memory.
>>>>>>>>>>>>
>>>>>>>>>>>> This patch sets up the framework for per-order mTHP counters,
>>>>>>>>>>>> starting with the introduction of anon_alloc_success and
>>>>>>>>>>>> anon_alloc_fail counters.  Incorporating additional counters
>>>>>>>>>>>> should now be straightforward as well.
>>>>>>>>>>>>
>>>>>>>>>>>> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
>>>>>>>>>>>> ---
>>>>>>>>>>>>       -v3:
>>>>>>>>>>>>       * save some memory as order-0 and order-1 can't be THP, Ryan;
>>>>>>>>>>>>       * rename to anon_alloc as right now we only support anon to address
>>>>>>>>>>>>         David's comment;
>>>>>>>>>>>>       * drop a redundant "else", Ryan
>>>>>>>>>>>>
>>>>>>>>>>>>       include/linux/huge_mm.h | 18 ++++++++++++++
>>>>>>>>>>>>       mm/huge_memory.c        | 54 +++++++++++++++++++++++++++++++++++++++++
>>>>>>>>>>>>       mm/memory.c             |  2 ++
>>>>>>>>>>>>       3 files changed, 74 insertions(+)
>>>>>>>>>>>>
>>>>>>>>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>>>>>>>>>> index e896ca4760f6..5e9af6be9537 100644
>>>>>>>>>>>> --- a/include/linux/huge_mm.h
>>>>>>>>>>>> +++ b/include/linux/huge_mm.h
>>>>>>>>>>>> @@ -70,6 +70,7 @@ extern struct kobj_attribute shmem_enabled_attr;
>>>>>>>>>>>>        * (which is a limitation of the THP implementation).
>>>>>>>>>>>>        */
>>>>>>>>>>>>       #define THP_ORDERS_ALL_ANON    ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0)
>>>>>>>>>>>> | BIT(1)))
>>>>>>>>>>>> +#define THP_MIN_ORDER        2
>>>>>>>>>>>>         /*
>>>>>>>>>>>>        * Mask of all large folio orders supported for file THP.
>>>>>>>>>>>> @@ -264,6 +265,23 @@ unsigned long thp_vma_allowable_orders(struct
>>>>>>>>>>>> vm_area_struct *vma,
>>>>>>>>>>>>                             enforce_sysfs, orders);
>>>>>>>>>>>>       }
>>>>>>>>>>>>       +enum thp_event_item {
>>>>>>>>>>>> +    THP_ANON_ALLOC_SUCCESS,
>>>>>>>>>>>> +    THP_ANON_ALLOC_FAIL,
>>>>>>>>>>>> +    NR_THP_EVENT_ITEMS
>>>>>>>>>>>> +};
>>>>>>>>>>>
>>>>>>>>>>> Maybe use a prefix that resembles matches the enum name and is "obviously"
>>>>>>>>>>> different to the ones in vm_event_item.h, like
>>>>>>>>>>>
>>>>>>>>>>> enum thp_event {
>>>>>>>>>>>         THP_EVENT_ANON_ALLOC_SUCCESS,
>>>>>>>>>>>         THP_EVENT_ANON_ALLOC_FAIL,
>>>>>>>>>>>         __THP_EVENT_COUNT,
>>>>>>>>>>> };
>>>>>>>>>>
>>>>>>>>>> FWIW, I'd personally replace "event" with "stat". For me "event" only ever
>>>>>>>>>> increments, but "stat" can increment and decrement. An event is a type of
>>>>>>>>>> stat.
>>>>>>>>>>
>>>>>>>>>> You are only adding events for now, but we have identified a need for
>>>>>>>>>> inc/dec
>>>>>>>>>> stats that will be added in future.
>>>>>>>>>
>>>>>>>>> What about the below?
>>>>>>>>>
>>>>>>>>> enum thp_stat {
>>>>>>
>>>>>> It seems we still need to use enum thp_stat_item rather than thp_stat.
>>>>>> This follows
>>>>>> enum zone_stat_item
>>>>>> enum numa_stat_item
>>>>>> enum node_stat_item
>>>>
>>>>
>>>> Right, we can do that.
>>>>
>>>> [...]
>>>>
>>>>>> ok.
>>>>>
>>>>> Hi David, Ryan,
>>>>>
>>>>> I've named everything as follows. Please let me know if you have any further
>>>>> suggestions before I send the updated version :-)
>>>>>
>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>>> index e896ca4760f6..cc13fa14aa32 100644
>>>>> --- a/include/linux/huge_mm.h
>>>>> +++ b/include/linux/huge_mm.h
>>>>> @@ -264,6 +264,23 @@ unsigned long thp_vma_allowable_orders(struct
>>>>> vm_area_struct *vma,
>>>>>                                              enforce_sysfs, orders);
>>>>>     }
>>>>>
>>>>> +enum thp_stat_item {
>>>>> +       THP_STAT_ANON_ALLOC,
>>>>> +       THP_STAT_ANON_ALLOC_FALLBACK,
>>>>> +       __THP_STAT_COUNT
>>>>> +};
>>>>> +
>>>>
>>>>
>>>> Final bikeshedding: so we want to call all of that MTHP_, mthp_ etc, to make it
>>>> clearer that this is per mTHP size?
>>>
>>> Perviously we concluded that mTHP was only for commit logs, and in the longer
>>> term we wanted THP to mean the full set of sizes like it does for hugetlb.
>>> Neither "mTHP" nor "multi-size THP" currently exist in the code base.
>>
>> True, but some indication that these are "per THP size" might be reasonable.
> 
> It seems quite reasonable, especially considering the existence of THP counters.
> Additionally, there's even an overlap between THP counters and mTHP counters,
> specifically for PMD_ORDER. This complexity could potentially confuse
> readers of the code.
> 
> I'm perfectly fine with using "M/m" as a prefix, as long as sysfs
> hasn't exposed it.
> Moreover, "M/m" remains a preferable prefix given that "PER_SIZE" is
> overly long,
> and people are already familiar with the concept of mTHP.

I think the important part is to not use "mTHP" in any of our ABI. As we 
expose these stats per THP size, using mTHP internally should be ok. But 
I'm happy to hear alternatives that express "per THP size" in a better way.
Barry Song April 5, 2024, 10:51 a.m. UTC | #16
On Fri, Apr 5, 2024 at 11:15 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 05.04.24 11:24, Barry Song wrote:
> > On Fri, Apr 5, 2024 at 10:04 PM David Hildenbrand <david@redhat.com> wrote:
> >>
> >> On 05.04.24 09:21, Ryan Roberts wrote:
> >>> On 05/04/2024 07:29, David Hildenbrand wrote:
> >>>> On 05.04.24 06:01, Barry Song wrote:
> >>>>> On Fri, Apr 5, 2024 at 3:57 PM Barry Song <21cnbao@gmail.com> wrote:
> >>>>>>
> >>>>>> On Fri, Apr 5, 2024 at 4:31 AM David Hildenbrand <david@redhat.com> wrote:
> >>>>>>>
> >>>>>>> On 04.04.24 09:21, Ryan Roberts wrote:
> >>>>>>>> On 03/04/2024 22:00, Barry Song wrote:
> >>>>>>>>> On Thu, Apr 4, 2024 at 12:48 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>>>>>>>>>
> >>>>>>>>>> On 03/04/2024 09:22, David Hildenbrand wrote:
> >>>>>>>>>>> On 03.04.24 05:55, Barry Song wrote:
> >>>>>>>>>>>> From: Barry Song <v-songbaohua@oppo.com>
> >>>>>>>>>>>>
> >>>>>>>>>>>> Profiling a system blindly with mTHP has become challenging due
> >>>>>>>>>>>> to the lack of visibility into its operations. Presenting the
> >>>>>>>>>>>> success rate of mTHP allocations appears to be pressing need.
> >>>>>>>>>>>>
> >>>>>>>>>>>> Recently, I've been experiencing significant difficulty debugging
> >>>>>>>>>>>> performance improvements and regressions without these figures.
> >>>>>>>>>>>> It's crucial for us to understand the true effectiveness of
> >>>>>>>>>>>> mTHP in real-world scenarios, especially in systems with
> >>>>>>>>>>>> fragmented memory.
> >>>>>>>>>>>>
> >>>>>>>>>>>> This patch sets up the framework for per-order mTHP counters,
> >>>>>>>>>>>> starting with the introduction of anon_alloc_success and
> >>>>>>>>>>>> anon_alloc_fail counters.  Incorporating additional counters
> >>>>>>>>>>>> should now be straightforward as well.
> >>>>>>>>>>>>
> >>>>>>>>>>>> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> >>>>>>>>>>>> ---
> >>>>>>>>>>>>       -v3:
> >>>>>>>>>>>>       * save some memory as order-0 and order-1 can't be THP, Ryan;
> >>>>>>>>>>>>       * rename to anon_alloc as right now we only support anon to address
> >>>>>>>>>>>>         David's comment;
> >>>>>>>>>>>>       * drop a redundant "else", Ryan
> >>>>>>>>>>>>
> >>>>>>>>>>>>       include/linux/huge_mm.h | 18 ++++++++++++++
> >>>>>>>>>>>>       mm/huge_memory.c        | 54 +++++++++++++++++++++++++++++++++++++++++
> >>>>>>>>>>>>       mm/memory.c             |  2 ++
> >>>>>>>>>>>>       3 files changed, 74 insertions(+)
> >>>>>>>>>>>>
> >>>>>>>>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >>>>>>>>>>>> index e896ca4760f6..5e9af6be9537 100644
> >>>>>>>>>>>> --- a/include/linux/huge_mm.h
> >>>>>>>>>>>> +++ b/include/linux/huge_mm.h
> >>>>>>>>>>>> @@ -70,6 +70,7 @@ extern struct kobj_attribute shmem_enabled_attr;
> >>>>>>>>>>>>        * (which is a limitation of the THP implementation).
> >>>>>>>>>>>>        */
> >>>>>>>>>>>>       #define THP_ORDERS_ALL_ANON    ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0)
> >>>>>>>>>>>> | BIT(1)))
> >>>>>>>>>>>> +#define THP_MIN_ORDER        2
> >>>>>>>>>>>>         /*
> >>>>>>>>>>>>        * Mask of all large folio orders supported for file THP.
> >>>>>>>>>>>> @@ -264,6 +265,23 @@ unsigned long thp_vma_allowable_orders(struct
> >>>>>>>>>>>> vm_area_struct *vma,
> >>>>>>>>>>>>                             enforce_sysfs, orders);
> >>>>>>>>>>>>       }
> >>>>>>>>>>>>       +enum thp_event_item {
> >>>>>>>>>>>> +    THP_ANON_ALLOC_SUCCESS,
> >>>>>>>>>>>> +    THP_ANON_ALLOC_FAIL,
> >>>>>>>>>>>> +    NR_THP_EVENT_ITEMS
> >>>>>>>>>>>> +};
> >>>>>>>>>>>
> >>>>>>>>>>> Maybe use a prefix that resembles matches the enum name and is "obviously"
> >>>>>>>>>>> different to the ones in vm_event_item.h, like
> >>>>>>>>>>>
> >>>>>>>>>>> enum thp_event {
> >>>>>>>>>>>         THP_EVENT_ANON_ALLOC_SUCCESS,
> >>>>>>>>>>>         THP_EVENT_ANON_ALLOC_FAIL,
> >>>>>>>>>>>         __THP_EVENT_COUNT,
> >>>>>>>>>>> };
> >>>>>>>>>>
> >>>>>>>>>> FWIW, I'd personally replace "event" with "stat". For me "event" only ever
> >>>>>>>>>> increments, but "stat" can increment and decrement. An event is a type of
> >>>>>>>>>> stat.
> >>>>>>>>>>
> >>>>>>>>>> You are only adding events for now, but we have identified a need for
> >>>>>>>>>> inc/dec
> >>>>>>>>>> stats that will be added in future.
> >>>>>>>>>
> >>>>>>>>> What about the below?
> >>>>>>>>>
> >>>>>>>>> enum thp_stat {
> >>>>>>
> >>>>>> It seems we still need to use enum thp_stat_item rather than thp_stat.
> >>>>>> This follows
> >>>>>> enum zone_stat_item
> >>>>>> enum numa_stat_item
> >>>>>> enum node_stat_item
> >>>>
> >>>>
> >>>> Right, we can do that.
> >>>>
> >>>> [...]
> >>>>
> >>>>>> ok.
> >>>>>
> >>>>> Hi David, Ryan,
> >>>>>
> >>>>> I've named everything as follows. Please let me know if you have any further
> >>>>> suggestions before I send the updated version :-)
> >>>>>
> >>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >>>>> index e896ca4760f6..cc13fa14aa32 100644
> >>>>> --- a/include/linux/huge_mm.h
> >>>>> +++ b/include/linux/huge_mm.h
> >>>>> @@ -264,6 +264,23 @@ unsigned long thp_vma_allowable_orders(struct
> >>>>> vm_area_struct *vma,
> >>>>>                                              enforce_sysfs, orders);
> >>>>>     }
> >>>>>
> >>>>> +enum thp_stat_item {
> >>>>> +       THP_STAT_ANON_ALLOC,
> >>>>> +       THP_STAT_ANON_ALLOC_FALLBACK,
> >>>>> +       __THP_STAT_COUNT
> >>>>> +};
> >>>>> +
> >>>>
> >>>>
> >>>> Final bikeshedding: so we want to call all of that MTHP_, mthp_ etc, to make it
> >>>> clearer that this is per mTHP size?
> >>>
> >>> Perviously we concluded that mTHP was only for commit logs, and in the longer
> >>> term we wanted THP to mean the full set of sizes like it does for hugetlb.
> >>> Neither "mTHP" nor "multi-size THP" currently exist in the code base.
> >>
> >> True, but some indication that these are "per THP size" might be reasonable.
> >
> > It seems quite reasonable, especially considering the existence of THP counters.
> > Additionally, there's even an overlap between THP counters and mTHP counters,
> > specifically for PMD_ORDER. This complexity could potentially confuse
> > readers of the code.
> >
> > I'm perfectly fine with using "M/m" as a prefix, as long as sysfs
> > hasn't exposed it.
> > Moreover, "M/m" remains a preferable prefix given that "PER_SIZE" is
> > overly long,
> > and people are already familiar with the concept of mTHP.
>
> I think the important part is to not use "mTHP" in any of our ABI. As we
> expose these stats per THP size, using mTHP internally should be ok. But
> I'm happy to hear alternatives that express "per THP size" in a better way.

Indeed, as long as we maintain stability in the node names, the designation
of "mTHP" remains merely an implementation detail. Therefore, in version 4,
I've opted to utilize "mthp" for these macros and variables. Once we discover
a more suitable approach, we can proceed with additional renaming as
necessary :-)

At present, we've implemented mthp allocation within PF and are executing
non-split swapout. It's crucial now for us to assess their effectiveness and
operational scope.

>
> --
> Cheers,
>
> David / dhildenb
>

Thanks
Barry
diff mbox series

Patch

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e896ca4760f6..5e9af6be9537 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -70,6 +70,7 @@  extern struct kobj_attribute shmem_enabled_attr;
  * (which is a limitation of the THP implementation).
  */
 #define THP_ORDERS_ALL_ANON	((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
+#define THP_MIN_ORDER		2
 
 /*
  * Mask of all large folio orders supported for file THP.
@@ -264,6 +265,23 @@  unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
 					  enforce_sysfs, orders);
 }
 
+enum thp_event_item {
+	THP_ANON_ALLOC_SUCCESS,
+	THP_ANON_ALLOC_FAIL,
+	NR_THP_EVENT_ITEMS
+};
+
+struct thp_event_state {
+	unsigned long event[PMD_ORDER + 1 - THP_MIN_ORDER][NR_THP_EVENT_ITEMS];
+};
+
+DECLARE_PER_CPU(struct thp_event_state, thp_event_states);
+
+static inline void count_thp_event(int order, enum thp_event_item item)
+{
+	this_cpu_inc(thp_event_states.event[order - THP_MIN_ORDER][item]);
+}
+
 #define transparent_hugepage_use_zero_page()				\
 	(transparent_hugepage_flags &					\
 	 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3ca9282a0dc9..e283039719f2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -526,6 +526,52 @@  static const struct kobj_type thpsize_ktype = {
 	.sysfs_ops = &kobj_sysfs_ops,
 };
 
+DEFINE_PER_CPU(struct thp_event_state, thp_event_states) = {{{0}}};
+
+static unsigned long sum_thp_events(int order, enum thp_event_item item)
+{
+	unsigned long sum = 0;
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		struct thp_event_state *this = &per_cpu(thp_event_states, cpu);
+
+		sum += this->event[order - THP_MIN_ORDER][item];
+	}
+
+	return sum;
+}
+
+static ssize_t anon_alloc_success_show(struct kobject *kobj,
+				   struct kobj_attribute *attr, char *buf)
+{
+	int order = to_thpsize(kobj)->order;
+
+	return sysfs_emit(buf, "%lu\n", sum_thp_events(order, THP_ANON_ALLOC_SUCCESS));
+}
+
+static ssize_t anon_alloc_fail_show(struct kobject *kobj,
+				   struct kobj_attribute *attr, char *buf)
+{
+	int order = to_thpsize(kobj)->order;
+
+	return sysfs_emit(buf, "%lu\n", sum_thp_events(order, THP_ANON_ALLOC_FAIL));
+}
+
+static struct kobj_attribute anon_alloc_success_attr = __ATTR_RO(anon_alloc_success);
+static struct kobj_attribute anon_alloc_fail_attr = __ATTR_RO(anon_alloc_fail);
+
+static struct attribute *stats_attrs[] = {
+	&anon_alloc_success_attr.attr,
+	&anon_alloc_fail_attr.attr,
+	NULL,
+};
+
+static struct attribute_group stats_attr_group = {
+	.name = "stats",
+	.attrs = stats_attrs,
+};
+
 static struct thpsize *thpsize_create(int order, struct kobject *parent)
 {
 	unsigned long size = (PAGE_SIZE << order) / SZ_1K;
@@ -549,6 +595,12 @@  static struct thpsize *thpsize_create(int order, struct kobject *parent)
 		return ERR_PTR(ret);
 	}
 
+	ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group);
+	if (ret) {
+		kobject_put(&thpsize->kobj);
+		return ERR_PTR(ret);
+	}
+
 	thpsize->order = order;
 	return thpsize;
 }
@@ -1050,8 +1102,10 @@  vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
 	if (unlikely(!folio)) {
 		count_vm_event(THP_FAULT_FALLBACK);
+		count_thp_event(HPAGE_PMD_ORDER, THP_ANON_ALLOC_FAIL);
 		return VM_FAULT_FALLBACK;
 	}
+	count_thp_event(HPAGE_PMD_ORDER, THP_ANON_ALLOC_SUCCESS);
 	return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
 }
 
diff --git a/mm/memory.c b/mm/memory.c
index 912cd738ec03..40de2b7a73ab 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4374,8 +4374,10 @@  static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 			}
 			folio_throttle_swaprate(folio, gfp);
 			clear_huge_page(&folio->page, vmf->address, 1 << order);
+			count_thp_event(order, THP_ANON_ALLOC_SUCCESS);
 			return folio;
 		}
+		count_thp_event(order, THP_ANON_ALLOC_FAIL);
 next:
 		order = next_order(&orders, order);
 	}