
[1/2] mm: add per-order mTHP split counters

Message ID 20240424135148.30422-2-ioworker0@gmail.com (mailing list archive)
State New
Series mm: introduce per-order mTHP split counters

Commit Message

Lance Yang April 24, 2024, 1:51 p.m. UTC
At present, the split counters in THP statistics no longer include
PTE-mapped mTHP. Therefore, this commit introduces per-order mTHP split
counters to monitor the frequency of mTHP splits. This will assist
developers in better analyzing and optimizing system performance.

/sys/kernel/mm/transparent_hugepage/hugepages-<size>/stats
        split_page
        split_page_failed
        deferred_split_page

Signed-off-by: Lance Yang <ioworker0@gmail.com>
---
 include/linux/huge_mm.h |  3 +++
 mm/huge_memory.c        | 14 ++++++++++++--
 2 files changed, 15 insertions(+), 2 deletions(-)
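
Once a kernel carrying this patch is booted, the new per-order counters appear
as plain decimal files under the per-size stats directory listed above. Below
is a minimal sketch of a userspace reader; the hugepages-64kB directory is
only an example, since which hugepages-<size> directories exist depends on the
architecture and base page size, and the three counters are only present with
this patch applied.

#include <stdio.h>

int main(void)
{
        /*
         * Example directory only: enumerate the hugepages-<size> entries on
         * a real system instead of hard-coding one.
         */
        const char *base =
                "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats";
        const char *counters[] = {
                "split_page", "split_page_failed", "deferred_split_page",
        };
        char path[256];
        unsigned long value;

        for (int i = 0; i < 3; i++) {
                snprintf(path, sizeof(path), "%s/%s", base, counters[i]);
                FILE *f = fopen(path, "r");

                if (!f)
                        continue;       /* counter not available here */
                if (fscanf(f, "%lu", &value) == 1)
                        printf("%-20s %lu\n", counters[i], value);
                fclose(f);
        }
        return 0;
}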

Comments

Ryan Roberts April 24, 2024, 3:41 p.m. UTC | #1
+ Barry

On 24/04/2024 14:51, Lance Yang wrote:
> At present, the split counters in THP statistics no longer include
> PTE-mapped mTHP. Therefore, this commit introduces per-order mTHP split
> counters to monitor the frequency of mTHP splits. This will assist
> developers in better analyzing and optimizing system performance.
> 
> /sys/kernel/mm/transparent_hugepage/hugepages-<size>/stats
>         split_page
>         split_page_failed
>         deferred_split_page
> 
> Signed-off-by: Lance Yang <ioworker0@gmail.com>
> ---
>  include/linux/huge_mm.h |  3 +++
>  mm/huge_memory.c        | 14 ++++++++++++--
>  2 files changed, 15 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 56c7ea73090b..7b9c6590e1f7 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -272,6 +272,9 @@ enum mthp_stat_item {
>  	MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
>  	MTHP_STAT_ANON_SWPOUT,
>  	MTHP_STAT_ANON_SWPOUT_FALLBACK,
> +	MTHP_STAT_SPLIT_PAGE,
> +	MTHP_STAT_SPLIT_PAGE_FAILED,
> +	MTHP_STAT_DEFERRED_SPLIT_PAGE,
>  	__MTHP_STAT_COUNT
>  };
>  
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 055df5aac7c3..52db888e47a6 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -557,6 +557,9 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
>  DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>  DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT);
>  DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback, MTHP_STAT_ANON_SWPOUT_FALLBACK);
> +DEFINE_MTHP_STAT_ATTR(split_page, MTHP_STAT_SPLIT_PAGE);
> +DEFINE_MTHP_STAT_ATTR(split_page_failed, MTHP_STAT_SPLIT_PAGE_FAILED);
> +DEFINE_MTHP_STAT_ATTR(deferred_split_page, MTHP_STAT_DEFERRED_SPLIT_PAGE);
>  
>  static struct attribute *stats_attrs[] = {
>  	&anon_fault_alloc_attr.attr,
> @@ -564,6 +567,9 @@ static struct attribute *stats_attrs[] = {
>  	&anon_fault_fallback_charge_attr.attr,
>  	&anon_swpout_attr.attr,
>  	&anon_swpout_fallback_attr.attr,
> +	&split_page_attr.attr,
> +	&split_page_failed_attr.attr,
> +	&deferred_split_page_attr.attr,
>  	NULL,
>  };
>  
> @@ -3083,7 +3089,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>  	XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
>  	struct anon_vma *anon_vma = NULL;
>  	struct address_space *mapping = NULL;
> -	bool is_thp = folio_test_pmd_mappable(folio);
> +	int order = folio_order(folio);
>  	int extra_pins, ret;
>  	pgoff_t end;
>  	bool is_hzp;
> @@ -3262,8 +3268,10 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>  		i_mmap_unlock_read(mapping);
>  out:
>  	xas_destroy(&xas);
> -	if (is_thp)
> +	if (order >= HPAGE_PMD_ORDER)
>  		count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
> +	count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT_PAGE :
> +				      MTHP_STAT_SPLIT_PAGE_FAILED);
>  	return ret;
>  }
>  
> @@ -3327,6 +3335,8 @@ void deferred_split_folio(struct folio *folio)
>  	if (list_empty(&folio->_deferred_list)) {
>  		if (folio_test_pmd_mappable(folio))
>  			count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> +		count_mthp_stat(folio_order(folio),
> +				MTHP_STAT_DEFERRED_SPLIT_PAGE);

There is a very long conversation with Barry about adding a 'global "mTHP became
partially mapped 1 or more processes" counter (inc only)', which terminates at
[1]. There is a lot of discussion about the required semantics around the need
for partial map to cover alignment and contiguity as well as whether all pages
are mapped, and to trigger once it becomes partial in at least 1 process.

MTHP_STAT_DEFERRED_SPLIT_PAGE is giving much simpler semantics, but less
information as a result. Barry, what's your view here? I'm guessing this doesn't
quite solve what you are looking for?

[1] https://lore.kernel.org/linux-mm/6cc7d781-884f-4d8f-a175-8609732b87eb@arm.com/

Thanks,
Ryan

>  		list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
>  		ds_queue->split_queue_len++;
>  #ifdef CONFIG_MEMCG
Bang Li April 24, 2024, 5:12 p.m. UTC | #2
Hey Lance,

On 2024/4/24 21:51, Lance Yang wrote:

> At present, the split counters in THP statistics no longer include
> PTE-mapped mTHP. Therefore, this commit introduces per-order mTHP split
> counters to monitor the frequency of mTHP splits. This will assist
> developers in better analyzing and optimizing system performance.
>
> /sys/kernel/mm/transparent_hugepage/hugepages-<size>/stats
>          split_page
>          split_page_failed
>          deferred_split_page
>
> Signed-off-by: Lance Yang <ioworker0@gmail.com>
> ---
>   include/linux/huge_mm.h |  3 +++
>   mm/huge_memory.c        | 14 ++++++++++++--
>   2 files changed, 15 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 56c7ea73090b..7b9c6590e1f7 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -272,6 +272,9 @@ enum mthp_stat_item {
>   	MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
>   	MTHP_STAT_ANON_SWPOUT,
>   	MTHP_STAT_ANON_SWPOUT_FALLBACK,
> +	MTHP_STAT_SPLIT_PAGE,
> +	MTHP_STAT_SPLIT_PAGE_FAILED,
> +	MTHP_STAT_DEFERRED_SPLIT_PAGE,
>   	__MTHP_STAT_COUNT
>   };
>   
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 055df5aac7c3..52db888e47a6 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -557,6 +557,9 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
>   DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>   DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT);
>   DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback, MTHP_STAT_ANON_SWPOUT_FALLBACK);
> +DEFINE_MTHP_STAT_ATTR(split_page, MTHP_STAT_SPLIT_PAGE);
> +DEFINE_MTHP_STAT_ATTR(split_page_failed, MTHP_STAT_SPLIT_PAGE_FAILED);
> +DEFINE_MTHP_STAT_ATTR(deferred_split_page, MTHP_STAT_DEFERRED_SPLIT_PAGE);
>   
>   static struct attribute *stats_attrs[] = {
>   	&anon_fault_alloc_attr.attr,
> @@ -564,6 +567,9 @@ static struct attribute *stats_attrs[] = {
>   	&anon_fault_fallback_charge_attr.attr,
>   	&anon_swpout_attr.attr,
>   	&anon_swpout_fallback_attr.attr,
> +	&split_page_attr.attr,
> +	&split_page_failed_attr.attr,
> +	&deferred_split_page_attr.attr,
>   	NULL,
>   };
>   
> @@ -3083,7 +3089,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>   	XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
>   	struct anon_vma *anon_vma = NULL;
>   	struct address_space *mapping = NULL;
> -	bool is_thp = folio_test_pmd_mappable(folio);
> +	int order = folio_order(folio);
>   	int extra_pins, ret;
>   	pgoff_t end;
>   	bool is_hzp;
> @@ -3262,8 +3268,10 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>   		i_mmap_unlock_read(mapping);
>   out:
>   	xas_destroy(&xas);
> -	if (is_thp)
> +	if (order >= HPAGE_PMD_ORDER)
>   		count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
> +	count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT_PAGE :
> +				      MTHP_STAT_SPLIT_PAGE_FAILED);
>   	return ret;
>   }
>   
> @@ -3327,6 +3335,8 @@ void deferred_split_folio(struct folio *folio)
>   	if (list_empty(&folio->_deferred_list)) {
>   		if (folio_test_pmd_mappable(folio))
>   			count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> +		count_mthp_stat(folio_order(folio),
> +				MTHP_STAT_DEFERRED_SPLIT_PAGE);
>   		list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
>   		ds_queue->split_queue_len++;
>   #ifdef CONFIG_MEMCG

My opinion can be ignored :). Would it be better to modify the
deferred_split_folio function as follows? I'm not sure.

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 055df5aac7c3..e8562e8630b1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3299,12 +3299,13 @@ void deferred_split_folio(struct folio *folio)
        struct mem_cgroup *memcg = folio_memcg(folio);
 #endif
        unsigned long flags;
+       int order = folio_order(folio);

        /*
         * Order 1 folios have no space for a deferred list, but we also
         * won't waste much memory by not adding them to the deferred list.
         */
-       if (folio_order(folio) <= 1)
+       if (order <= 1)
                return;

        /*
@@ -3325,8 +3326,9 @@ void deferred_split_folio(struct folio *folio)

        spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
        if (list_empty(&folio->_deferred_list)) {
-               if (folio_test_pmd_mappable(folio))
+               if (order >= HPAGE_PMD_ORDER)
                        count_vm_event(THP_DEFERRED_SPLIT_PAGE);
+               count_mthp_stat(order, MTHP_STAT_DEFERRED_SPLIT_PAGE);
                list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
                ds_queue->split_queue_len++;
 #ifdef CONFIG_MEMCG

thanks,
bang
Bang Li April 24, 2024, 5:58 p.m. UTC | #3
Hey, sorry for making noise, there was something wrong with the format of
the last email.

On 2024/4/25 1:12, Bang Li wrote:
> Hey Lance,
>
> On 2024/4/24 21:51, Lance Yang wrote:
>
>> At present, the split counters in THP statistics no longer include
>> PTE-mapped mTHP. Therefore, this commit introduces per-order mTHP split
>> counters to monitor the frequency of mTHP splits. This will assist
>> developers in better analyzing and optimizing system performance.
>>
>> /sys/kernel/mm/transparent_hugepage/hugepages-<size>/stats
>>          split_page
>>          split_page_failed
>>          deferred_split_page
>>
>> Signed-off-by: Lance Yang <ioworker0@gmail.com>
>> ---
>>   include/linux/huge_mm.h |  3 +++
>>   mm/huge_memory.c        | 14 ++++++++++++--
>>   2 files changed, 15 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index 56c7ea73090b..7b9c6590e1f7 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -272,6 +272,9 @@ enum mthp_stat_item {
>>       MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
>>       MTHP_STAT_ANON_SWPOUT,
>>       MTHP_STAT_ANON_SWPOUT_FALLBACK,
>> +    MTHP_STAT_SPLIT_PAGE,
>> +    MTHP_STAT_SPLIT_PAGE_FAILED,
>> +    MTHP_STAT_DEFERRED_SPLIT_PAGE,
>>       __MTHP_STAT_COUNT
>>   };
>>   diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 055df5aac7c3..52db888e47a6 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -557,6 +557,9 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, 
>> MTHP_STAT_ANON_FAULT_FALLBACK);
>>   DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, 
>> MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>>   DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT);
>>   DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback, 
>> MTHP_STAT_ANON_SWPOUT_FALLBACK);
>> +DEFINE_MTHP_STAT_ATTR(split_page, MTHP_STAT_SPLIT_PAGE);
>> +DEFINE_MTHP_STAT_ATTR(split_page_failed, MTHP_STAT_SPLIT_PAGE_FAILED);
>> +DEFINE_MTHP_STAT_ATTR(deferred_split_page, 
>> MTHP_STAT_DEFERRED_SPLIT_PAGE);
>>     static struct attribute *stats_attrs[] = {
>>       &anon_fault_alloc_attr.attr,
>> @@ -564,6 +567,9 @@ static struct attribute *stats_attrs[] = {
>>       &anon_fault_fallback_charge_attr.attr,
>>       &anon_swpout_attr.attr,
>>       &anon_swpout_fallback_attr.attr,
>> +    &split_page_attr.attr,
>> +    &split_page_failed_attr.attr,
>> +    &deferred_split_page_attr.attr,
>>       NULL,
>>   };
>>   @@ -3083,7 +3089,7 @@ int split_huge_page_to_list_to_order(struct 
>> page *page, struct list_head *list,
>>       XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, 
>> new_order);
>>       struct anon_vma *anon_vma = NULL;
>>       struct address_space *mapping = NULL;
>> -    bool is_thp = folio_test_pmd_mappable(folio);
>> +    int order = folio_order(folio);
>>       int extra_pins, ret;
>>       pgoff_t end;
>>       bool is_hzp;
>> @@ -3262,8 +3268,10 @@ int split_huge_page_to_list_to_order(struct 
>> page *page, struct list_head *list,
>>           i_mmap_unlock_read(mapping);
>>   out:
>>       xas_destroy(&xas);
>> -    if (is_thp)
>> +    if (order >= HPAGE_PMD_ORDER)
>>           count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
>> +    count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT_PAGE :
>> +                      MTHP_STAT_SPLIT_PAGE_FAILED);
>>       return ret;
>>   }
>>   @@ -3327,6 +3335,8 @@ void deferred_split_folio(struct folio *folio)
>>       if (list_empty(&folio->_deferred_list)) {
>>           if (folio_test_pmd_mappable(folio))
>>               count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>> +        count_mthp_stat(folio_order(folio),
>> +                MTHP_STAT_DEFERRED_SPLIT_PAGE);
>>           list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
>>           ds_queue->split_queue_len++;
>>   #ifdef CONFIG_MEMCG
>
> My opinion can be ignored :). Would it be better to modify the
> deferred_split_folio function as follows? I'm not sure.
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 055df5aac7c3..e8562e8630b1 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3299,12 +3299,13 @@ void deferred_split_folio(struct folio *folio)
>         struct mem_cgroup *memcg = folio_memcg(folio);
>  #endif
>         unsigned long flags;
> +       int order = folio_order(folio);
>
>         /*
>          * Order 1 folios have no space for a deferred list, but we also
>          * won't waste much memory by not adding them to the deferred list.
>          */
> -       if (folio_order(folio) <= 1)
> +       if (order <= 1)
>                 return;
>
>         /*
> @@ -3325,8 +3326,9 @@ void deferred_split_folio(struct folio *folio)
>
>         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>         if (list_empty(&folio->_deferred_list)) {
> -               if (folio_test_pmd_mappable(folio))
> +               if (order >= HPAGE_PMD_ORDER)
>                         count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> +               count_mthp_stat(order, MTHP_STAT_DEFERRED_SPLIT_PAGE);
>                 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
>                 ds_queue->split_queue_len++;
>  #ifdef CONFIG_MEMCG
>
> thanks,
> bang
>

My opinion can be ignored :). Would it be better to modify the 
deferred_split_folio
function as follows? I'm not sure.

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 055df5aac7c3..e8562e8630b1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3299,12 +3299,13 @@ void deferred_split_folio(struct folio *folio)
         struct mem_cgroup *memcg = folio_memcg(folio);
  #endif
         unsigned long flags;
+       int order = folio_order(folio);

         /*
          * Order 1 folios have no space for a deferred list, but we also
          * won't waste much memory by not adding them to the deferred list.
          */
-       if (folio_order(folio) <= 1)
+       if (order <= 1)
                 return;

         /*
@@ -3325,8 +3326,9 @@ void deferred_split_folio(struct folio *folio)

         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
         if (list_empty(&folio->_deferred_list)) {
-               if (folio_test_pmd_mappable(folio))
+               if (order >= HPAGE_PMD_ORDER)
                         count_vm_event(THP_DEFERRED_SPLIT_PAGE);
+               count_mthp_stat(order, MTHP_STAT_DEFERRED_SPLIT_PAGE);
                 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
                 ds_queue->split_queue_len++;
  #ifdef CONFIG_MEMCG

thanks,
bang
Yang Shi April 24, 2024, 7:44 p.m. UTC | #4
On Wed, Apr 24, 2024 at 6:53 AM Lance Yang <ioworker0@gmail.com> wrote:
>
> At present, the split counters in THP statistics no longer include
> PTE-mapped mTHP. Therefore, this commit introduces per-order mTHP split
> counters to monitor the frequency of mTHP splits. This will assist
> developers in better analyzing and optimizing system performance.
>
> /sys/kernel/mm/transparent_hugepage/hugepages-<size>/stats
>         split_page
>         split_page_failed
>         deferred_split_page

The deferred_split_page counter may easily go insane with the fix from
https://lore.kernel.org/linux-mm/20240411153232.169560-1-zi.yan@sent.com/

Zi Yan,

Will you submit v2 for this patch soon?


>
> Signed-off-by: Lance Yang <ioworker0@gmail.com>
> ---
>  include/linux/huge_mm.h |  3 +++
>  mm/huge_memory.c        | 14 ++++++++++++--
>  2 files changed, 15 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 56c7ea73090b..7b9c6590e1f7 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -272,6 +272,9 @@ enum mthp_stat_item {
>         MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
>         MTHP_STAT_ANON_SWPOUT,
>         MTHP_STAT_ANON_SWPOUT_FALLBACK,
> +       MTHP_STAT_SPLIT_PAGE,
> +       MTHP_STAT_SPLIT_PAGE_FAILED,
> +       MTHP_STAT_DEFERRED_SPLIT_PAGE,
>         __MTHP_STAT_COUNT
>  };
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 055df5aac7c3..52db888e47a6 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -557,6 +557,9 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
>  DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>  DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT);
>  DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback, MTHP_STAT_ANON_SWPOUT_FALLBACK);
> +DEFINE_MTHP_STAT_ATTR(split_page, MTHP_STAT_SPLIT_PAGE);
> +DEFINE_MTHP_STAT_ATTR(split_page_failed, MTHP_STAT_SPLIT_PAGE_FAILED);
> +DEFINE_MTHP_STAT_ATTR(deferred_split_page, MTHP_STAT_DEFERRED_SPLIT_PAGE);
>
>  static struct attribute *stats_attrs[] = {
>         &anon_fault_alloc_attr.attr,
> @@ -564,6 +567,9 @@ static struct attribute *stats_attrs[] = {
>         &anon_fault_fallback_charge_attr.attr,
>         &anon_swpout_attr.attr,
>         &anon_swpout_fallback_attr.attr,
> +       &split_page_attr.attr,
> +       &split_page_failed_attr.attr,
> +       &deferred_split_page_attr.attr,
>         NULL,
>  };
>
> @@ -3083,7 +3089,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>         XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
>         struct anon_vma *anon_vma = NULL;
>         struct address_space *mapping = NULL;
> -       bool is_thp = folio_test_pmd_mappable(folio);
> +       int order = folio_order(folio);
>         int extra_pins, ret;
>         pgoff_t end;
>         bool is_hzp;
> @@ -3262,8 +3268,10 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>                 i_mmap_unlock_read(mapping);
>  out:
>         xas_destroy(&xas);
> -       if (is_thp)
> +       if (order >= HPAGE_PMD_ORDER)
>                 count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
> +       count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT_PAGE :
> +                                     MTHP_STAT_SPLIT_PAGE_FAILED);
>         return ret;
>  }
>
> @@ -3327,6 +3335,8 @@ void deferred_split_folio(struct folio *folio)
>         if (list_empty(&folio->_deferred_list)) {
>                 if (folio_test_pmd_mappable(folio))
>                         count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> +               count_mthp_stat(folio_order(folio),
> +                               MTHP_STAT_DEFERRED_SPLIT_PAGE);
>                 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
>                 ds_queue->split_queue_len++;
>  #ifdef CONFIG_MEMCG
> --
> 2.33.1
>
>
Lance Yang April 25, 2024, 4:47 a.m. UTC | #5
Hey Bang,

Thanks for taking time to review!

On Thu, Apr 25, 2024 at 1:59 AM Bang Li <libang.linux@gmail.com> wrote:
>
> Hey, sorry for making noise, there was something wrong with the format of
> the last email.
>
> On 2024/4/25 1:12, Bang Li wrote:
> > Hey Lance,
> >
> > On 2024/4/24 21:51, Lance Yang wrote:
> >
> >> At present, the split counters in THP statistics no longer include
> >> PTE-mapped mTHP. Therefore, this commit introduces per-order mTHP split
> >> counters to monitor the frequency of mTHP splits. This will assist
> >> developers in better analyzing and optimizing system performance.
> >>
> >> /sys/kernel/mm/transparent_hugepage/hugepages-<size>/stats
> >>          split_page
> >>          split_page_failed
> >>          deferred_split_page
> >>
> >> Signed-off-by: Lance Yang <ioworker0@gmail.com>
> >> ---
> >>   include/linux/huge_mm.h |  3 +++
> >>   mm/huge_memory.c        | 14 ++++++++++++--
> >>   2 files changed, 15 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >> index 56c7ea73090b..7b9c6590e1f7 100644
> >> --- a/include/linux/huge_mm.h
> >> +++ b/include/linux/huge_mm.h
> >> @@ -272,6 +272,9 @@ enum mthp_stat_item {
> >>       MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
> >>       MTHP_STAT_ANON_SWPOUT,
> >>       MTHP_STAT_ANON_SWPOUT_FALLBACK,
> >> +    MTHP_STAT_SPLIT_PAGE,
> >> +    MTHP_STAT_SPLIT_PAGE_FAILED,
> >> +    MTHP_STAT_DEFERRED_SPLIT_PAGE,
> >>       __MTHP_STAT_COUNT
> >>   };
> >>   diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >> index 055df5aac7c3..52db888e47a6 100644
> >> --- a/mm/huge_memory.c
> >> +++ b/mm/huge_memory.c
> >> @@ -557,6 +557,9 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback,
> >> MTHP_STAT_ANON_FAULT_FALLBACK);
> >>   DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge,
> >> MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> >>   DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT);
> >>   DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback,
> >> MTHP_STAT_ANON_SWPOUT_FALLBACK);
> >> +DEFINE_MTHP_STAT_ATTR(split_page, MTHP_STAT_SPLIT_PAGE);
> >> +DEFINE_MTHP_STAT_ATTR(split_page_failed, MTHP_STAT_SPLIT_PAGE_FAILED);
> >> +DEFINE_MTHP_STAT_ATTR(deferred_split_page,
> >> MTHP_STAT_DEFERRED_SPLIT_PAGE);
> >>     static struct attribute *stats_attrs[] = {
> >>       &anon_fault_alloc_attr.attr,
> >> @@ -564,6 +567,9 @@ static struct attribute *stats_attrs[] = {
> >>       &anon_fault_fallback_charge_attr.attr,
> >>       &anon_swpout_attr.attr,
> >>       &anon_swpout_fallback_attr.attr,
> >> +    &split_page_attr.attr,
> >> +    &split_page_failed_attr.attr,
> >> +    &deferred_split_page_attr.attr,
> >>       NULL,
> >>   };
> >>   @@ -3083,7 +3089,7 @@ int split_huge_page_to_list_to_order(struct
> >> page *page, struct list_head *list,
> >>       XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index,
> >> new_order);
> >>       struct anon_vma *anon_vma = NULL;
> >>       struct address_space *mapping = NULL;
> >> -    bool is_thp = folio_test_pmd_mappable(folio);
> >> +    int order = folio_order(folio);
> >>       int extra_pins, ret;
> >>       pgoff_t end;
> >>       bool is_hzp;
> >> @@ -3262,8 +3268,10 @@ int split_huge_page_to_list_to_order(struct
> >> page *page, struct list_head *list,
> >>           i_mmap_unlock_read(mapping);
> >>   out:
> >>       xas_destroy(&xas);
> >> -    if (is_thp)
> >> +    if (order >= HPAGE_PMD_ORDER)
> >>           count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
> >> +    count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT_PAGE :
> >> +                      MTHP_STAT_SPLIT_PAGE_FAILED);
> >>       return ret;
> >>   }
> >>   @@ -3327,6 +3335,8 @@ void deferred_split_folio(struct folio *folio)
> >>       if (list_empty(&folio->_deferred_list)) {
> >>           if (folio_test_pmd_mappable(folio))
> >>               count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> >> +        count_mthp_stat(folio_order(folio),
> >> +                MTHP_STAT_DEFERRED_SPLIT_PAGE);
> >>           list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
> >>           ds_queue->split_queue_len++;
> >>   #ifdef CONFIG_MEMCG
> >
> > My opinion can be ignored :). Would it be better to modify the
> > deferred_split_folio function as follows? I'm not sure.
> >
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 055df5aac7c3..e8562e8630b1 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -3299,12 +3299,13 @@ void deferred_split_folio(struct folio *folio)
> >         struct mem_cgroup *memcg = folio_memcg(folio);
> >  #endif
> >         unsigned long flags;
> > +       int order = folio_order(folio);
> >
> >         /*
> >          * Order 1 folios have no space for a deferred list, but we also
> >          * won't waste much memory by not adding them to the deferred list.
> >          */
> > -       if (folio_order(folio) <= 1)
> > +       if (order <= 1)
> >                 return;
> >
> >         /*
> > @@ -3325,8 +3326,9 @@ void deferred_split_folio(struct folio *folio)
> >
> >         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> >         if (list_empty(&folio->_deferred_list)) {
> > -               if (folio_test_pmd_mappable(folio))
> > +               if (order >= HPAGE_PMD_ORDER)
> >                         count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> > +               count_mthp_stat(order, MTHP_STAT_DEFERRED_SPLIT_PAGE);
> >                 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
> >                 ds_queue->split_queue_len++;
> >  #ifdef CONFIG_MEMCG
> >
> > thanks,
> > bang
> >
>
> My opinion can be ignored :). Would it be better to modify the
> deferred_split_folio
> function as follows? I'm not sure.
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 055df5aac7c3..e8562e8630b1 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3299,12 +3299,13 @@ void deferred_split_folio(struct folio *folio)
>          struct mem_cgroup *memcg = folio_memcg(folio);
>   #endif
>          unsigned long flags;
> +       int order = folio_order(folio);

I'll consider storing it in a variable earlier for later reuse.

Thanks,
Lance

>
>          /*
>           * Order 1 folios have no space for a deferred list, but we also
>           * won't waste much memory by not adding them to the deferred list.
>           */
> -       if (folio_order(folio) <= 1)
> +       if (order <= 1)
>                  return;
>
>          /*
> @@ -3325,8 +3326,9 @@ void deferred_split_folio(struct folio *folio)
>
>          spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>          if (list_empty(&folio->_deferred_list)) {
> -               if (folio_test_pmd_mappable(folio))
> +               if (order >= HPAGE_PMD_ORDER)
>                          count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> +               count_mthp_stat(order, MTHP_STAT_DEFERRED_SPLIT_PAGE);
>                  list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
>                  ds_queue->split_queue_len++;
>   #ifdef CONFIG_MEMCG
>
> thanks,
> bang
Lance Yang April 25, 2024, 5:13 a.m. UTC | #6
Hey Yang,

Thanks for taking time to review!

On Thu, Apr 25, 2024 at 3:44 AM Yang Shi <shy828301@gmail.com> wrote:
>
> On Wed, Apr 24, 2024 at 6:53 AM Lance Yang <ioworker0@gmail.com> wrote:
> >
> > At present, the split counters in THP statistics no longer include
> > PTE-mapped mTHP. Therefore, this commit introduces per-order mTHP split
> > counters to monitor the frequency of mTHP splits. This will assist
> > developers in better analyzing and optimizing system performance.
> >
> > /sys/kernel/mm/transparent_hugepage/hugepages-<size>/stats
> >         split_page
> >         split_page_failed
> >         deferred_split_page
>
> The deferred_split_page counter may easily go insane with the fix from
> https://lore.kernel.org/linux-mm/20240411153232.169560-1-zi.yan@sent.com/
>
> Zi Yan,
>
> Will you submit v2 for this patch soon?

Thanks for bringing this to my attention!
I'll follow Zi Yan's patch, then submit the next version.

Thanks,
Lance

>
>
> >
> > Signed-off-by: Lance Yang <ioworker0@gmail.com>
> > ---
> >  include/linux/huge_mm.h |  3 +++
> >  mm/huge_memory.c        | 14 ++++++++++++--
> >  2 files changed, 15 insertions(+), 2 deletions(-)
> >
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index 56c7ea73090b..7b9c6590e1f7 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -272,6 +272,9 @@ enum mthp_stat_item {
> >         MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
> >         MTHP_STAT_ANON_SWPOUT,
> >         MTHP_STAT_ANON_SWPOUT_FALLBACK,
> > +       MTHP_STAT_SPLIT_PAGE,
> > +       MTHP_STAT_SPLIT_PAGE_FAILED,
> > +       MTHP_STAT_DEFERRED_SPLIT_PAGE,
> >         __MTHP_STAT_COUNT
> >  };
> >
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 055df5aac7c3..52db888e47a6 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -557,6 +557,9 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
> >  DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> >  DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT);
> >  DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback, MTHP_STAT_ANON_SWPOUT_FALLBACK);
> > +DEFINE_MTHP_STAT_ATTR(split_page, MTHP_STAT_SPLIT_PAGE);
> > +DEFINE_MTHP_STAT_ATTR(split_page_failed, MTHP_STAT_SPLIT_PAGE_FAILED);
> > +DEFINE_MTHP_STAT_ATTR(deferred_split_page, MTHP_STAT_DEFERRED_SPLIT_PAGE);
> >
> >  static struct attribute *stats_attrs[] = {
> >         &anon_fault_alloc_attr.attr,
> > @@ -564,6 +567,9 @@ static struct attribute *stats_attrs[] = {
> >         &anon_fault_fallback_charge_attr.attr,
> >         &anon_swpout_attr.attr,
> >         &anon_swpout_fallback_attr.attr,
> > +       &split_page_attr.attr,
> > +       &split_page_failed_attr.attr,
> > +       &deferred_split_page_attr.attr,
> >         NULL,
> >  };
> >
> > @@ -3083,7 +3089,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> >         XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
> >         struct anon_vma *anon_vma = NULL;
> >         struct address_space *mapping = NULL;
> > -       bool is_thp = folio_test_pmd_mappable(folio);
> > +       int order = folio_order(folio);
> >         int extra_pins, ret;
> >         pgoff_t end;
> >         bool is_hzp;
> > @@ -3262,8 +3268,10 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> >                 i_mmap_unlock_read(mapping);
> >  out:
> >         xas_destroy(&xas);
> > -       if (is_thp)
> > +       if (order >= HPAGE_PMD_ORDER)
> >                 count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
> > +       count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT_PAGE :
> > +                                     MTHP_STAT_SPLIT_PAGE_FAILED);
> >         return ret;
> >  }
> >
> > @@ -3327,6 +3335,8 @@ void deferred_split_folio(struct folio *folio)
> >         if (list_empty(&folio->_deferred_list)) {
> >                 if (folio_test_pmd_mappable(folio))
> >                         count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> > +               count_mthp_stat(folio_order(folio),
> > +                               MTHP_STAT_DEFERRED_SPLIT_PAGE);
> >                 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
> >                 ds_queue->split_queue_len++;
> >  #ifdef CONFIG_MEMCG
> > --
> > 2.33.1
> >
> >
Barry Song June 30, 2024, 9:48 a.m. UTC | #7
On Thu, Apr 25, 2024 at 3:41 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> + Barry
>
> On 24/04/2024 14:51, Lance Yang wrote:
> > At present, the split counters in THP statistics no longer include
> > PTE-mapped mTHP. Therefore, this commit introduces per-order mTHP split
> > counters to monitor the frequency of mTHP splits. This will assist
> > developers in better analyzing and optimizing system performance.
> >
> > /sys/kernel/mm/transparent_hugepage/hugepages-<size>/stats
> >         split_page
> >         split_page_failed
> >         deferred_split_page
> >
> > Signed-off-by: Lance Yang <ioworker0@gmail.com>
> > ---
> >  include/linux/huge_mm.h |  3 +++
> >  mm/huge_memory.c        | 14 ++++++++++++--
> >  2 files changed, 15 insertions(+), 2 deletions(-)
> >
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index 56c7ea73090b..7b9c6590e1f7 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -272,6 +272,9 @@ enum mthp_stat_item {
> >       MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
> >       MTHP_STAT_ANON_SWPOUT,
> >       MTHP_STAT_ANON_SWPOUT_FALLBACK,
> > +     MTHP_STAT_SPLIT_PAGE,
> > +     MTHP_STAT_SPLIT_PAGE_FAILED,
> > +     MTHP_STAT_DEFERRED_SPLIT_PAGE,
> >       __MTHP_STAT_COUNT
> >  };
> >
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 055df5aac7c3..52db888e47a6 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -557,6 +557,9 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
> >  DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> >  DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT);
> >  DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback, MTHP_STAT_ANON_SWPOUT_FALLBACK);
> > +DEFINE_MTHP_STAT_ATTR(split_page, MTHP_STAT_SPLIT_PAGE);
> > +DEFINE_MTHP_STAT_ATTR(split_page_failed, MTHP_STAT_SPLIT_PAGE_FAILED);
> > +DEFINE_MTHP_STAT_ATTR(deferred_split_page, MTHP_STAT_DEFERRED_SPLIT_PAGE);
> >
> >  static struct attribute *stats_attrs[] = {
> >       &anon_fault_alloc_attr.attr,
> > @@ -564,6 +567,9 @@ static struct attribute *stats_attrs[] = {
> >       &anon_fault_fallback_charge_attr.attr,
> >       &anon_swpout_attr.attr,
> >       &anon_swpout_fallback_attr.attr,
> > +     &split_page_attr.attr,
> > +     &split_page_failed_attr.attr,
> > +     &deferred_split_page_attr.attr,
> >       NULL,
> >  };
> >
> > @@ -3083,7 +3089,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> >       XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
> >       struct anon_vma *anon_vma = NULL;
> >       struct address_space *mapping = NULL;
> > -     bool is_thp = folio_test_pmd_mappable(folio);
> > +     int order = folio_order(folio);
> >       int extra_pins, ret;
> >       pgoff_t end;
> >       bool is_hzp;
> > @@ -3262,8 +3268,10 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> >               i_mmap_unlock_read(mapping);
> >  out:
> >       xas_destroy(&xas);
> > -     if (is_thp)
> > +     if (order >= HPAGE_PMD_ORDER)
> >               count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
> > +     count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT_PAGE :
> > +                                   MTHP_STAT_SPLIT_PAGE_FAILED);
> >       return ret;
> >  }
> >
> > @@ -3327,6 +3335,8 @@ void deferred_split_folio(struct folio *folio)
> >       if (list_empty(&folio->_deferred_list)) {
> >               if (folio_test_pmd_mappable(folio))
> >                       count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> > +             count_mthp_stat(folio_order(folio),
> > +                             MTHP_STAT_DEFERRED_SPLIT_PAGE);
>
> There is a very long conversation with Barry about adding a 'global "mTHP became
> partially mapped 1 or more processes" counter (inc only)', which terminates at
> [1]. There is a lot of discussion about the required semantics around the need
> for partial map to cover alignment and contiguity as well as whether all pages
> are mapped, and to trigger once it becomes partial in at least 1 process.
>
> MTHP_STAT_DEFERRED_SPLIT_PAGE is giving much simpler semantics, but less
> information as a result. Barry, what's your view here? I'm guessing this doesn't
> quite solve what you are looking for?

This doesn't quite solve what I am looking for but I still think the
patch has its value.

I'm looking for a solution that can:

  *  Count the amount of memory in the system for each mTHP size.
  *  Determine how much memory for each mTHP size is partially unmapped.

For example, in a system with 16GB of memory, we might find that we have 3GB
of 64KB mTHP, and within that, 512MB is partially unmapped, potentially wasting
memory at this moment.  I'm uncertain whether Lance is interested in
this job :-)
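
To put rough numbers on that example (the sizes are purely illustrative, not
measurements from a real system), a tiny standalone calculation:

#include <stdio.h>

int main(void)
{
        /* Illustrative sizes from the example above, not measured values. */
        unsigned long folio_size = 64UL << 10;  /* one 64KB mTHP folio      */
        unsigned long total      = 3UL << 30;   /* 3GB mapped as 64KB mTHP  */
        unsigned long partial    = 512UL << 20; /* 512MB partially unmapped */

        printf("64KB folios in use:        %lu\n", total / folio_size);
        printf("partially unmapped folios: %lu\n", partial / folio_size);
        printf("share of the 64KB pool that is partial: %.1f%% (waste is at most this)\n",
               100.0 * partial / total);
        return 0;
}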

Counting deferred_split remains valuable as it can signal whether the system is
experiencing significant partial unmapping.

>
> [1] https://lore.kernel.org/linux-mm/6cc7d781-884f-4d8f-a175-8609732b87eb@arm.com/
>
> Thanks,
> Ryan
>
> >               list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
> >               ds_queue->split_queue_len++;
> >  #ifdef CONFIG_MEMCG
>

Thanks
Barry
Lance Yang June 30, 2024, 11:34 a.m. UTC | #8
Hi Barry,

Thanks for following up!

On Sun, Jun 30, 2024 at 5:48 PM Barry Song <baohua@kernel.org> wrote:
>
> On Thu, Apr 25, 2024 at 3:41 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >
> > + Barry
> >
> > On 24/04/2024 14:51, Lance Yang wrote:
> > > At present, the split counters in THP statistics no longer include
> > > PTE-mapped mTHP. Therefore, this commit introduces per-order mTHP split
> > > counters to monitor the frequency of mTHP splits. This will assist
> > > developers in better analyzing and optimizing system performance.
> > >
> > > /sys/kernel/mm/transparent_hugepage/hugepages-<size>/stats
> > >         split_page
> > >         split_page_failed
> > >         deferred_split_page
> > >
> > > Signed-off-by: Lance Yang <ioworker0@gmail.com>
> > > ---
> > >  include/linux/huge_mm.h |  3 +++
> > >  mm/huge_memory.c        | 14 ++++++++++++--
> > >  2 files changed, 15 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > > index 56c7ea73090b..7b9c6590e1f7 100644
> > > --- a/include/linux/huge_mm.h
> > > +++ b/include/linux/huge_mm.h
> > > @@ -272,6 +272,9 @@ enum mthp_stat_item {
> > >       MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
> > >       MTHP_STAT_ANON_SWPOUT,
> > >       MTHP_STAT_ANON_SWPOUT_FALLBACK,
> > > +     MTHP_STAT_SPLIT_PAGE,
> > > +     MTHP_STAT_SPLIT_PAGE_FAILED,
> > > +     MTHP_STAT_DEFERRED_SPLIT_PAGE,
> > >       __MTHP_STAT_COUNT
> > >  };
> > >
> > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > > index 055df5aac7c3..52db888e47a6 100644
> > > --- a/mm/huge_memory.c
> > > +++ b/mm/huge_memory.c
> > > @@ -557,6 +557,9 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
> > >  DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> > >  DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT);
> > >  DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback, MTHP_STAT_ANON_SWPOUT_FALLBACK);
> > > +DEFINE_MTHP_STAT_ATTR(split_page, MTHP_STAT_SPLIT_PAGE);
> > > +DEFINE_MTHP_STAT_ATTR(split_page_failed, MTHP_STAT_SPLIT_PAGE_FAILED);
> > > +DEFINE_MTHP_STAT_ATTR(deferred_split_page, MTHP_STAT_DEFERRED_SPLIT_PAGE);
> > >
> > >  static struct attribute *stats_attrs[] = {
> > >       &anon_fault_alloc_attr.attr,
> > > @@ -564,6 +567,9 @@ static struct attribute *stats_attrs[] = {
> > >       &anon_fault_fallback_charge_attr.attr,
> > >       &anon_swpout_attr.attr,
> > >       &anon_swpout_fallback_attr.attr,
> > > +     &split_page_attr.attr,
> > > +     &split_page_failed_attr.attr,
> > > +     &deferred_split_page_attr.attr,
> > >       NULL,
> > >  };
> > >
> > > @@ -3083,7 +3089,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> > >       XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
> > >       struct anon_vma *anon_vma = NULL;
> > >       struct address_space *mapping = NULL;
> > > -     bool is_thp = folio_test_pmd_mappable(folio);
> > > +     int order = folio_order(folio);
> > >       int extra_pins, ret;
> > >       pgoff_t end;
> > >       bool is_hzp;
> > > @@ -3262,8 +3268,10 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> > >               i_mmap_unlock_read(mapping);
> > >  out:
> > >       xas_destroy(&xas);
> > > -     if (is_thp)
> > > +     if (order >= HPAGE_PMD_ORDER)
> > >               count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
> > > +     count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT_PAGE :
> > > +                                   MTHP_STAT_SPLIT_PAGE_FAILED);
> > >       return ret;
> > >  }
> > >
> > > @@ -3327,6 +3335,8 @@ void deferred_split_folio(struct folio *folio)
> > >       if (list_empty(&folio->_deferred_list)) {
> > >               if (folio_test_pmd_mappable(folio))
> > >                       count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> > > +             count_mthp_stat(folio_order(folio),
> > > +                             MTHP_STAT_DEFERRED_SPLIT_PAGE);
> >
> > There is a very long conversation with Barry about adding a 'global "mTHP became
> > partially mapped 1 or more processes" counter (inc only)', which terminates at
> > [1]. There is a lot of discussion about the required semantics around the need
> > for partial map to cover alignment and contiguity as well as whether all pages
> > are mapped, and to trigger once it becomes partial in at least 1 process.
> >
> > MTHP_STAT_DEFERRED_SPLIT_PAGE is giving much simpler semantics, but less
> > information as a result. Barry, what's your view here? I'm guessing this doesn't
> > quite solve what you are looking for?
>
> This doesn't quite solve what I am looking for but I still think the
> patch has its value.
>
> I'm looking for a solution that can:
>
>   *  Count the amount of memory in the system for each mTHP size.
>   *  Determine how much memory for each mTHP size is partially unmapped.
>
> For example, in a system with 16GB of memory, we might find that we have 3GB
> of 64KB mTHP, and within that, 512MB is partially unmapped, potentially wasting
> memory at this moment.  I'm uncertain whether Lance is interested in
> this job :-)

Nice, that's an interesting/valuable job for me ;)

Let's do it separately, as 'split' and friends probably can’t be the solution
you mentioned above, IMHO.

Hmm... I don't have a good idea about the solution for now, but will think it
over and come back to discuss it here.

>
> Counting deferred_split remains valuable as it can signal whether the system is
> experiencing significant partial unmapping.

Have a nice weekend!
Lance

>
> >
> > [1] https://lore.kernel.org/linux-mm/6cc7d781-884f-4d8f-a175-8609732b87eb@arm.com/
> >
> > Thanks,
> > Ryan
> >
> > >               list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
> > >               ds_queue->split_queue_len++;
> > >  #ifdef CONFIG_MEMCG
> >
>
> Thanks
> Barry
Ryan Roberts July 1, 2024, 8:16 a.m. UTC | #9
On 30/06/2024 12:34, Lance Yang wrote:
> Hi Barry,
> 
> Thanks for following up!
> 
> On Sun, Jun 30, 2024 at 5:48 PM Barry Song <baohua@kernel.org> wrote:
>>
>> On Thu, Apr 25, 2024 at 3:41 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>
>>> + Barry
>>>
>>> On 24/04/2024 14:51, Lance Yang wrote:
>>>> At present, the split counters in THP statistics no longer include
>>>> PTE-mapped mTHP. Therefore, this commit introduces per-order mTHP split
>>>> counters to monitor the frequency of mTHP splits. This will assist
>>>> developers in better analyzing and optimizing system performance.
>>>>
>>>> /sys/kernel/mm/transparent_hugepage/hugepages-<size>/stats
>>>>         split_page
>>>>         split_page_failed
>>>>         deferred_split_page
>>>>
>>>> Signed-off-by: Lance Yang <ioworker0@gmail.com>
>>>> ---
>>>>  include/linux/huge_mm.h |  3 +++
>>>>  mm/huge_memory.c        | 14 ++++++++++++--
>>>>  2 files changed, 15 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>> index 56c7ea73090b..7b9c6590e1f7 100644
>>>> --- a/include/linux/huge_mm.h
>>>> +++ b/include/linux/huge_mm.h
>>>> @@ -272,6 +272,9 @@ enum mthp_stat_item {
>>>>       MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
>>>>       MTHP_STAT_ANON_SWPOUT,
>>>>       MTHP_STAT_ANON_SWPOUT_FALLBACK,
>>>> +     MTHP_STAT_SPLIT_PAGE,
>>>> +     MTHP_STAT_SPLIT_PAGE_FAILED,
>>>> +     MTHP_STAT_DEFERRED_SPLIT_PAGE,
>>>>       __MTHP_STAT_COUNT
>>>>  };
>>>>
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index 055df5aac7c3..52db888e47a6 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -557,6 +557,9 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
>>>>  DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>>>>  DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT);
>>>>  DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback, MTHP_STAT_ANON_SWPOUT_FALLBACK);
>>>> +DEFINE_MTHP_STAT_ATTR(split_page, MTHP_STAT_SPLIT_PAGE);
>>>> +DEFINE_MTHP_STAT_ATTR(split_page_failed, MTHP_STAT_SPLIT_PAGE_FAILED);
>>>> +DEFINE_MTHP_STAT_ATTR(deferred_split_page, MTHP_STAT_DEFERRED_SPLIT_PAGE);
>>>>
>>>>  static struct attribute *stats_attrs[] = {
>>>>       &anon_fault_alloc_attr.attr,
>>>> @@ -564,6 +567,9 @@ static struct attribute *stats_attrs[] = {
>>>>       &anon_fault_fallback_charge_attr.attr,
>>>>       &anon_swpout_attr.attr,
>>>>       &anon_swpout_fallback_attr.attr,
>>>> +     &split_page_attr.attr,
>>>> +     &split_page_failed_attr.attr,
>>>> +     &deferred_split_page_attr.attr,
>>>>       NULL,
>>>>  };
>>>>
>>>> @@ -3083,7 +3089,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>>>       XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
>>>>       struct anon_vma *anon_vma = NULL;
>>>>       struct address_space *mapping = NULL;
>>>> -     bool is_thp = folio_test_pmd_mappable(folio);
>>>> +     int order = folio_order(folio);
>>>>       int extra_pins, ret;
>>>>       pgoff_t end;
>>>>       bool is_hzp;
>>>> @@ -3262,8 +3268,10 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>>>               i_mmap_unlock_read(mapping);
>>>>  out:
>>>>       xas_destroy(&xas);
>>>> -     if (is_thp)
>>>> +     if (order >= HPAGE_PMD_ORDER)
>>>>               count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
>>>> +     count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT_PAGE :
>>>> +                                   MTHP_STAT_SPLIT_PAGE_FAILED);
>>>>       return ret;
>>>>  }
>>>>
>>>> @@ -3327,6 +3335,8 @@ void deferred_split_folio(struct folio *folio)
>>>>       if (list_empty(&folio->_deferred_list)) {
>>>>               if (folio_test_pmd_mappable(folio))
>>>>                       count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>>>> +             count_mthp_stat(folio_order(folio),
>>>> +                             MTHP_STAT_DEFERRED_SPLIT_PAGE);
>>>
>>> There is a very long conversation with Barry about adding a 'global "mTHP became
>>> partially mapped 1 or more processes" counter (inc only)', which terminates at
>>> [1]. There is a lot of discussion about the required semantics around the need
>>> for partial map to cover alignment and contiguity as well as whether all pages
>>> are mapped, and to trigger once it becomes partial in at least 1 process.
>>>
>>> MTHP_STAT_DEFERRED_SPLIT_PAGE is giving much simpler semantics, but less
>>> information as a result. Barry, what's your view here? I'm guessing this doesn't
>>> quite solve what you are looking for?
>>
>> This doesn't quite solve what I am looking for but I still think the
>> patch has its value.
>>
>> I'm looking for a solution that can:
>>
>>   *  Count the amount of memory in the system for each mTHP size.
>>   *  Determine how much memory for each mTHP size is partially unmapped.
>>
>> For example, in a system with 16GB of memory, we might find that we have 3GB
>> of 64KB mTHP, and within that, 512MB is partially unmapped, potentially wasting
>> memory at this moment.  I'm uncertain whether Lance is interested in
>> this job :-)
> 
> Nice, that's an interesting/valuable job for me ;)
> 
> Let's do it separately, as 'split' and friends probably can’t be the
> solution you
> mentioned above, IMHO.
> 
> Hmm... I don't have a good idea about the solution for now, but will
> think it over
> and come back to discuss it here.

I have a grad starting in a couple of weeks and I had been planning to initially
ask him to look at this to help him get up to speed on mTHP/mm stuff. But I have
plenty of other things for him to do if Lance wants to take this :)

> 
>>
>> Counting deferred_split remains valuable as it can signal whether the system is
>> experiencing significant partial unmapping.
> 
> Have a nice weekend!
> Lance
> 
>>
>>>
>>> [1] https://lore.kernel.org/linux-mm/6cc7d781-884f-4d8f-a175-8609732b87eb@arm.com/
>>>
>>> Thanks,
>>> Ryan
>>>
>>>>               list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
>>>>               ds_queue->split_queue_len++;
>>>>  #ifdef CONFIG_MEMCG
>>>
>>
>> Thanks
>> Barry
David Hildenbrand July 1, 2024, 8:56 a.m. UTC | #10
On 30.06.24 11:48, Barry Song wrote:
> On Thu, Apr 25, 2024 at 3:41 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>
>> + Barry
>>
>> On 24/04/2024 14:51, Lance Yang wrote:
>>> At present, the split counters in THP statistics no longer include
>>> PTE-mapped mTHP. Therefore, this commit introduces per-order mTHP split
>>> counters to monitor the frequency of mTHP splits. This will assist
>>> developers in better analyzing and optimizing system performance.
>>>
>>> /sys/kernel/mm/transparent_hugepage/hugepages-<size>/stats
>>>          split_page
>>>          split_page_failed
>>>          deferred_split_page
>>>
>>> Signed-off-by: Lance Yang <ioworker0@gmail.com>
>>> ---
>>>   include/linux/huge_mm.h |  3 +++
>>>   mm/huge_memory.c        | 14 ++++++++++++--
>>>   2 files changed, 15 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>> index 56c7ea73090b..7b9c6590e1f7 100644
>>> --- a/include/linux/huge_mm.h
>>> +++ b/include/linux/huge_mm.h
>>> @@ -272,6 +272,9 @@ enum mthp_stat_item {
>>>        MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
>>>        MTHP_STAT_ANON_SWPOUT,
>>>        MTHP_STAT_ANON_SWPOUT_FALLBACK,
>>> +     MTHP_STAT_SPLIT_PAGE,
>>> +     MTHP_STAT_SPLIT_PAGE_FAILED,
>>> +     MTHP_STAT_DEFERRED_SPLIT_PAGE,
>>>        __MTHP_STAT_COUNT
>>>   };
>>>
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index 055df5aac7c3..52db888e47a6 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -557,6 +557,9 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
>>>   DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>>>   DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT);
>>>   DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback, MTHP_STAT_ANON_SWPOUT_FALLBACK);
>>> +DEFINE_MTHP_STAT_ATTR(split_page, MTHP_STAT_SPLIT_PAGE);
>>> +DEFINE_MTHP_STAT_ATTR(split_page_failed, MTHP_STAT_SPLIT_PAGE_FAILED);
>>> +DEFINE_MTHP_STAT_ATTR(deferred_split_page, MTHP_STAT_DEFERRED_SPLIT_PAGE);
>>>
>>>   static struct attribute *stats_attrs[] = {
>>>        &anon_fault_alloc_attr.attr,
>>> @@ -564,6 +567,9 @@ static struct attribute *stats_attrs[] = {
>>>        &anon_fault_fallback_charge_attr.attr,
>>>        &anon_swpout_attr.attr,
>>>        &anon_swpout_fallback_attr.attr,
>>> +     &split_page_attr.attr,
>>> +     &split_page_failed_attr.attr,
>>> +     &deferred_split_page_attr.attr,
>>>        NULL,
>>>   };
>>>
>>> @@ -3083,7 +3089,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>>        XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
>>>        struct anon_vma *anon_vma = NULL;
>>>        struct address_space *mapping = NULL;
>>> -     bool is_thp = folio_test_pmd_mappable(folio);
>>> +     int order = folio_order(folio);
>>>        int extra_pins, ret;
>>>        pgoff_t end;
>>>        bool is_hzp;
>>> @@ -3262,8 +3268,10 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>>                i_mmap_unlock_read(mapping);
>>>   out:
>>>        xas_destroy(&xas);
>>> -     if (is_thp)
>>> +     if (order >= HPAGE_PMD_ORDER)
>>>                count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
>>> +     count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT_PAGE :
>>> +                                   MTHP_STAT_SPLIT_PAGE_FAILED);
>>>        return ret;
>>>   }
>>>
>>> @@ -3327,6 +3335,8 @@ void deferred_split_folio(struct folio *folio)
>>>        if (list_empty(&folio->_deferred_list)) {
>>>                if (folio_test_pmd_mappable(folio))
>>>                        count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>>> +             count_mthp_stat(folio_order(folio),
>>> +                             MTHP_STAT_DEFERRED_SPLIT_PAGE);
>>
>> There is a very long conversation with Barry about adding a 'global "mTHP became
>> partially mapped 1 or more processes" counter (inc only)', which terminates at
>> [1]. There is a lot of discussion about the required semantics around the need
>> for partial map to cover alignment and contiguity as well as whether all pages
>> are mapped, and to trigger once it becomes partial in at least 1 process.
>>
>> MTHP_STAT_DEFERRED_SPLIT_PAGE is giving much simpler semantics, but less
>> information as a result. Barry, what's your view here? I'm guessing this doesn't
>> quite solve what you are looking for?
> 
> This doesn't quite solve what I am looking for but I still think the
> patch has its value.
> 
> I'm looking for a solution that can:
> 
>    *  Count the amount of memory in the system for each mTHP size.
>    *  Determine how much memory for each mTHP size is partially unmapped.
> 
> For example, in a system with 16GB of memory, we might find that we have 3GB
> of 64KB mTHP, and within that, 512MB is partially unmapped, potentially wasting
> memory at this moment.  I'm uncertain whether Lance is interested in
> this job :-)
> 
> Counting deferred_split remains valuable as it can signal whether the system is
> experiencing significant partial unmapping.

I'll note that, especially without subpage mapcounts, in the future we 
won't have that information (how much is currently mapped) readily 
available in all cases. To obtain that information on demand, we'd have 
to scan page tables or walk the rmap.

Something to keep in mind: we don't want to introduce counters that will 
be expensive to maintain longterm.
Lance Yang July 1, 2024, 11 a.m. UTC | #11
On Mon, Jul 1, 2024 at 4:16 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> On 30/06/2024 12:34, Lance Yang wrote:
> > Hi Barry,
> >
> > Thanks for following up!
> >
> > On Sun, Jun 30, 2024 at 5:48 PM Barry Song <baohua@kernel.org> wrote:
> >>
> >> On Thu, Apr 25, 2024 at 3:41 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>>
> >>> + Barry
> >>>
> >>> On 24/04/2024 14:51, Lance Yang wrote:
> >>>> At present, the split counters in THP statistics no longer include
> >>>> PTE-mapped mTHP. Therefore, this commit introduces per-order mTHP split
> >>>> counters to monitor the frequency of mTHP splits. This will assist
> >>>> developers in better analyzing and optimizing system performance.
> >>>>
> >>>> /sys/kernel/mm/transparent_hugepage/hugepages-<size>/stats
> >>>>         split_page
> >>>>         split_page_failed
> >>>>         deferred_split_page
> >>>>
> >>>> Signed-off-by: Lance Yang <ioworker0@gmail.com>
> >>>> ---
> >>>>  include/linux/huge_mm.h |  3 +++
> >>>>  mm/huge_memory.c        | 14 ++++++++++++--
> >>>>  2 files changed, 15 insertions(+), 2 deletions(-)
> >>>>
> >>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >>>> index 56c7ea73090b..7b9c6590e1f7 100644
> >>>> --- a/include/linux/huge_mm.h
> >>>> +++ b/include/linux/huge_mm.h
> >>>> @@ -272,6 +272,9 @@ enum mthp_stat_item {
> >>>>       MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
> >>>>       MTHP_STAT_ANON_SWPOUT,
> >>>>       MTHP_STAT_ANON_SWPOUT_FALLBACK,
> >>>> +     MTHP_STAT_SPLIT_PAGE,
> >>>> +     MTHP_STAT_SPLIT_PAGE_FAILED,
> >>>> +     MTHP_STAT_DEFERRED_SPLIT_PAGE,
> >>>>       __MTHP_STAT_COUNT
> >>>>  };
> >>>>
> >>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >>>> index 055df5aac7c3..52db888e47a6 100644
> >>>> --- a/mm/huge_memory.c
> >>>> +++ b/mm/huge_memory.c
> >>>> @@ -557,6 +557,9 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
> >>>>  DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> >>>>  DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT);
> >>>>  DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback, MTHP_STAT_ANON_SWPOUT_FALLBACK);
> >>>> +DEFINE_MTHP_STAT_ATTR(split_page, MTHP_STAT_SPLIT_PAGE);
> >>>> +DEFINE_MTHP_STAT_ATTR(split_page_failed, MTHP_STAT_SPLIT_PAGE_FAILED);
> >>>> +DEFINE_MTHP_STAT_ATTR(deferred_split_page, MTHP_STAT_DEFERRED_SPLIT_PAGE);
> >>>>
> >>>>  static struct attribute *stats_attrs[] = {
> >>>>       &anon_fault_alloc_attr.attr,
> >>>> @@ -564,6 +567,9 @@ static struct attribute *stats_attrs[] = {
> >>>>       &anon_fault_fallback_charge_attr.attr,
> >>>>       &anon_swpout_attr.attr,
> >>>>       &anon_swpout_fallback_attr.attr,
> >>>> +     &split_page_attr.attr,
> >>>> +     &split_page_failed_attr.attr,
> >>>> +     &deferred_split_page_attr.attr,
> >>>>       NULL,
> >>>>  };
> >>>>
> >>>> @@ -3083,7 +3089,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> >>>>       XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
> >>>>       struct anon_vma *anon_vma = NULL;
> >>>>       struct address_space *mapping = NULL;
> >>>> -     bool is_thp = folio_test_pmd_mappable(folio);
> >>>> +     int order = folio_order(folio);
> >>>>       int extra_pins, ret;
> >>>>       pgoff_t end;
> >>>>       bool is_hzp;
> >>>> @@ -3262,8 +3268,10 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> >>>>               i_mmap_unlock_read(mapping);
> >>>>  out:
> >>>>       xas_destroy(&xas);
> >>>> -     if (is_thp)
> >>>> +     if (order >= HPAGE_PMD_ORDER)
> >>>>               count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
> >>>> +     count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT_PAGE :
> >>>> +                                   MTHP_STAT_SPLIT_PAGE_FAILED);
> >>>>       return ret;
> >>>>  }
> >>>>
> >>>> @@ -3327,6 +3335,8 @@ void deferred_split_folio(struct folio *folio)
> >>>>       if (list_empty(&folio->_deferred_list)) {
> >>>>               if (folio_test_pmd_mappable(folio))
> >>>>                       count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> >>>> +             count_mthp_stat(folio_order(folio),
> >>>> +                             MTHP_STAT_DEFERRED_SPLIT_PAGE);
> >>>
> >>> There is a very long conversation with Barry about adding a 'global "mTHP became
> >>> partially mapped 1 or more processes" counter (inc only)', which terminates at
> >>> [1]. There is a lot of discussion about the required semantics around the need
> >>> for partial map to cover alignment and contiguity as well as whether all pages
> >>> are mapped, and to trigger once it becomes partial in at least 1 process.
> >>>
> >>> MTHP_STAT_DEFERRED_SPLIT_PAGE is giving much simpler semantics, but less
> >>> information as a result. Barry, what's your view here? I'm guessing this doesn't
> >>> quite solve what you are looking for?
> >>
> >> This doesn't quite solve what I am looking for but I still think the
> >> patch has its value.
> >>
> >> I'm looking for a solution that can:
> >>
> >>   *  Count the amount of memory in the system for each mTHP size.
> >>   *  Determine how much memory for each mTHP size is partially unmapped.
> >>
> >> For example, in a system with 16GB of memory, we might find that we have 3GB
> >> of 64KB mTHP, and within that, 512MB is partially unmapped, potentially wasting
> >> memory at this moment.  I'm uncertain whether Lance is interested in
> >> this job :-)
> >
> > Nice, that's an interesting/valuable job for me ;)
> >
> > Let's do it separately, as 'split' and friends probably can’t be the
> > solution you
> > mentioned above, IMHO.
> >
> > Hmm... I don't have a good idea about the solution for now, but will
> > think it over
> > and come back to discuss it here.
>
> I have a grad starting in a couple of weeks and I had been planning to initially
> ask him to look at this to help him get up to speed on mTHP/mm stuff. But I have
> plenty of other things for him to do if Lance wants to take this :)

I'm very happy to do that, but it doesn't have to be just me - anyone with
a better idea can take it on ;)

Thanks,
Lance

>
> >
> >>
> >> Counting deferred_split remains valuable as it can signal whether the system is
> >> experiencing significant partial unmapping.
> >
> > Have a nice weekend!
> > Lance
> >
> >>
> >>>
> >>> [1] https://lore.kernel.org/linux-mm/6cc7d781-884f-4d8f-a175-8609732b87eb@arm.com/
> >>>
> >>> Thanks,
> >>> Ryan
> >>>
> >>>>               list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
> >>>>               ds_queue->split_queue_len++;
> >>>>  #ifdef CONFIG_MEMCG
> >>>
> >>
> >> Thanks
> >> Barry
>
Lance Yang July 1, 2024, 11:06 a.m. UTC | #12
Hi David,

On Mon, Jul 1, 2024 at 4:56 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 30.06.24 11:48, Barry Song wrote:
> > On Thu, Apr 25, 2024 at 3:41 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>
> >> + Barry
> >>
> >> On 24/04/2024 14:51, Lance Yang wrote:
> >>> At present, the split counters in THP statistics no longer include
> >>> PTE-mapped mTHP. Therefore, this commit introduces per-order mTHP split
> >>> counters to monitor the frequency of mTHP splits. This will assist
> >>> developers in better analyzing and optimizing system performance.
> >>>
> >>> /sys/kernel/mm/transparent_hugepage/hugepages-<size>/stats
> >>>          split_page
> >>>          split_page_failed
> >>>          deferred_split_page
> >>>
> >>> Signed-off-by: Lance Yang <ioworker0@gmail.com>
> >>> ---
> >>>   include/linux/huge_mm.h |  3 +++
> >>>   mm/huge_memory.c        | 14 ++++++++++++--
> >>>   2 files changed, 15 insertions(+), 2 deletions(-)
> >>>
> >>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >>> index 56c7ea73090b..7b9c6590e1f7 100644
> >>> --- a/include/linux/huge_mm.h
> >>> +++ b/include/linux/huge_mm.h
> >>> @@ -272,6 +272,9 @@ enum mthp_stat_item {
> >>>        MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
> >>>        MTHP_STAT_ANON_SWPOUT,
> >>>        MTHP_STAT_ANON_SWPOUT_FALLBACK,
> >>> +     MTHP_STAT_SPLIT_PAGE,
> >>> +     MTHP_STAT_SPLIT_PAGE_FAILED,
> >>> +     MTHP_STAT_DEFERRED_SPLIT_PAGE,
> >>>        __MTHP_STAT_COUNT
> >>>   };
> >>>
> >>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >>> index 055df5aac7c3..52db888e47a6 100644
> >>> --- a/mm/huge_memory.c
> >>> +++ b/mm/huge_memory.c
> >>> @@ -557,6 +557,9 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
> >>>   DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> >>>   DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT);
> >>>   DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback, MTHP_STAT_ANON_SWPOUT_FALLBACK);
> >>> +DEFINE_MTHP_STAT_ATTR(split_page, MTHP_STAT_SPLIT_PAGE);
> >>> +DEFINE_MTHP_STAT_ATTR(split_page_failed, MTHP_STAT_SPLIT_PAGE_FAILED);
> >>> +DEFINE_MTHP_STAT_ATTR(deferred_split_page, MTHP_STAT_DEFERRED_SPLIT_PAGE);
> >>>
> >>>   static struct attribute *stats_attrs[] = {
> >>>        &anon_fault_alloc_attr.attr,
> >>> @@ -564,6 +567,9 @@ static struct attribute *stats_attrs[] = {
> >>>        &anon_fault_fallback_charge_attr.attr,
> >>>        &anon_swpout_attr.attr,
> >>>        &anon_swpout_fallback_attr.attr,
> >>> +     &split_page_attr.attr,
> >>> +     &split_page_failed_attr.attr,
> >>> +     &deferred_split_page_attr.attr,
> >>>        NULL,
> >>>   };
> >>>
> >>> @@ -3083,7 +3089,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> >>>        XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
> >>>        struct anon_vma *anon_vma = NULL;
> >>>        struct address_space *mapping = NULL;
> >>> -     bool is_thp = folio_test_pmd_mappable(folio);
> >>> +     int order = folio_order(folio);
> >>>        int extra_pins, ret;
> >>>        pgoff_t end;
> >>>        bool is_hzp;
> >>> @@ -3262,8 +3268,10 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> >>>                i_mmap_unlock_read(mapping);
> >>>   out:
> >>>        xas_destroy(&xas);
> >>> -     if (is_thp)
> >>> +     if (order >= HPAGE_PMD_ORDER)
> >>>                count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
> >>> +     count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT_PAGE :
> >>> +                                   MTHP_STAT_SPLIT_PAGE_FAILED);
> >>>        return ret;
> >>>   }
> >>>
> >>> @@ -3327,6 +3335,8 @@ void deferred_split_folio(struct folio *folio)
> >>>        if (list_empty(&folio->_deferred_list)) {
> >>>                if (folio_test_pmd_mappable(folio))
> >>>                        count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> >>> +             count_mthp_stat(folio_order(folio),
> >>> +                             MTHP_STAT_DEFERRED_SPLIT_PAGE);
> >>
> >> There is a very long conversation with Barry about adding a 'global "mTHP became
> >> partially mapped 1 or more processes" counter (inc only)', which terminates at
> >> [1]. There is a lot of discussion about the required semantics around the need
> >> for partial map to cover alignment and contiguity as well as whether all pages
> >> are mapped, and to trigger once it becomes partial in at least 1 process.
> >>
> >> MTHP_STAT_DEFERRED_SPLIT_PAGE is giving much simpler semantics, but less
> >> information as a result. Barry, what's your view here? I'm guessing this doesn't
> >> quite solve what you are looking for?
> >
> > This doesn't quite solve what I am looking for but I still think the
> > patch has its value.
> >
> > I'm looking for a solution that can:
> >
> >    *  Count the amount of memory in the system for each mTHP size.
> >    *  Determine how much memory for each mTHP size is partially unmapped.
> >
> > For example, in a system with 16GB of memory, we might find that we have 3GB
> > of 64KB mTHP, and within that, 512MB is partially unmapped, potentially wasting
> > memory at this moment.  I'm uncertain whether Lance is interested in
> > this job :-)
> >
> > Counting deferred_split remains valuable as it can signal whether the system is
> > experiencing significant partial unmapping.
>
> I'll note that, especially without subpage mapcounts, in the future we
> won't have that information (how much is currently mapped) readily
> available in all cases. To obtain that information on demand, we'd have
> to scan page tables or walk the rmap.

Thanks for pointing that out!

>
> Something to keep in mind: we don't want to introduce counters that will
> be expensive to maintain longterm.

I'll keep that in mind as we move forward with any new implementations.

Thanks,
Lance

>
> --
> Cheers,
>
> David / dhildenb
>
Barry Song July 1, 2024, 11:43 a.m. UTC | #13
On Mon, Jul 1, 2024 at 8:56 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 30.06.24 11:48, Barry Song wrote:
> > On Thu, Apr 25, 2024 at 3:41 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>
> >> + Barry
> >>
> >> On 24/04/2024 14:51, Lance Yang wrote:
> >>> At present, the split counters in THP statistics no longer include
> >>> PTE-mapped mTHP. Therefore, this commit introduces per-order mTHP split
> >>> counters to monitor the frequency of mTHP splits. This will assist
> >>> developers in better analyzing and optimizing system performance.
> >>>
> >>> /sys/kernel/mm/transparent_hugepage/hugepages-<size>/stats
> >>>          split_page
> >>>          split_page_failed
> >>>          deferred_split_page
> >>>
> >>> Signed-off-by: Lance Yang <ioworker0@gmail.com>
> >>> ---
> >>>   include/linux/huge_mm.h |  3 +++
> >>>   mm/huge_memory.c        | 14 ++++++++++++--
> >>>   2 files changed, 15 insertions(+), 2 deletions(-)
> >>>
> >>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >>> index 56c7ea73090b..7b9c6590e1f7 100644
> >>> --- a/include/linux/huge_mm.h
> >>> +++ b/include/linux/huge_mm.h
> >>> @@ -272,6 +272,9 @@ enum mthp_stat_item {
> >>>        MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
> >>>        MTHP_STAT_ANON_SWPOUT,
> >>>        MTHP_STAT_ANON_SWPOUT_FALLBACK,
> >>> +     MTHP_STAT_SPLIT_PAGE,
> >>> +     MTHP_STAT_SPLIT_PAGE_FAILED,
> >>> +     MTHP_STAT_DEFERRED_SPLIT_PAGE,
> >>>        __MTHP_STAT_COUNT
> >>>   };
> >>>
> >>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >>> index 055df5aac7c3..52db888e47a6 100644
> >>> --- a/mm/huge_memory.c
> >>> +++ b/mm/huge_memory.c
> >>> @@ -557,6 +557,9 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
> >>>   DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> >>>   DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT);
> >>>   DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback, MTHP_STAT_ANON_SWPOUT_FALLBACK);
> >>> +DEFINE_MTHP_STAT_ATTR(split_page, MTHP_STAT_SPLIT_PAGE);
> >>> +DEFINE_MTHP_STAT_ATTR(split_page_failed, MTHP_STAT_SPLIT_PAGE_FAILED);
> >>> +DEFINE_MTHP_STAT_ATTR(deferred_split_page, MTHP_STAT_DEFERRED_SPLIT_PAGE);
> >>>
> >>>   static struct attribute *stats_attrs[] = {
> >>>        &anon_fault_alloc_attr.attr,
> >>> @@ -564,6 +567,9 @@ static struct attribute *stats_attrs[] = {
> >>>        &anon_fault_fallback_charge_attr.attr,
> >>>        &anon_swpout_attr.attr,
> >>>        &anon_swpout_fallback_attr.attr,
> >>> +     &split_page_attr.attr,
> >>> +     &split_page_failed_attr.attr,
> >>> +     &deferred_split_page_attr.attr,
> >>>        NULL,
> >>>   };
> >>>
> >>> @@ -3083,7 +3089,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> >>>        XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
> >>>        struct anon_vma *anon_vma = NULL;
> >>>        struct address_space *mapping = NULL;
> >>> -     bool is_thp = folio_test_pmd_mappable(folio);
> >>> +     int order = folio_order(folio);
> >>>        int extra_pins, ret;
> >>>        pgoff_t end;
> >>>        bool is_hzp;
> >>> @@ -3262,8 +3268,10 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> >>>                i_mmap_unlock_read(mapping);
> >>>   out:
> >>>        xas_destroy(&xas);
> >>> -     if (is_thp)
> >>> +     if (order >= HPAGE_PMD_ORDER)
> >>>                count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
> >>> +     count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT_PAGE :
> >>> +                                   MTHP_STAT_SPLIT_PAGE_FAILED);
> >>>        return ret;
> >>>   }
> >>>
> >>> @@ -3327,6 +3335,8 @@ void deferred_split_folio(struct folio *folio)
> >>>        if (list_empty(&folio->_deferred_list)) {
> >>>                if (folio_test_pmd_mappable(folio))
> >>>                        count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> >>> +             count_mthp_stat(folio_order(folio),
> >>> +                             MTHP_STAT_DEFERRED_SPLIT_PAGE);
> >>
> >> There is a very long conversation with Barry about adding a 'global "mTHP became
> >> partially mapped 1 or more processes" counter (inc only)', which terminates at
> >> [1]. There is a lot of discussion about the required semantics around the need
> >> for partial map to cover alignment and contiguity as well as whether all pages
> >> are mapped, and to trigger once it becomes partial in at least 1 process.
> >>
> >> MTHP_STAT_DEFERRED_SPLIT_PAGE is giving much simpler semantics, but less
> >> information as a result. Barry, what's your view here? I'm guessing this doesn't
> >> quite solve what you are looking for?
> >
> > This doesn't quite solve what I am looking for but I still think the
> > patch has its value.
> >
> > I'm looking for a solution that can:
> >
> >    *  Count the amount of memory in the system for each mTHP size.
> >    *  Determine how much memory for each mTHP size is partially unmapped.
> >
> > For example, in a system with 16GB of memory, we might find that we have 3GB
> > of 64KB mTHP, and within that, 512MB is partially unmapped, potentially wasting
> > memory at this moment.  I'm uncertain whether Lance is interested in
> > this job :-)
> >
> > Counting deferred_split remains valuable as it can signal whether the system is
> > experiencing significant partial unmapping.
>
> I'll note that, especially without subpage mapcounts, in the future we
> won't have that information (how much is currently mapped) readily
> available in all cases. To obtain that information on demand, we'd have
> to scan page tables or walk the rmap.

I'd like to keep things simple. We can ignore the details about how the
folio is partially unmapped. For example, whether 15 out of 16 subpages are
unmapped or just 1 is unmapped doesn't matter. When we add a folio to the
deferred_list, we increase the count by 1. When we remove a folio from the
deferred_list (for any reason, such as a real split), we decrease the count
by 1.

If we find that (partial_unmap mTHP * 100) / all mTHP for this size shows a
large number, it is a strong indication that userspace needs some tuning.
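
To make the arithmetic concrete, here is a toy userspace model of that
bookkeeping; the names below are made up, and the real hooks would sit in
deferred_split_folio() and wherever folios leave the deferred list:

/*
 * Toy model of the inc/dec scheme described above - not kernel code.
 */
#include <stdio.h>

#define NR_ORDERS	10

static long nr_mthp[NR_ORDERS];		/* live mTHPs of each order */
static long nr_deferred[NR_ORDERS];	/* currently on the deferred list */

static void mthp_alloc(int order)		{ nr_mthp[order]++; }
static void mthp_free(int order)		{ nr_mthp[order]--; }
static void deferred_list_add(int order)	{ nr_deferred[order]++; }
static void deferred_list_del(int order)	{ nr_deferred[order]--; }

int main(void)
{
	long i;

	/* 3GB of 64KB (order-4) mTHP, of which 512MB is partially unmapped */
	for (i = 0; i < 3L * 1024 * 1024 / 64; i++)
		mthp_alloc(4);
	for (i = 0; i < 512L * 1024 / 64; i++)
		deferred_list_add(4);

	/* a real split removes the folio from both counts */
	for (i = 0; i < 1024; i++) {
		deferred_list_del(4);
		mthp_free(4);
	}

	printf("order-4: %ld%% partially unmapped\n",
	       nr_deferred[4] * 100 / nr_mthp[4]);
	return 0;
}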

>
> Something to keep in mind: we don't want to introduce counters that will
> be expensive to maintain longterm.
>
> --
> Cheers,
>
> David / dhildenb
>

Thanks
Barry
David Hildenbrand July 1, 2024, 12:21 p.m. UTC | #14
On 01.07.24 13:43, Barry Song wrote:
> On Mon, Jul 1, 2024 at 8:56 PM David Hildenbrand <david@redhat.com> wrote:
>>
>> On 30.06.24 11:48, Barry Song wrote:
>>> On Thu, Apr 25, 2024 at 3:41 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>>
>>>> + Barry
>>>>
>>>> On 24/04/2024 14:51, Lance Yang wrote:
>>>>> At present, the split counters in THP statistics no longer include
>>>>> PTE-mapped mTHP. Therefore, this commit introduces per-order mTHP split
>>>>> counters to monitor the frequency of mTHP splits. This will assist
>>>>> developers in better analyzing and optimizing system performance.
>>>>>
>>>>> /sys/kernel/mm/transparent_hugepage/hugepages-<size>/stats
>>>>>           split_page
>>>>>           split_page_failed
>>>>>           deferred_split_page
>>>>>
>>>>> Signed-off-by: Lance Yang <ioworker0@gmail.com>
>>>>> ---
>>>>>    include/linux/huge_mm.h |  3 +++
>>>>>    mm/huge_memory.c        | 14 ++++++++++++--
>>>>>    2 files changed, 15 insertions(+), 2 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>>> index 56c7ea73090b..7b9c6590e1f7 100644
>>>>> --- a/include/linux/huge_mm.h
>>>>> +++ b/include/linux/huge_mm.h
>>>>> @@ -272,6 +272,9 @@ enum mthp_stat_item {
>>>>>         MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
>>>>>         MTHP_STAT_ANON_SWPOUT,
>>>>>         MTHP_STAT_ANON_SWPOUT_FALLBACK,
>>>>> +     MTHP_STAT_SPLIT_PAGE,
>>>>> +     MTHP_STAT_SPLIT_PAGE_FAILED,
>>>>> +     MTHP_STAT_DEFERRED_SPLIT_PAGE,
>>>>>         __MTHP_STAT_COUNT
>>>>>    };
>>>>>
>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>>> index 055df5aac7c3..52db888e47a6 100644
>>>>> --- a/mm/huge_memory.c
>>>>> +++ b/mm/huge_memory.c
>>>>> @@ -557,6 +557,9 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
>>>>>    DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>>>>>    DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT);
>>>>>    DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback, MTHP_STAT_ANON_SWPOUT_FALLBACK);
>>>>> +DEFINE_MTHP_STAT_ATTR(split_page, MTHP_STAT_SPLIT_PAGE);
>>>>> +DEFINE_MTHP_STAT_ATTR(split_page_failed, MTHP_STAT_SPLIT_PAGE_FAILED);
>>>>> +DEFINE_MTHP_STAT_ATTR(deferred_split_page, MTHP_STAT_DEFERRED_SPLIT_PAGE);
>>>>>
>>>>>    static struct attribute *stats_attrs[] = {
>>>>>         &anon_fault_alloc_attr.attr,
>>>>> @@ -564,6 +567,9 @@ static struct attribute *stats_attrs[] = {
>>>>>         &anon_fault_fallback_charge_attr.attr,
>>>>>         &anon_swpout_attr.attr,
>>>>>         &anon_swpout_fallback_attr.attr,
>>>>> +     &split_page_attr.attr,
>>>>> +     &split_page_failed_attr.attr,
>>>>> +     &deferred_split_page_attr.attr,
>>>>>         NULL,
>>>>>    };
>>>>>
>>>>> @@ -3083,7 +3089,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>>>>         XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
>>>>>         struct anon_vma *anon_vma = NULL;
>>>>>         struct address_space *mapping = NULL;
>>>>> -     bool is_thp = folio_test_pmd_mappable(folio);
>>>>> +     int order = folio_order(folio);
>>>>>         int extra_pins, ret;
>>>>>         pgoff_t end;
>>>>>         bool is_hzp;
>>>>> @@ -3262,8 +3268,10 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>>>>                 i_mmap_unlock_read(mapping);
>>>>>    out:
>>>>>         xas_destroy(&xas);
>>>>> -     if (is_thp)
>>>>> +     if (order >= HPAGE_PMD_ORDER)
>>>>>                 count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
>>>>> +     count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT_PAGE :
>>>>> +                                   MTHP_STAT_SPLIT_PAGE_FAILED);
>>>>>         return ret;
>>>>>    }
>>>>>
>>>>> @@ -3327,6 +3335,8 @@ void deferred_split_folio(struct folio *folio)
>>>>>         if (list_empty(&folio->_deferred_list)) {
>>>>>                 if (folio_test_pmd_mappable(folio))
>>>>>                         count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>>>>> +             count_mthp_stat(folio_order(folio),
>>>>> +                             MTHP_STAT_DEFERRED_SPLIT_PAGE);
>>>>
>>>> There is a very long conversation with Barry about adding a 'global "mTHP became
>>>> partially mapped 1 or more processes" counter (inc only)', which terminates at
>>>> [1]. There is a lot of discussion about the required semantics around the need
>>>> for partial map to cover alignment and contiguity as well as whether all pages
>>>> are mapped, and to trigger once it becomes partial in at least 1 process.
>>>>
>>>> MTHP_STAT_DEFERRED_SPLIT_PAGE is giving much simpler semantics, but less
>>>> information as a result. Barry, what's your view here? I'm guessing this doesn't
>>>> quite solve what you are looking for?
>>>
>>> This doesn't quite solve what I am looking for but I still think the
>>> patch has its value.
>>>
>>> I'm looking for a solution that can:
>>>
>>>     *  Count the amount of memory in the system for each mTHP size.
>>>     *  Determine how much memory for each mTHP size is partially unmapped.
>>>
>>> For example, in a system with 16GB of memory, we might find that we have 3GB
>>> of 64KB mTHP, and within that, 512MB is partially unmapped, potentially wasting
>>> memory at this moment.  I'm uncertain whether Lance is interested in
>>> this job :-)
>>>
>>> Counting deferred_split remains valuable as it can signal whether the system is
>>> experiencing significant partial unmapping.
>>
>> I'll note that, especially without subpage mapcounts, in the future we
>> won't have that information (how much is currently mapped) readily
>> available in all cases. To obtain that information on demand, we'd have
>> to scan page tables or walk the rmap.
> 
> I'd like to keep things simple. We can ignore the details about how
> the folio is partially
> unmapped. For example, whether 15 out of 16 subpages are unmapped or just 1 is
> unmapped doesn't matter. When we add a folio to the deferred_list, we
> increase the
> count by 1. When we remove a folio from the deferred_list (for any
> reason, such as
> a real split), we decrease the count by 1.

Yes, that's valuable and not too complicated.
Barry Song Aug. 8, 2024, 9:27 p.m. UTC | #15
On Mon, Jul 1, 2024 at 8:16 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> On 30/06/2024 12:34, Lance Yang wrote:
> > Hi Barry,
> >
> > Thanks for following up!
> >
> > On Sun, Jun 30, 2024 at 5:48 PM Barry Song <baohua@kernel.org> wrote:
> >>
> >> On Thu, Apr 25, 2024 at 3:41 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>>
> >>> + Barry
> >>>
> >>> On 24/04/2024 14:51, Lance Yang wrote:
> >>>> At present, the split counters in THP statistics no longer include
> >>>> PTE-mapped mTHP. Therefore, this commit introduces per-order mTHP split
> >>>> counters to monitor the frequency of mTHP splits. This will assist
> >>>> developers in better analyzing and optimizing system performance.
> >>>>
> >>>> /sys/kernel/mm/transparent_hugepage/hugepages-<size>/stats
> >>>>         split_page
> >>>>         split_page_failed
> >>>>         deferred_split_page
> >>>>
> >>>> Signed-off-by: Lance Yang <ioworker0@gmail.com>
> >>>> ---
> >>>>  include/linux/huge_mm.h |  3 +++
> >>>>  mm/huge_memory.c        | 14 ++++++++++++--
> >>>>  2 files changed, 15 insertions(+), 2 deletions(-)
> >>>>
> >>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >>>> index 56c7ea73090b..7b9c6590e1f7 100644
> >>>> --- a/include/linux/huge_mm.h
> >>>> +++ b/include/linux/huge_mm.h
> >>>> @@ -272,6 +272,9 @@ enum mthp_stat_item {
> >>>>       MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
> >>>>       MTHP_STAT_ANON_SWPOUT,
> >>>>       MTHP_STAT_ANON_SWPOUT_FALLBACK,
> >>>> +     MTHP_STAT_SPLIT_PAGE,
> >>>> +     MTHP_STAT_SPLIT_PAGE_FAILED,
> >>>> +     MTHP_STAT_DEFERRED_SPLIT_PAGE,
> >>>>       __MTHP_STAT_COUNT
> >>>>  };
> >>>>
> >>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >>>> index 055df5aac7c3..52db888e47a6 100644
> >>>> --- a/mm/huge_memory.c
> >>>> +++ b/mm/huge_memory.c
> >>>> @@ -557,6 +557,9 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
> >>>>  DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> >>>>  DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT);
> >>>>  DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback, MTHP_STAT_ANON_SWPOUT_FALLBACK);
> >>>> +DEFINE_MTHP_STAT_ATTR(split_page, MTHP_STAT_SPLIT_PAGE);
> >>>> +DEFINE_MTHP_STAT_ATTR(split_page_failed, MTHP_STAT_SPLIT_PAGE_FAILED);
> >>>> +DEFINE_MTHP_STAT_ATTR(deferred_split_page, MTHP_STAT_DEFERRED_SPLIT_PAGE);
> >>>>
> >>>>  static struct attribute *stats_attrs[] = {
> >>>>       &anon_fault_alloc_attr.attr,
> >>>> @@ -564,6 +567,9 @@ static struct attribute *stats_attrs[] = {
> >>>>       &anon_fault_fallback_charge_attr.attr,
> >>>>       &anon_swpout_attr.attr,
> >>>>       &anon_swpout_fallback_attr.attr,
> >>>> +     &split_page_attr.attr,
> >>>> +     &split_page_failed_attr.attr,
> >>>> +     &deferred_split_page_attr.attr,
> >>>>       NULL,
> >>>>  };
> >>>>
> >>>> @@ -3083,7 +3089,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> >>>>       XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
> >>>>       struct anon_vma *anon_vma = NULL;
> >>>>       struct address_space *mapping = NULL;
> >>>> -     bool is_thp = folio_test_pmd_mappable(folio);
> >>>> +     int order = folio_order(folio);
> >>>>       int extra_pins, ret;
> >>>>       pgoff_t end;
> >>>>       bool is_hzp;
> >>>> @@ -3262,8 +3268,10 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> >>>>               i_mmap_unlock_read(mapping);
> >>>>  out:
> >>>>       xas_destroy(&xas);
> >>>> -     if (is_thp)
> >>>> +     if (order >= HPAGE_PMD_ORDER)
> >>>>               count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
> >>>> +     count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT_PAGE :
> >>>> +                                   MTHP_STAT_SPLIT_PAGE_FAILED);
> >>>>       return ret;
> >>>>  }
> >>>>
> >>>> @@ -3327,6 +3335,8 @@ void deferred_split_folio(struct folio *folio)
> >>>>       if (list_empty(&folio->_deferred_list)) {
> >>>>               if (folio_test_pmd_mappable(folio))
> >>>>                       count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> >>>> +             count_mthp_stat(folio_order(folio),
> >>>> +                             MTHP_STAT_DEFERRED_SPLIT_PAGE);
> >>>
> >>> There is a very long conversation with Barry about adding a 'global "mTHP became
> >>> partially mapped 1 or more processes" counter (inc only)', which terminates at
> >>> [1]. There is a lot of discussion about the required semantics around the need
> >>> for partial map to cover alignment and contiguity as well as whether all pages
> >>> are mapped, and to trigger once it becomes partial in at least 1 process.
> >>>
> >>> MTHP_STAT_DEFERRED_SPLIT_PAGE is giving much simpler semantics, but less
> >>> information as a result. Barry, what's your view here? I'm guessing this doesn't
> >>> quite solve what you are looking for?
> >>
> >> This doesn't quite solve what I am looking for but I still think the
> >> patch has its value.
> >>
> >> I'm looking for a solution that can:
> >>
> >>   *  Count the amount of memory in the system for each mTHP size.
> >>   *  Determine how much memory for each mTHP size is partially unmapped.
> >>
> >> For example, in a system with 16GB of memory, we might find that we have 3GB
> >> of 64KB mTHP, and within that, 512MB is partially unmapped, potentially wasting
> >> memory at this moment.  I'm uncertain whether Lance is interested in
> >> this job :-)
> >
> > Nice, that's an interesting/valuable job for me ;)
> >
> > Let's do it separately, as 'split' and friends probably can’t be the
> > solution you
> > mentioned above, IMHO.
> >
> > Hmm... I don't have a good idea about the solution for now, but will
> > think it over
> > and come back to discuss it here.
>
> I have a grad starting in a couple of weeks and I had been planning to initially
> ask him to look at this to help him get up to speed on mTHP/mm stuff. But I have
> plenty of other things for him to do if Lance wants to take this :)

Hi Ryan, Lance,

My performance profiling is blocked on the mTHP size and partially
unmapped mTHP size issues (understanding the distribution of folio
sizes within the system), so I'm not waiting for either Ryan's grad
or Lance. I've sent an RFC for this, and both of you are CC'd:

https://lore.kernel.org/all/20240808010457.228753-1-21cnbao@gmail.com/

Apologies for not waiting.  You are still warmly welcome to participate
in the discussion and review.

>
> >
> >>
> >> Counting deferred_split remains valuable as it can signal whether the system is
> >> experiencing significant partial unmapping.
> >
> > Have a nice weekend!
> > Lance
> >
> >>
> >>>
> >>> [1] https://lore.kernel.org/linux-mm/6cc7d781-884f-4d8f-a175-8609732b87eb@arm.com/
> >>>
> >>> Thanks,
> >>> Ryan
> >>>
> >>>>               list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
> >>>>               ds_queue->split_queue_len++;
> >>>>  #ifdef CONFIG_MEMCG
> >>>
> >>

Thanks
Barry
Ryan Roberts Aug. 9, 2024, 7:50 a.m. UTC | #16
On 08/08/2024 22:27, Barry Song wrote:
> On Mon, Jul 1, 2024 at 8:16 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>
>> On 30/06/2024 12:34, Lance Yang wrote:
>>> Hi Barry,
>>>
>>> Thanks for following up!
>>>
>>> On Sun, Jun 30, 2024 at 5:48 PM Barry Song <baohua@kernel.org> wrote:
>>>>
>>>> On Thu, Apr 25, 2024 at 3:41 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>>>
>>>>> + Barry
>>>>>
>>>>> On 24/04/2024 14:51, Lance Yang wrote:
>>>>>> At present, the split counters in THP statistics no longer include
>>>>>> PTE-mapped mTHP. Therefore, this commit introduces per-order mTHP split
>>>>>> counters to monitor the frequency of mTHP splits. This will assist
>>>>>> developers in better analyzing and optimizing system performance.
>>>>>>
>>>>>> /sys/kernel/mm/transparent_hugepage/hugepages-<size>/stats
>>>>>>         split_page
>>>>>>         split_page_failed
>>>>>>         deferred_split_page
>>>>>>
>>>>>> Signed-off-by: Lance Yang <ioworker0@gmail.com>
>>>>>> ---
>>>>>>  include/linux/huge_mm.h |  3 +++
>>>>>>  mm/huge_memory.c        | 14 ++++++++++++--
>>>>>>  2 files changed, 15 insertions(+), 2 deletions(-)
>>>>>>
>>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>>>> index 56c7ea73090b..7b9c6590e1f7 100644
>>>>>> --- a/include/linux/huge_mm.h
>>>>>> +++ b/include/linux/huge_mm.h
>>>>>> @@ -272,6 +272,9 @@ enum mthp_stat_item {
>>>>>>       MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
>>>>>>       MTHP_STAT_ANON_SWPOUT,
>>>>>>       MTHP_STAT_ANON_SWPOUT_FALLBACK,
>>>>>> +     MTHP_STAT_SPLIT_PAGE,
>>>>>> +     MTHP_STAT_SPLIT_PAGE_FAILED,
>>>>>> +     MTHP_STAT_DEFERRED_SPLIT_PAGE,
>>>>>>       __MTHP_STAT_COUNT
>>>>>>  };
>>>>>>
>>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>>>> index 055df5aac7c3..52db888e47a6 100644
>>>>>> --- a/mm/huge_memory.c
>>>>>> +++ b/mm/huge_memory.c
>>>>>> @@ -557,6 +557,9 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
>>>>>>  DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>>>>>>  DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT);
>>>>>>  DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback, MTHP_STAT_ANON_SWPOUT_FALLBACK);
>>>>>> +DEFINE_MTHP_STAT_ATTR(split_page, MTHP_STAT_SPLIT_PAGE);
>>>>>> +DEFINE_MTHP_STAT_ATTR(split_page_failed, MTHP_STAT_SPLIT_PAGE_FAILED);
>>>>>> +DEFINE_MTHP_STAT_ATTR(deferred_split_page, MTHP_STAT_DEFERRED_SPLIT_PAGE);
>>>>>>
>>>>>>  static struct attribute *stats_attrs[] = {
>>>>>>       &anon_fault_alloc_attr.attr,
>>>>>> @@ -564,6 +567,9 @@ static struct attribute *stats_attrs[] = {
>>>>>>       &anon_fault_fallback_charge_attr.attr,
>>>>>>       &anon_swpout_attr.attr,
>>>>>>       &anon_swpout_fallback_attr.attr,
>>>>>> +     &split_page_attr.attr,
>>>>>> +     &split_page_failed_attr.attr,
>>>>>> +     &deferred_split_page_attr.attr,
>>>>>>       NULL,
>>>>>>  };
>>>>>>
>>>>>> @@ -3083,7 +3089,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>>>>>       XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
>>>>>>       struct anon_vma *anon_vma = NULL;
>>>>>>       struct address_space *mapping = NULL;
>>>>>> -     bool is_thp = folio_test_pmd_mappable(folio);
>>>>>> +     int order = folio_order(folio);
>>>>>>       int extra_pins, ret;
>>>>>>       pgoff_t end;
>>>>>>       bool is_hzp;
>>>>>> @@ -3262,8 +3268,10 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>>>>>               i_mmap_unlock_read(mapping);
>>>>>>  out:
>>>>>>       xas_destroy(&xas);
>>>>>> -     if (is_thp)
>>>>>> +     if (order >= HPAGE_PMD_ORDER)
>>>>>>               count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
>>>>>> +     count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT_PAGE :
>>>>>> +                                   MTHP_STAT_SPLIT_PAGE_FAILED);
>>>>>>       return ret;
>>>>>>  }
>>>>>>
>>>>>> @@ -3327,6 +3335,8 @@ void deferred_split_folio(struct folio *folio)
>>>>>>       if (list_empty(&folio->_deferred_list)) {
>>>>>>               if (folio_test_pmd_mappable(folio))
>>>>>>                       count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>>>>>> +             count_mthp_stat(folio_order(folio),
>>>>>> +                             MTHP_STAT_DEFERRED_SPLIT_PAGE);
>>>>>
>>>>> There is a very long conversation with Barry about adding a 'global "mTHP became
>>>>> partially mapped 1 or more processes" counter (inc only)', which terminates at
>>>>> [1]. There is a lot of discussion about the required semantics around the need
>>>>> for partial map to cover alignment and contiguity as well as whether all pages
>>>>> are mapped, and to trigger once it becomes partial in at least 1 process.
>>>>>
>>>>> MTHP_STAT_DEFERRED_SPLIT_PAGE is giving much simpler semantics, but less
>>>>> information as a result. Barry, what's your view here? I'm guessing this doesn't
>>>>> quite solve what you are looking for?
>>>>
>>>> This doesn't quite solve what I am looking for but I still think the
>>>> patch has its value.
>>>>
>>>> I'm looking for a solution that can:
>>>>
>>>>   *  Count the amount of memory in the system for each mTHP size.
>>>>   *  Determine how much memory for each mTHP size is partially unmapped.
>>>>
>>>> For example, in a system with 16GB of memory, we might find that we have 3GB
>>>> of 64KB mTHP, and within that, 512MB is partially unmapped, potentially wasting
>>>> memory at this moment.  I'm uncertain whether Lance is interested in
>>>> this job :-)
>>>
>>> Nice, that's an interesting/valuable job for me ;)
>>>
>>> Let's do it separately, as 'split' and friends probably can’t be the
>>> solution you
>>> mentioned above, IMHO.
>>>
>>> Hmm... I don't have a good idea about the solution for now, but will
>>> think it over
>>> and come back to discuss it here.
>>
>> I have a grad starting in a couple of weeks and I had been planning to initially
>> ask him to look at this to help him get up to speed on mTHP/mm stuff. But I have
>> plenty of other things for him to do if Lance wants to take this :)
> 
> Hi Ryan, Lance,
> 
> My performance profiling is pending on the mTHP size and partially
> unmapped mTHP size issues (understanding the distribution of folio
> sizes within the system), so I'm not waiting for either Ryan's grad
> or Lance. I've sent an RFC for this, and both of you are CC'd:
> 
> https://lore.kernel.org/all/20240808010457.228753-1-21cnbao@gmail.com/

Yes I saw that, I'll try to give it some review today.

> 
> Apologies for not waiting.  You are still warmly welcomed to participate
> in the discussion and review.

No problem; after this last discussion, I assumed Lance was going to work on it
so there has been no duplicated effort from my side. Glad to see it implemented.

> 
>>
>>>
>>>>
>>>> Counting deferred_split remains valuable as it can signal whether the system is
>>>> experiencing significant partial unmapping.
>>>
>>> Have a nice weekend!
>>> Lance
>>>
>>>>
>>>>>
>>>>> [1] https://lore.kernel.org/linux-mm/6cc7d781-884f-4d8f-a175-8609732b87eb@arm.com/
>>>>>
>>>>> Thanks,
>>>>> Ryan
>>>>>
>>>>>>               list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
>>>>>>               ds_queue->split_queue_len++;
>>>>>>  #ifdef CONFIG_MEMCG
>>>>>
>>>>
> 
> Thanks
> Barry
diff mbox series

Patch

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 56c7ea73090b..7b9c6590e1f7 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -272,6 +272,9 @@  enum mthp_stat_item {
 	MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
 	MTHP_STAT_ANON_SWPOUT,
 	MTHP_STAT_ANON_SWPOUT_FALLBACK,
+	MTHP_STAT_SPLIT_PAGE,
+	MTHP_STAT_SPLIT_PAGE_FAILED,
+	MTHP_STAT_DEFERRED_SPLIT_PAGE,
 	__MTHP_STAT_COUNT
 };
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 055df5aac7c3..52db888e47a6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -557,6 +557,9 @@  DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
 DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT);
 DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback, MTHP_STAT_ANON_SWPOUT_FALLBACK);
+DEFINE_MTHP_STAT_ATTR(split_page, MTHP_STAT_SPLIT_PAGE);
+DEFINE_MTHP_STAT_ATTR(split_page_failed, MTHP_STAT_SPLIT_PAGE_FAILED);
+DEFINE_MTHP_STAT_ATTR(deferred_split_page, MTHP_STAT_DEFERRED_SPLIT_PAGE);
 
 static struct attribute *stats_attrs[] = {
 	&anon_fault_alloc_attr.attr,
@@ -564,6 +567,9 @@  static struct attribute *stats_attrs[] = {
 	&anon_fault_fallback_charge_attr.attr,
 	&anon_swpout_attr.attr,
 	&anon_swpout_fallback_attr.attr,
+	&split_page_attr.attr,
+	&split_page_failed_attr.attr,
+	&deferred_split_page_attr.attr,
 	NULL,
 };
 
@@ -3083,7 +3089,7 @@  int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 	XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
 	struct anon_vma *anon_vma = NULL;
 	struct address_space *mapping = NULL;
-	bool is_thp = folio_test_pmd_mappable(folio);
+	int order = folio_order(folio);
 	int extra_pins, ret;
 	pgoff_t end;
 	bool is_hzp;
@@ -3262,8 +3268,10 @@  int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 		i_mmap_unlock_read(mapping);
 out:
 	xas_destroy(&xas);
-	if (is_thp)
+	if (order >= HPAGE_PMD_ORDER)
 		count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
+	count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT_PAGE :
+				      MTHP_STAT_SPLIT_PAGE_FAILED);
 	return ret;
 }
 
@@ -3327,6 +3335,8 @@  void deferred_split_folio(struct folio *folio)
 	if (list_empty(&folio->_deferred_list)) {
 		if (folio_test_pmd_mappable(folio))
 			count_vm_event(THP_DEFERRED_SPLIT_PAGE);
+		count_mthp_stat(folio_order(folio),
+				MTHP_STAT_DEFERRED_SPLIT_PAGE);
 		list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
 		ds_queue->split_queue_len++;
 #ifdef CONFIG_MEMCG