
[v5,1/4] mm: add per-order mTHP anon_fault_alloc and anon_fault_fallback counters

Message ID 20240412073740.294272-2-21cnbao@gmail.com (mailing list archive)
State New
Series mm: add per-order mTHP alloc and swpout counters

Commit Message

Barry Song April 12, 2024, 7:37 a.m. UTC
From: Barry Song <v-songbaohua@oppo.com>

Profiling a system with mTHP enabled is currently a blind exercise due to
the lack of visibility into its operations.  Presenting the success rate of
mTHP allocations appears to be a pressing need.

Recently, I've been experiencing significant difficulty debugging
performance improvements and regressions without these figures.  It's
crucial for us to understand the true effectiveness of mTHP in real-world
scenarios, especially in systems with fragmented memory.

This patch establishes the framework for per-order mTHP
counters. It begins by introducing the anon_fault_alloc and
anon_fault_fallback counters. Additionally, to maintain consistency
with thp_fault_fallback_charge in /proc/vmstat, this patch also tracks
anon_fault_fallback_charge when mem_cgroup_charge fails for mTHP.
Incorporating additional counters should now be straightforward as well.

Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Yu Zhao <yuzhao@google.com>
---
 include/linux/huge_mm.h | 51 ++++++++++++++++++++++++++++++++++
 mm/huge_memory.c        | 61 +++++++++++++++++++++++++++++++++++++++++
 mm/memory.c             |  3 ++
 mm/page_alloc.c         |  4 +++
 4 files changed, 119 insertions(+)
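
As a usage sketch (the parent directory is assumed from the existing
per-size hugepages-<size>kB sysfs layout and is not spelled out in this
patch), the new counters would be read per mTHP size from:

  /sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/anon_fault_alloc
  /sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/anon_fault_fallback
  /sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/anon_fault_fallback_charge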

Comments

Ryan Roberts April 12, 2024, 9:26 a.m. UTC | #1
Hi Barry,

2 remaining comments - otherwise looks good. (same comments I just made in the
v4 conversation).

On 12/04/2024 08:37, Barry Song wrote:
> From: Barry Song <v-songbaohua@oppo.com>
> 
> Profiling a system blindly with mTHP has become challenging due to the
> lack of visibility into its operations.  Presenting the success rate of
> mTHP allocations appears to be pressing need.
> 
> Recently, I've been experiencing significant difficulty debugging
> performance improvements and regressions without these figures.  It's
> crucial for us to understand the true effectiveness of mTHP in real-world
> scenarios, especially in systems with fragmented memory.
> 
> This patch establishes the framework for per-order mTHP
> counters. It begins by introducing the anon_fault_alloc and
> anon_fault_fallback counters. Additionally, to maintain consistency
> with thp_fault_fallback_charge in /proc/vmstat, this patch also tracks
> anon_fault_fallback_charge when mem_cgroup_charge fails for mTHP.
> Incorporating additional counters should now be straightforward as well.
> 
> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> Cc: Chris Li <chrisl@kernel.org>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
> Cc: Kairui Song <kasong@tencent.com>
> Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
> Cc: Peter Xu <peterx@redhat.com>
> Cc: Ryan Roberts <ryan.roberts@arm.com>
> Cc: Suren Baghdasaryan <surenb@google.com>
> Cc: Yosry Ahmed <yosryahmed@google.com>
> Cc: Yu Zhao <yuzhao@google.com>
> ---
>  include/linux/huge_mm.h | 51 ++++++++++++++++++++++++++++++++++
>  mm/huge_memory.c        | 61 +++++++++++++++++++++++++++++++++++++++++
>  mm/memory.c             |  3 ++
>  mm/page_alloc.c         |  4 +++
>  4 files changed, 119 insertions(+)
> 
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index e896ca4760f6..c5beb54b97cb 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -264,6 +264,57 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
>  					  enforce_sysfs, orders);
>  }
>  
> +enum mthp_stat_item {
> +	MTHP_STAT_ANON_FAULT_ALLOC,
> +	MTHP_STAT_ANON_FAULT_FALLBACK,
> +	MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
> +	__MTHP_STAT_COUNT
> +};
> +
> +struct mthp_stat {
> +	unsigned long stats[0][__MTHP_STAT_COUNT];
> +};
> +
> +extern struct mthp_stat __percpu *mthp_stats;
> +
> +static inline void count_mthp_stat(int order, enum mthp_stat_item item)
> +{
> +	if (order <= 0 || order > PMD_ORDER || !mthp_stats)
> +		return;
> +
> +	this_cpu_inc(mthp_stats->stats[order][item]);
> +}
> +
> +static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta)
> +{
> +	if (order <= 0 || order > PMD_ORDER || !mthp_stats)
> +		return;
> +
> +	this_cpu_add(mthp_stats->stats[order][item], delta);
> +}
> +
> +/*
> + * Fold the foreign cpu mthp stats into our own.
> + *
> + * This is adding to the stats on one processor
> + * but keeps the global counts constant.
> + */
> +static inline void mthp_stats_fold_cpu(int cpu)
> +{
> +	struct mthp_stat *fold_stat;
> +	int i, j;
> +
> +	if (!mthp_stats)
> +		return;
> +	fold_stat = per_cpu_ptr(mthp_stats, cpu);
> +	for (i = 1; i <= PMD_ORDER; i++) {
> +		for (j = 0; j < __MTHP_STAT_COUNT; j++) {
> +			count_mthp_stats(i, j, fold_stat->stats[i][j]);
> +			fold_stat->stats[i][j] = 0;
> +		}
> +	}
> +}

This is a pretty horrible hack; I'm pretty sure just summing for all *possible*
cpus should work.
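
For illustration, a minimal sketch of that alternative (assumed here, not
part of the posted patch): have sum_mthp_stat() walk every possible CPU,
which also removes the need for the mthp_stats_fold_cpu() hook in
page_alloc_cpu_dead().

static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
{
        unsigned long sum = 0;
        int cpu;

        /* possible CPUs include offlined ones, so no counts are lost */
        for_each_possible_cpu(cpu) {
                struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu);

                sum += this->stats[order][item];
        }

        return sum;
}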

> +
>  #define transparent_hugepage_use_zero_page()				\
>  	(transparent_hugepage_flags &					\
>  	 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index dc30139590e6..21c4ac74b484 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -526,6 +526,50 @@ static const struct kobj_type thpsize_ktype = {
>  	.sysfs_ops = &kobj_sysfs_ops,
>  };
>  
> +struct mthp_stat __percpu *mthp_stats;
> +
> +static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
> +{
> +	unsigned long sum = 0;
> +	int cpu;
> +
> +	cpus_read_lock();
> +	for_each_online_cpu(cpu) {
> +		struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu);
> +
> +		sum += this->stats[order][item];
> +	}
> +	cpus_read_unlock();
> +
> +	return sum;
> +}
> +
> +#define DEFINE_MTHP_STAT_ATTR(_name, _index)					\
> +static ssize_t _name##_show(struct kobject *kobj,			\
> +			struct kobj_attribute *attr, char *buf)		\
> +{									\
> +	int order = to_thpsize(kobj)->order;				\
> +									\
> +	return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index));	\
> +}									\
> +static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
> +
> +DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> +
> +static struct attribute *stats_attrs[] = {
> +	&anon_fault_alloc_attr.attr,
> +	&anon_fault_fallback_attr.attr,
> +	&anon_fault_fallback_charge_attr.attr,
> +	NULL,
> +};
> +
> +static struct attribute_group stats_attr_group = {
> +	.name = "stats",
> +	.attrs = stats_attrs,
> +};
> +
>  static struct thpsize *thpsize_create(int order, struct kobject *parent)
>  {
>  	unsigned long size = (PAGE_SIZE << order) / SZ_1K;
> @@ -549,6 +593,12 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent)
>  		return ERR_PTR(ret);
>  	}
>  
> +	ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group);
> +	if (ret) {
> +		kobject_put(&thpsize->kobj);
> +		return ERR_PTR(ret);
> +	}
> +
>  	thpsize->order = order;
>  	return thpsize;
>  }
> @@ -691,6 +741,11 @@ static int __init hugepage_init(void)
>  	 */
>  	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
>  
> +	mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]),
> +			sizeof(unsigned long));

Personally I think it would be cleaner to allocate statically using
ilog2(MAX_PTRS_PER_PTE) instead of PMD_ORDER.
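
A rough sketch of that static shape (assumed; a later revision may differ
in detail) would drop the runtime allocation and the NULL checks entirely:

struct mthp_stat {
        /* ilog2(MAX_PTRS_PER_PTE) is an upper bound for PMD_ORDER */
        unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT];
};

DEFINE_PER_CPU(struct mthp_stat, mthp_stats);

static inline void count_mthp_stat(int order, enum mthp_stat_item item)
{
        if (order <= 0 || order > PMD_ORDER)
                return;

        this_cpu_inc(mthp_stats.stats[order][item]);
}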

> +	if (!mthp_stats)
> +		return -ENOMEM;
> +
>  	err = hugepage_init_sysfs(&hugepage_kobj);
>  	if (err)
>  		goto err_sysfs;
> @@ -725,6 +780,8 @@ static int __init hugepage_init(void)
>  err_slab:
>  	hugepage_exit_sysfs(hugepage_kobj);
>  err_sysfs:
> +	free_percpu(mthp_stats);
> +	mthp_stats = NULL;
>  	return err;
>  }
>  subsys_initcall(hugepage_init);
> @@ -880,6 +937,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
>  		folio_put(folio);
>  		count_vm_event(THP_FAULT_FALLBACK);
>  		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
> +		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
> +		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>  		return VM_FAULT_FALLBACK;
>  	}
>  	folio_throttle_swaprate(folio, gfp);
> @@ -929,6 +988,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
>  		mm_inc_nr_ptes(vma->vm_mm);
>  		spin_unlock(vmf->ptl);
>  		count_vm_event(THP_FAULT_ALLOC);
> +		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
>  		count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>  	}
>  
> @@ -1050,6 +1110,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>  	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
>  	if (unlikely(!folio)) {
>  		count_vm_event(THP_FAULT_FALLBACK);
> +		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
>  		return VM_FAULT_FALLBACK;
>  	}
>  	return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
> diff --git a/mm/memory.c b/mm/memory.c
> index 649a547fe8e3..06048af7cf9a 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4368,6 +4368,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
>  		folio = vma_alloc_folio(gfp, order, vma, addr, true);
>  		if (folio) {
>  			if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
> +				count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>  				folio_put(folio);
>  				goto next;
>  			}
> @@ -4376,6 +4377,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
>  			return folio;
>  		}
>  next:
> +		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
>  		order = next_order(&orders, order);
>  	}
>  
> @@ -4485,6 +4487,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>  
>  	folio_ref_add(folio, nr_pages - 1);
>  	add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
> +	count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
>  	folio_add_new_anon_rmap(folio, vma, addr);
>  	folio_add_lru_vma(folio, vma);
>  setpte:
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index b51becf03d1e..3135b5ca2457 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5840,6 +5840,10 @@ static int page_alloc_cpu_dead(unsigned int cpu)
>  	 */
>  	vm_events_fold_cpu(cpu);
>  
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +	mthp_stats_fold_cpu(cpu);
> +#endif
> +
>  	/*
>  	 * Zero the differential counters of the dead processor
>  	 * so that the vm statistics are consistent.
Barry Song April 12, 2024, 9:43 a.m. UTC | #2
On Fri, Apr 12, 2024 at 9:27 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> Hi Barry,
>
> 2 remaining comments - otherwise looks good. (same comments I just made in the
> v4 conversation).
>
> On 12/04/2024 08:37, Barry Song wrote:
> > From: Barry Song <v-songbaohua@oppo.com>
> >
> > Profiling a system blindly with mTHP has become challenging due to the
> > lack of visibility into its operations.  Presenting the success rate of
> > mTHP allocations appears to be pressing need.
> >
> > Recently, I've been experiencing significant difficulty debugging
> > performance improvements and regressions without these figures.  It's
> > crucial for us to understand the true effectiveness of mTHP in real-world
> > scenarios, especially in systems with fragmented memory.
> >
> > This patch establishes the framework for per-order mTHP
> > counters. It begins by introducing the anon_fault_alloc and
> > anon_fault_fallback counters. Additionally, to maintain consistency
> > with thp_fault_fallback_charge in /proc/vmstat, this patch also tracks
> > anon_fault_fallback_charge when mem_cgroup_charge fails for mTHP.
> > Incorporating additional counters should now be straightforward as well.
> >
> > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> > Cc: Chris Li <chrisl@kernel.org>
> > Cc: David Hildenbrand <david@redhat.com>
> > Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
> > Cc: Kairui Song <kasong@tencent.com>
> > Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
> > Cc: Peter Xu <peterx@redhat.com>
> > Cc: Ryan Roberts <ryan.roberts@arm.com>
> > Cc: Suren Baghdasaryan <surenb@google.com>
> > Cc: Yosry Ahmed <yosryahmed@google.com>
> > Cc: Yu Zhao <yuzhao@google.com>
> > ---
> >  include/linux/huge_mm.h | 51 ++++++++++++++++++++++++++++++++++
> >  mm/huge_memory.c        | 61 +++++++++++++++++++++++++++++++++++++++++
> >  mm/memory.c             |  3 ++
> >  mm/page_alloc.c         |  4 +++
> >  4 files changed, 119 insertions(+)
> >
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index e896ca4760f6..c5beb54b97cb 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -264,6 +264,57 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
> >                                         enforce_sysfs, orders);
> >  }
> >
> > +enum mthp_stat_item {
> > +     MTHP_STAT_ANON_FAULT_ALLOC,
> > +     MTHP_STAT_ANON_FAULT_FALLBACK,
> > +     MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
> > +     __MTHP_STAT_COUNT
> > +};
> > +
> > +struct mthp_stat {
> > +     unsigned long stats[0][__MTHP_STAT_COUNT];
> > +};
> > +
> > +extern struct mthp_stat __percpu *mthp_stats;
> > +
> > +static inline void count_mthp_stat(int order, enum mthp_stat_item item)
> > +{
> > +     if (order <= 0 || order > PMD_ORDER || !mthp_stats)
> > +             return;
> > +
> > +     this_cpu_inc(mthp_stats->stats[order][item]);
> > +}
> > +
> > +static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta)
> > +{
> > +     if (order <= 0 || order > PMD_ORDER || !mthp_stats)
> > +             return;
> > +
> > +     this_cpu_add(mthp_stats->stats[order][item], delta);
> > +}
> > +
> > +/*
> > + * Fold the foreign cpu mthp stats into our own.
> > + *
> > + * This is adding to the stats on one processor
> > + * but keeps the global counts constant.
> > + */
> > +static inline void mthp_stats_fold_cpu(int cpu)
> > +{
> > +     struct mthp_stat *fold_stat;
> > +     int i, j;
> > +
> > +     if (!mthp_stats)
> > +             return;
> > +     fold_stat = per_cpu_ptr(mthp_stats, cpu);
> > +     for (i = 1; i <= PMD_ORDER; i++) {
> > +             for (j = 0; j < __MTHP_STAT_COUNT; j++) {
> > +                     count_mthp_stats(i, j, fold_stat->stats[i][j]);
> > +                     fold_stat->stats[i][j] = 0;
> > +             }
> > +     }
> > +}
>
> This is a pretty horrible hack; I'm pretty sure just summing for all *possible*
> cpus should work.
>
> > +
> >  #define transparent_hugepage_use_zero_page()                         \
> >       (transparent_hugepage_flags &                                   \
> >        (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index dc30139590e6..21c4ac74b484 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -526,6 +526,50 @@ static const struct kobj_type thpsize_ktype = {
> >       .sysfs_ops = &kobj_sysfs_ops,
> >  };
> >
> > +struct mthp_stat __percpu *mthp_stats;
> > +
> > +static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
> > +{
> > +     unsigned long sum = 0;
> > +     int cpu;
> > +
> > +     cpus_read_lock();
> > +     for_each_online_cpu(cpu) {
> > +             struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu);
> > +
> > +             sum += this->stats[order][item];
> > +     }
> > +     cpus_read_unlock();
> > +
> > +     return sum;
> > +}
> > +
> > +#define DEFINE_MTHP_STAT_ATTR(_name, _index)                                 \
> > +static ssize_t _name##_show(struct kobject *kobj,                    \
> > +                     struct kobj_attribute *attr, char *buf)         \
> > +{                                                                    \
> > +     int order = to_thpsize(kobj)->order;                            \
> > +                                                                     \
> > +     return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index));  \
> > +}                                                                    \
> > +static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
> > +
> > +DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
> > +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
> > +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> > +
> > +static struct attribute *stats_attrs[] = {
> > +     &anon_fault_alloc_attr.attr,
> > +     &anon_fault_fallback_attr.attr,
> > +     &anon_fault_fallback_charge_attr.attr,
> > +     NULL,
> > +};
> > +
> > +static struct attribute_group stats_attr_group = {
> > +     .name = "stats",
> > +     .attrs = stats_attrs,
> > +};
> > +
> >  static struct thpsize *thpsize_create(int order, struct kobject *parent)
> >  {
> >       unsigned long size = (PAGE_SIZE << order) / SZ_1K;
> > @@ -549,6 +593,12 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent)
> >               return ERR_PTR(ret);
> >       }
> >
> > +     ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group);
> > +     if (ret) {
> > +             kobject_put(&thpsize->kobj);
> > +             return ERR_PTR(ret);
> > +     }
> > +
> >       thpsize->order = order;
> >       return thpsize;
> >  }
> > @@ -691,6 +741,11 @@ static int __init hugepage_init(void)
> >        */
> >       MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
> >
> > +     mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]),
> > +                     sizeof(unsigned long));
>
> Personally I think it would be cleaner to allocate statically using
> ilog2(MAX_PTRS_PER_PTE) instead of PMD_ORDER.

Hi Ryan,

I don't understand why MAX_PTRS_PER_PTE is the correct size. For ARM64,

#define PMD_ORDER       (PMD_SHIFT - PAGE_SHIFT)

#define MAX_PTRS_PER_PTE PTRS_PER_PTE

#define PTRS_PER_PTE            (1 << (PAGE_SHIFT - 3))

while PAGE_SIZE is 16KiB or 64KiB, PTRS_PER_PTE can be a huge number?


Am I missing something?

>
> > +     if (!mthp_stats)
> > +             return -ENOMEM;
> > +
> >       err = hugepage_init_sysfs(&hugepage_kobj);
> >       if (err)
> >               goto err_sysfs;
> > @@ -725,6 +780,8 @@ static int __init hugepage_init(void)
> >  err_slab:
> >       hugepage_exit_sysfs(hugepage_kobj);
> >  err_sysfs:
> > +     free_percpu(mthp_stats);
> > +     mthp_stats = NULL;
> >       return err;
> >  }
> >  subsys_initcall(hugepage_init);
> > @@ -880,6 +937,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
> >               folio_put(folio);
> >               count_vm_event(THP_FAULT_FALLBACK);
> >               count_vm_event(THP_FAULT_FALLBACK_CHARGE);
> > +             count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
> > +             count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> >               return VM_FAULT_FALLBACK;
> >       }
> >       folio_throttle_swaprate(folio, gfp);
> > @@ -929,6 +988,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
> >               mm_inc_nr_ptes(vma->vm_mm);
> >               spin_unlock(vmf->ptl);
> >               count_vm_event(THP_FAULT_ALLOC);
> > +             count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
> >               count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
> >       }
> >
> > @@ -1050,6 +1110,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
> >       folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
> >       if (unlikely(!folio)) {
> >               count_vm_event(THP_FAULT_FALLBACK);
> > +             count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
> >               return VM_FAULT_FALLBACK;
> >       }
> >       return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 649a547fe8e3..06048af7cf9a 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -4368,6 +4368,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
> >               folio = vma_alloc_folio(gfp, order, vma, addr, true);
> >               if (folio) {
> >                       if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
> > +                             count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> >                               folio_put(folio);
> >                               goto next;
> >                       }
> > @@ -4376,6 +4377,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
> >                       return folio;
> >               }
> >  next:
> > +             count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
> >               order = next_order(&orders, order);
> >       }
> >
> > @@ -4485,6 +4487,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
> >
> >       folio_ref_add(folio, nr_pages - 1);
> >       add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
> > +     count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
> >       folio_add_new_anon_rmap(folio, vma, addr);
> >       folio_add_lru_vma(folio, vma);
> >  setpte:
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index b51becf03d1e..3135b5ca2457 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -5840,6 +5840,10 @@ static int page_alloc_cpu_dead(unsigned int cpu)
> >        */
> >       vm_events_fold_cpu(cpu);
> >
> > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > +     mthp_stats_fold_cpu(cpu);
> > +#endif
> > +
> >       /*
> >        * Zero the differential counters of the dead processor
> >        * so that the vm statistics are consistent.
>
Ryan Roberts April 12, 2024, 9:56 a.m. UTC | #3
On 12/04/2024 10:43, Barry Song wrote:
> On Fri, Apr 12, 2024 at 9:27 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>
>> Hi Barry,
>>
>> 2 remaining comments - otherwise looks good. (same comments I just made in the
>> v4 conversation).
>>
>> On 12/04/2024 08:37, Barry Song wrote:
>>> From: Barry Song <v-songbaohua@oppo.com>
>>>
>>> Profiling a system blindly with mTHP has become challenging due to the
>>> lack of visibility into its operations.  Presenting the success rate of
>>> mTHP allocations appears to be pressing need.
>>>
>>> Recently, I've been experiencing significant difficulty debugging
>>> performance improvements and regressions without these figures.  It's
>>> crucial for us to understand the true effectiveness of mTHP in real-world
>>> scenarios, especially in systems with fragmented memory.
>>>
>>> This patch establishes the framework for per-order mTHP
>>> counters. It begins by introducing the anon_fault_alloc and
>>> anon_fault_fallback counters. Additionally, to maintain consistency
>>> with thp_fault_fallback_charge in /proc/vmstat, this patch also tracks
>>> anon_fault_fallback_charge when mem_cgroup_charge fails for mTHP.
>>> Incorporating additional counters should now be straightforward as well.
>>>
>>> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
>>> Cc: Chris Li <chrisl@kernel.org>
>>> Cc: David Hildenbrand <david@redhat.com>
>>> Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
>>> Cc: Kairui Song <kasong@tencent.com>
>>> Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
>>> Cc: Peter Xu <peterx@redhat.com>
>>> Cc: Ryan Roberts <ryan.roberts@arm.com>
>>> Cc: Suren Baghdasaryan <surenb@google.com>
>>> Cc: Yosry Ahmed <yosryahmed@google.com>
>>> Cc: Yu Zhao <yuzhao@google.com>
>>> ---
>>>  include/linux/huge_mm.h | 51 ++++++++++++++++++++++++++++++++++
>>>  mm/huge_memory.c        | 61 +++++++++++++++++++++++++++++++++++++++++
>>>  mm/memory.c             |  3 ++
>>>  mm/page_alloc.c         |  4 +++
>>>  4 files changed, 119 insertions(+)
>>>
>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>> index e896ca4760f6..c5beb54b97cb 100644
>>> --- a/include/linux/huge_mm.h
>>> +++ b/include/linux/huge_mm.h
>>> @@ -264,6 +264,57 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
>>>                                         enforce_sysfs, orders);
>>>  }
>>>
>>> +enum mthp_stat_item {
>>> +     MTHP_STAT_ANON_FAULT_ALLOC,
>>> +     MTHP_STAT_ANON_FAULT_FALLBACK,
>>> +     MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
>>> +     __MTHP_STAT_COUNT
>>> +};
>>> +
>>> +struct mthp_stat {
>>> +     unsigned long stats[0][__MTHP_STAT_COUNT];
>>> +};
>>> +
>>> +extern struct mthp_stat __percpu *mthp_stats;
>>> +
>>> +static inline void count_mthp_stat(int order, enum mthp_stat_item item)
>>> +{
>>> +     if (order <= 0 || order > PMD_ORDER || !mthp_stats)
>>> +             return;
>>> +
>>> +     this_cpu_inc(mthp_stats->stats[order][item]);
>>> +}
>>> +
>>> +static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta)
>>> +{
>>> +     if (order <= 0 || order > PMD_ORDER || !mthp_stats)
>>> +             return;
>>> +
>>> +     this_cpu_add(mthp_stats->stats[order][item], delta);
>>> +}
>>> +
>>> +/*
>>> + * Fold the foreign cpu mthp stats into our own.
>>> + *
>>> + * This is adding to the stats on one processor
>>> + * but keeps the global counts constant.
>>> + */
>>> +static inline void mthp_stats_fold_cpu(int cpu)
>>> +{
>>> +     struct mthp_stat *fold_stat;
>>> +     int i, j;
>>> +
>>> +     if (!mthp_stats)
>>> +             return;
>>> +     fold_stat = per_cpu_ptr(mthp_stats, cpu);
>>> +     for (i = 1; i <= PMD_ORDER; i++) {
>>> +             for (j = 0; j < __MTHP_STAT_COUNT; j++) {
>>> +                     count_mthp_stats(i, j, fold_stat->stats[i][j]);
>>> +                     fold_stat->stats[i][j] = 0;
>>> +             }
>>> +     }
>>> +}
>>
>> This is a pretty horrible hack; I'm pretty sure just summing for all *possible*
>> cpus should work.
>>
>>> +
>>>  #define transparent_hugepage_use_zero_page()                         \
>>>       (transparent_hugepage_flags &                                   \
>>>        (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index dc30139590e6..21c4ac74b484 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -526,6 +526,50 @@ static const struct kobj_type thpsize_ktype = {
>>>       .sysfs_ops = &kobj_sysfs_ops,
>>>  };
>>>
>>> +struct mthp_stat __percpu *mthp_stats;
>>> +
>>> +static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
>>> +{
>>> +     unsigned long sum = 0;
>>> +     int cpu;
>>> +
>>> +     cpus_read_lock();
>>> +     for_each_online_cpu(cpu) {
>>> +             struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu);
>>> +
>>> +             sum += this->stats[order][item];
>>> +     }
>>> +     cpus_read_unlock();
>>> +
>>> +     return sum;
>>> +}
>>> +
>>> +#define DEFINE_MTHP_STAT_ATTR(_name, _index)                                 \
>>> +static ssize_t _name##_show(struct kobject *kobj,                    \
>>> +                     struct kobj_attribute *attr, char *buf)         \
>>> +{                                                                    \
>>> +     int order = to_thpsize(kobj)->order;                            \
>>> +                                                                     \
>>> +     return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index));  \
>>> +}                                                                    \
>>> +static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
>>> +
>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>>> +
>>> +static struct attribute *stats_attrs[] = {
>>> +     &anon_fault_alloc_attr.attr,
>>> +     &anon_fault_fallback_attr.attr,
>>> +     &anon_fault_fallback_charge_attr.attr,
>>> +     NULL,
>>> +};
>>> +
>>> +static struct attribute_group stats_attr_group = {
>>> +     .name = "stats",
>>> +     .attrs = stats_attrs,
>>> +};
>>> +
>>>  static struct thpsize *thpsize_create(int order, struct kobject *parent)
>>>  {
>>>       unsigned long size = (PAGE_SIZE << order) / SZ_1K;
>>> @@ -549,6 +593,12 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent)
>>>               return ERR_PTR(ret);
>>>       }
>>>
>>> +     ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group);
>>> +     if (ret) {
>>> +             kobject_put(&thpsize->kobj);
>>> +             return ERR_PTR(ret);
>>> +     }
>>> +
>>>       thpsize->order = order;
>>>       return thpsize;
>>>  }
>>> @@ -691,6 +741,11 @@ static int __init hugepage_init(void)
>>>        */
>>>       MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
>>>
>>> +     mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]),
>>> +                     sizeof(unsigned long));
>>
>> Personally I think it would be cleaner to allocate statically using
>> ilog2(MAX_PTRS_PER_PTE) instead of PMD_ORDER.
> 
> Hi Ryan,
> 
> I don't understand why MAX_PTRS_PER_PTE is the correct size. For ARM64,
> 
> #define PMD_ORDER       (PMD_SHIFT - PAGE_SHIFT)
> 
> #define MAX_PTRS_PER_PTE PTRS_PER_PTE
> 
> #define PTRS_PER_PTE            (1 << (PAGE_SHIFT - 3))
> 
> while PAGE_SIZE is 16KiB or 64KiB, PTRS_PER_PTE can be a huge number?
> 
> 
> Am I missing something?

PTRS_PER_PTE is the number of PTE entries in a PTE table. On arm64 it's as follows:

PAGE_SIZE	PAGE_SHIFT	PTRS_PER_PTE
4K		12		512
16K		14		2048
64K		16		8192

So (PTRS_PER_PTE * PAGE_SIZE) = PMD_SIZE

PMD_ORDER is ilog2(PMD_SIZE / PAGE_SIZE) = ilog2(PTRS_PER_PTE)

MAX_PTRS_PER_PTE is just the maximum value that PTRS_PER_PTE will ever have,
(and it's equal to PTRS_PER_PTE except for powerpc).

Pretty sure the math is correct?

> 
>>
>>> +     if (!mthp_stats)
>>> +             return -ENOMEM;
>>> +
>>>       err = hugepage_init_sysfs(&hugepage_kobj);
>>>       if (err)
>>>               goto err_sysfs;
>>> @@ -725,6 +780,8 @@ static int __init hugepage_init(void)
>>>  err_slab:
>>>       hugepage_exit_sysfs(hugepage_kobj);
>>>  err_sysfs:
>>> +     free_percpu(mthp_stats);
>>> +     mthp_stats = NULL;
>>>       return err;
>>>  }
>>>  subsys_initcall(hugepage_init);
>>> @@ -880,6 +937,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
>>>               folio_put(folio);
>>>               count_vm_event(THP_FAULT_FALLBACK);
>>>               count_vm_event(THP_FAULT_FALLBACK_CHARGE);
>>> +             count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
>>> +             count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>>>               return VM_FAULT_FALLBACK;
>>>       }
>>>       folio_throttle_swaprate(folio, gfp);
>>> @@ -929,6 +988,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
>>>               mm_inc_nr_ptes(vma->vm_mm);
>>>               spin_unlock(vmf->ptl);
>>>               count_vm_event(THP_FAULT_ALLOC);
>>> +             count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
>>>               count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>>>       }
>>>
>>> @@ -1050,6 +1110,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>>>       folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
>>>       if (unlikely(!folio)) {
>>>               count_vm_event(THP_FAULT_FALLBACK);
>>> +             count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
>>>               return VM_FAULT_FALLBACK;
>>>       }
>>>       return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
>>> diff --git a/mm/memory.c b/mm/memory.c
>>> index 649a547fe8e3..06048af7cf9a 100644
>>> --- a/mm/memory.c
>>> +++ b/mm/memory.c
>>> @@ -4368,6 +4368,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
>>>               folio = vma_alloc_folio(gfp, order, vma, addr, true);
>>>               if (folio) {
>>>                       if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
>>> +                             count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>>>                               folio_put(folio);
>>>                               goto next;
>>>                       }
>>> @@ -4376,6 +4377,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
>>>                       return folio;
>>>               }
>>>  next:
>>> +             count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
>>>               order = next_order(&orders, order);
>>>       }
>>>
>>> @@ -4485,6 +4487,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>>>
>>>       folio_ref_add(folio, nr_pages - 1);
>>>       add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
>>> +     count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
>>>       folio_add_new_anon_rmap(folio, vma, addr);
>>>       folio_add_lru_vma(folio, vma);
>>>  setpte:
>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>> index b51becf03d1e..3135b5ca2457 100644
>>> --- a/mm/page_alloc.c
>>> +++ b/mm/page_alloc.c
>>> @@ -5840,6 +5840,10 @@ static int page_alloc_cpu_dead(unsigned int cpu)
>>>        */
>>>       vm_events_fold_cpu(cpu);
>>>
>>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>> +     mthp_stats_fold_cpu(cpu);
>>> +#endif
>>> +
>>>       /*
>>>        * Zero the differential counters of the dead processor
>>>        * so that the vm statistics are consistent.
>>
Barry Song April 12, 2024, 10:17 a.m. UTC | #4
On Fri, Apr 12, 2024 at 9:56 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> On 12/04/2024 10:43, Barry Song wrote:
> > On Fri, Apr 12, 2024 at 9:27 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>
> >> Hi Barry,
> >>
> >> 2 remaining comments - otherwise looks good. (same comments I just made in the
> >> v4 conversation).
> >>
> >> On 12/04/2024 08:37, Barry Song wrote:
> >>> From: Barry Song <v-songbaohua@oppo.com>
> >>>
> >>> Profiling a system blindly with mTHP has become challenging due to the
> >>> lack of visibility into its operations.  Presenting the success rate of
> >>> mTHP allocations appears to be pressing need.
> >>>
> >>> Recently, I've been experiencing significant difficulty debugging
> >>> performance improvements and regressions without these figures.  It's
> >>> crucial for us to understand the true effectiveness of mTHP in real-world
> >>> scenarios, especially in systems with fragmented memory.
> >>>
> >>> This patch establishes the framework for per-order mTHP
> >>> counters. It begins by introducing the anon_fault_alloc and
> >>> anon_fault_fallback counters. Additionally, to maintain consistency
> >>> with thp_fault_fallback_charge in /proc/vmstat, this patch also tracks
> >>> anon_fault_fallback_charge when mem_cgroup_charge fails for mTHP.
> >>> Incorporating additional counters should now be straightforward as well.
> >>>
> >>> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> >>> Cc: Chris Li <chrisl@kernel.org>
> >>> Cc: David Hildenbrand <david@redhat.com>
> >>> Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
> >>> Cc: Kairui Song <kasong@tencent.com>
> >>> Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
> >>> Cc: Peter Xu <peterx@redhat.com>
> >>> Cc: Ryan Roberts <ryan.roberts@arm.com>
> >>> Cc: Suren Baghdasaryan <surenb@google.com>
> >>> Cc: Yosry Ahmed <yosryahmed@google.com>
> >>> Cc: Yu Zhao <yuzhao@google.com>
> >>> ---
> >>>  include/linux/huge_mm.h | 51 ++++++++++++++++++++++++++++++++++
> >>>  mm/huge_memory.c        | 61 +++++++++++++++++++++++++++++++++++++++++
> >>>  mm/memory.c             |  3 ++
> >>>  mm/page_alloc.c         |  4 +++
> >>>  4 files changed, 119 insertions(+)
> >>>
> >>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >>> index e896ca4760f6..c5beb54b97cb 100644
> >>> --- a/include/linux/huge_mm.h
> >>> +++ b/include/linux/huge_mm.h
> >>> @@ -264,6 +264,57 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
> >>>                                         enforce_sysfs, orders);
> >>>  }
> >>>
> >>> +enum mthp_stat_item {
> >>> +     MTHP_STAT_ANON_FAULT_ALLOC,
> >>> +     MTHP_STAT_ANON_FAULT_FALLBACK,
> >>> +     MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
> >>> +     __MTHP_STAT_COUNT
> >>> +};
> >>> +
> >>> +struct mthp_stat {
> >>> +     unsigned long stats[0][__MTHP_STAT_COUNT];
> >>> +};
> >>> +
> >>> +extern struct mthp_stat __percpu *mthp_stats;
> >>> +
> >>> +static inline void count_mthp_stat(int order, enum mthp_stat_item item)
> >>> +{
> >>> +     if (order <= 0 || order > PMD_ORDER || !mthp_stats)
> >>> +             return;
> >>> +
> >>> +     this_cpu_inc(mthp_stats->stats[order][item]);
> >>> +}
> >>> +
> >>> +static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta)
> >>> +{
> >>> +     if (order <= 0 || order > PMD_ORDER || !mthp_stats)
> >>> +             return;
> >>> +
> >>> +     this_cpu_add(mthp_stats->stats[order][item], delta);
> >>> +}
> >>> +
> >>> +/*
> >>> + * Fold the foreign cpu mthp stats into our own.
> >>> + *
> >>> + * This is adding to the stats on one processor
> >>> + * but keeps the global counts constant.
> >>> + */
> >>> +static inline void mthp_stats_fold_cpu(int cpu)
> >>> +{
> >>> +     struct mthp_stat *fold_stat;
> >>> +     int i, j;
> >>> +
> >>> +     if (!mthp_stats)
> >>> +             return;
> >>> +     fold_stat = per_cpu_ptr(mthp_stats, cpu);
> >>> +     for (i = 1; i <= PMD_ORDER; i++) {
> >>> +             for (j = 0; j < __MTHP_STAT_COUNT; j++) {
> >>> +                     count_mthp_stats(i, j, fold_stat->stats[i][j]);
> >>> +                     fold_stat->stats[i][j] = 0;
> >>> +             }
> >>> +     }
> >>> +}
> >>
> >> This is a pretty horrible hack; I'm pretty sure just summing for all *possible*
> >> cpus should work.
> >>
> >>> +
> >>>  #define transparent_hugepage_use_zero_page()                         \
> >>>       (transparent_hugepage_flags &                                   \
> >>>        (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
> >>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >>> index dc30139590e6..21c4ac74b484 100644
> >>> --- a/mm/huge_memory.c
> >>> +++ b/mm/huge_memory.c
> >>> @@ -526,6 +526,50 @@ static const struct kobj_type thpsize_ktype = {
> >>>       .sysfs_ops = &kobj_sysfs_ops,
> >>>  };
> >>>
> >>> +struct mthp_stat __percpu *mthp_stats;
> >>> +
> >>> +static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
> >>> +{
> >>> +     unsigned long sum = 0;
> >>> +     int cpu;
> >>> +
> >>> +     cpus_read_lock();
> >>> +     for_each_online_cpu(cpu) {
> >>> +             struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu);
> >>> +
> >>> +             sum += this->stats[order][item];
> >>> +     }
> >>> +     cpus_read_unlock();
> >>> +
> >>> +     return sum;
> >>> +}
> >>> +
> >>> +#define DEFINE_MTHP_STAT_ATTR(_name, _index)                                 \
> >>> +static ssize_t _name##_show(struct kobject *kobj,                    \
> >>> +                     struct kobj_attribute *attr, char *buf)         \
> >>> +{                                                                    \
> >>> +     int order = to_thpsize(kobj)->order;                            \
> >>> +                                                                     \
> >>> +     return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index));  \
> >>> +}                                                                    \
> >>> +static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
> >>> +
> >>> +DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
> >>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
> >>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> >>> +
> >>> +static struct attribute *stats_attrs[] = {
> >>> +     &anon_fault_alloc_attr.attr,
> >>> +     &anon_fault_fallback_attr.attr,
> >>> +     &anon_fault_fallback_charge_attr.attr,
> >>> +     NULL,
> >>> +};
> >>> +
> >>> +static struct attribute_group stats_attr_group = {
> >>> +     .name = "stats",
> >>> +     .attrs = stats_attrs,
> >>> +};
> >>> +
> >>>  static struct thpsize *thpsize_create(int order, struct kobject *parent)
> >>>  {
> >>>       unsigned long size = (PAGE_SIZE << order) / SZ_1K;
> >>> @@ -549,6 +593,12 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent)
> >>>               return ERR_PTR(ret);
> >>>       }
> >>>
> >>> +     ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group);
> >>> +     if (ret) {
> >>> +             kobject_put(&thpsize->kobj);
> >>> +             return ERR_PTR(ret);
> >>> +     }
> >>> +
> >>>       thpsize->order = order;
> >>>       return thpsize;
> >>>  }
> >>> @@ -691,6 +741,11 @@ static int __init hugepage_init(void)
> >>>        */
> >>>       MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
> >>>
> >>> +     mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]),
> >>> +                     sizeof(unsigned long));
> >>
> >> Personally I think it would be cleaner to allocate statically using
> >> ilog2(MAX_PTRS_PER_PTE) instead of PMD_ORDER.
> >
> > Hi Ryan,
> >
> > I don't understand why MAX_PTRS_PER_PTE is the correct size. For ARM64,
> >
> > #define PMD_ORDER       (PMD_SHIFT - PAGE_SHIFT)
> >
> > #define MAX_PTRS_PER_PTE PTRS_PER_PTE
> >
> > #define PTRS_PER_PTE            (1 << (PAGE_SHIFT - 3))
> >
> > while PAGE_SIZE is 16KiB or 64KiB, PTRS_PER_PTE can be a huge number?
> >
> >
> > Am I missing something?
>
> PTRS_PER_PTE is the number of PTE entries in a PTE table. On arm64 its as follows:
>
> PAGE_SIZE       PAGE_SHIFT      PTRS_PER_PTE
> 4K              12              512
> 16K             14              2048
> 64K             16              8192
>
> So (PTRS_PER_PTE * PAGE_SIZE) = PMD_SIZE
>
> PMD_ORDER is ilog2(PMD_SIZE / PAGE_SIZE) = ilog2(PTRS_PER_PTE)
>
> MAX_PTRS_PER_PTE is just the maximum value that PTRS_PER_PTE will ever have,
> (and its equal to PTRS_PER_PTE except for powerpc).
>
> Pretty sure the math is correct?

I am not convinced the math is correct :-)

while page size is 64KiB, the page table is as below,
PMD_ORDER = L2 index bits = [41:29] = 13 != ilog2(8192)


+--------+--------+--------+--------+--------+--------+--------+--------+
|63    56|55    48|47    40|39    32|31    24|23    16|15     8|7      0|
+--------+--------+--------+--------+--------+--------+--------+--------+
 |                 |    |               |              |
 |                 |    |               |              v
 |                 |    |               |            [15:0]  in-page offset
 |                 |    |               +----------> [28:16] L3 index
 |                 |    +--------------------------> [41:29] L2 index
 |                 +-------------------------------> [47:42] L1 index (48-bit)
 |                                                   [51:42] L1 index (52-bit)
 +-------------------------------------------------> [63] TTBR0/1
 
while page size is 4KiB, the page table is as below,

+--------+--------+--------+--------+--------+--------+--------+--------+
|63    56|55    48|47    40|39    32|31    24|23    16|15     8|7      0|
+--------+--------+--------+--------+--------+--------+--------+--------+
 |                 |         |         |         |         |
 |                 |         |         |         |         v
 |                 |         |         |         |   [11:0]  in-page offset
 |                 |         |         |         +-> [20:12] L3 index
 |                 |         |         +-----------> [29:21] L2 index
 |                 |         +---------------------> [38:30] L1 index
 |                 +-------------------------------> [47:39] L0 index
 +-------------------------------------------------> [63] TTBR0/1

PMD_ORDER = L2 index bits = [29:21] = 9 = ilog2(512).

You are only correct while page size = 4KiB.
Ryan Roberts April 12, 2024, 10:25 a.m. UTC | #5
On 12/04/2024 11:17, Barry Song wrote:
> On Fri, Apr 12, 2024 at 9:56 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>
>> On 12/04/2024 10:43, Barry Song wrote:
>>> On Fri, Apr 12, 2024 at 9:27 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>>
>>>> Hi Barry,
>>>>
>>>> 2 remaining comments - otherwise looks good. (same comments I just made in the
>>>> v4 conversation).
>>>>
>>>> On 12/04/2024 08:37, Barry Song wrote:
>>>>> From: Barry Song <v-songbaohua@oppo.com>
>>>>>
>>>>> Profiling a system blindly with mTHP has become challenging due to the
>>>>> lack of visibility into its operations.  Presenting the success rate of
>>>>> mTHP allocations appears to be pressing need.
>>>>>
>>>>> Recently, I've been experiencing significant difficulty debugging
>>>>> performance improvements and regressions without these figures.  It's
>>>>> crucial for us to understand the true effectiveness of mTHP in real-world
>>>>> scenarios, especially in systems with fragmented memory.
>>>>>
>>>>> This patch establishes the framework for per-order mTHP
>>>>> counters. It begins by introducing the anon_fault_alloc and
>>>>> anon_fault_fallback counters. Additionally, to maintain consistency
>>>>> with thp_fault_fallback_charge in /proc/vmstat, this patch also tracks
>>>>> anon_fault_fallback_charge when mem_cgroup_charge fails for mTHP.
>>>>> Incorporating additional counters should now be straightforward as well.
>>>>>
>>>>> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
>>>>> Cc: Chris Li <chrisl@kernel.org>
>>>>> Cc: David Hildenbrand <david@redhat.com>
>>>>> Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
>>>>> Cc: Kairui Song <kasong@tencent.com>
>>>>> Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
>>>>> Cc: Peter Xu <peterx@redhat.com>
>>>>> Cc: Ryan Roberts <ryan.roberts@arm.com>
>>>>> Cc: Suren Baghdasaryan <surenb@google.com>
>>>>> Cc: Yosry Ahmed <yosryahmed@google.com>
>>>>> Cc: Yu Zhao <yuzhao@google.com>
>>>>> ---
>>>>>  include/linux/huge_mm.h | 51 ++++++++++++++++++++++++++++++++++
>>>>>  mm/huge_memory.c        | 61 +++++++++++++++++++++++++++++++++++++++++
>>>>>  mm/memory.c             |  3 ++
>>>>>  mm/page_alloc.c         |  4 +++
>>>>>  4 files changed, 119 insertions(+)
>>>>>
>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>>> index e896ca4760f6..c5beb54b97cb 100644
>>>>> --- a/include/linux/huge_mm.h
>>>>> +++ b/include/linux/huge_mm.h
>>>>> @@ -264,6 +264,57 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
>>>>>                                         enforce_sysfs, orders);
>>>>>  }
>>>>>
>>>>> +enum mthp_stat_item {
>>>>> +     MTHP_STAT_ANON_FAULT_ALLOC,
>>>>> +     MTHP_STAT_ANON_FAULT_FALLBACK,
>>>>> +     MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
>>>>> +     __MTHP_STAT_COUNT
>>>>> +};
>>>>> +
>>>>> +struct mthp_stat {
>>>>> +     unsigned long stats[0][__MTHP_STAT_COUNT];
>>>>> +};
>>>>> +
>>>>> +extern struct mthp_stat __percpu *mthp_stats;
>>>>> +
>>>>> +static inline void count_mthp_stat(int order, enum mthp_stat_item item)
>>>>> +{
>>>>> +     if (order <= 0 || order > PMD_ORDER || !mthp_stats)
>>>>> +             return;
>>>>> +
>>>>> +     this_cpu_inc(mthp_stats->stats[order][item]);
>>>>> +}
>>>>> +
>>>>> +static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta)
>>>>> +{
>>>>> +     if (order <= 0 || order > PMD_ORDER || !mthp_stats)
>>>>> +             return;
>>>>> +
>>>>> +     this_cpu_add(mthp_stats->stats[order][item], delta);
>>>>> +}
>>>>> +
>>>>> +/*
>>>>> + * Fold the foreign cpu mthp stats into our own.
>>>>> + *
>>>>> + * This is adding to the stats on one processor
>>>>> + * but keeps the global counts constant.
>>>>> + */
>>>>> +static inline void mthp_stats_fold_cpu(int cpu)
>>>>> +{
>>>>> +     struct mthp_stat *fold_stat;
>>>>> +     int i, j;
>>>>> +
>>>>> +     if (!mthp_stats)
>>>>> +             return;
>>>>> +     fold_stat = per_cpu_ptr(mthp_stats, cpu);
>>>>> +     for (i = 1; i <= PMD_ORDER; i++) {
>>>>> +             for (j = 0; j < __MTHP_STAT_COUNT; j++) {
>>>>> +                     count_mthp_stats(i, j, fold_stat->stats[i][j]);
>>>>> +                     fold_stat->stats[i][j] = 0;
>>>>> +             }
>>>>> +     }
>>>>> +}
>>>>
>>>> This is a pretty horrible hack; I'm pretty sure just summing for all *possible*
>>>> cpus should work.
>>>>
>>>>> +
>>>>>  #define transparent_hugepage_use_zero_page()                         \
>>>>>       (transparent_hugepage_flags &                                   \
>>>>>        (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>>> index dc30139590e6..21c4ac74b484 100644
>>>>> --- a/mm/huge_memory.c
>>>>> +++ b/mm/huge_memory.c
>>>>> @@ -526,6 +526,50 @@ static const struct kobj_type thpsize_ktype = {
>>>>>       .sysfs_ops = &kobj_sysfs_ops,
>>>>>  };
>>>>>
>>>>> +struct mthp_stat __percpu *mthp_stats;
>>>>> +
>>>>> +static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
>>>>> +{
>>>>> +     unsigned long sum = 0;
>>>>> +     int cpu;
>>>>> +
>>>>> +     cpus_read_lock();
>>>>> +     for_each_online_cpu(cpu) {
>>>>> +             struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu);
>>>>> +
>>>>> +             sum += this->stats[order][item];
>>>>> +     }
>>>>> +     cpus_read_unlock();
>>>>> +
>>>>> +     return sum;
>>>>> +}
>>>>> +
>>>>> +#define DEFINE_MTHP_STAT_ATTR(_name, _index)                                 \
>>>>> +static ssize_t _name##_show(struct kobject *kobj,                    \
>>>>> +                     struct kobj_attribute *attr, char *buf)         \
>>>>> +{                                                                    \
>>>>> +     int order = to_thpsize(kobj)->order;                            \
>>>>> +                                                                     \
>>>>> +     return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index));  \
>>>>> +}                                                                    \
>>>>> +static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
>>>>> +
>>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
>>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
>>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>>>>> +
>>>>> +static struct attribute *stats_attrs[] = {
>>>>> +     &anon_fault_alloc_attr.attr,
>>>>> +     &anon_fault_fallback_attr.attr,
>>>>> +     &anon_fault_fallback_charge_attr.attr,
>>>>> +     NULL,
>>>>> +};
>>>>> +
>>>>> +static struct attribute_group stats_attr_group = {
>>>>> +     .name = "stats",
>>>>> +     .attrs = stats_attrs,
>>>>> +};
>>>>> +
>>>>>  static struct thpsize *thpsize_create(int order, struct kobject *parent)
>>>>>  {
>>>>>       unsigned long size = (PAGE_SIZE << order) / SZ_1K;
>>>>> @@ -549,6 +593,12 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent)
>>>>>               return ERR_PTR(ret);
>>>>>       }
>>>>>
>>>>> +     ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group);
>>>>> +     if (ret) {
>>>>> +             kobject_put(&thpsize->kobj);
>>>>> +             return ERR_PTR(ret);
>>>>> +     }
>>>>> +
>>>>>       thpsize->order = order;
>>>>>       return thpsize;
>>>>>  }
>>>>> @@ -691,6 +741,11 @@ static int __init hugepage_init(void)
>>>>>        */
>>>>>       MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
>>>>>
>>>>> +     mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]),
>>>>> +                     sizeof(unsigned long));
>>>>
>>>> Personally I think it would be cleaner to allocate statically using
>>>> ilog2(MAX_PTRS_PER_PTE) instead of PMD_ORDER.
>>>
>>> Hi Ryan,
>>>
>>> I don't understand why MAX_PTRS_PER_PTE is the correct size. For ARM64,
>>>
>>> #define PMD_ORDER       (PMD_SHIFT - PAGE_SHIFT)
>>>
>>> #define MAX_PTRS_PER_PTE PTRS_PER_PTE
>>>
>>> #define PTRS_PER_PTE            (1 << (PAGE_SHIFT - 3))
>>>
>>> while PAGE_SIZE is 16KiB or 64KiB, PTRS_PER_PTE can be a huge number?
>>>
>>>
>>> Am I missing something?
>>
>> PTRS_PER_PTE is the number of PTE entries in a PTE table. On arm64 its as follows:
>>
>> PAGE_SIZE       PAGE_SHIFT      PTRS_PER_PTE
>> 4K              12              512
>> 16K             14              2048
>> 64K             16              8192
>>
>> So (PTRS_PER_PTE * PAGE_SIZE) = PMD_SIZE
>>
>> PMD_ORDER is ilog2(PMD_SIZE / PAGE_SIZE) = ilog2(PTRS_PER_PTE)
>>
>> MAX_PTRS_PER_PTE is just the maximum value that PTRS_PER_PTE will ever have,
>> (and its equal to PTRS_PER_PTE except for powerpc).
>>
>> Pretty sure the math is correct?
> 
> I am not convinced the math is correct :-)
> 
> while page size is 64KiB, the page table is as below,
> PMD_ORDER = L2 index bits = [41:29] = 13 != ilog2(8192)

1 << 13 = 8192

Right? So:

ilog2(8192) = 13

What's wrong with that?

I even checked in Python to make sure I'm not going mad:

>>> import math
>>> math.log2(8192)
13.0
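
Working the 64K case through with the definitions you quoted: PAGE_SHIFT = 16,
so PTRS_PER_PTE = 1 << (16 - 3) = 8192; PMD_SHIFT = 29 (the L2 index starts at
bit 29), so PMD_ORDER = 29 - 16 = 13; and ilog2(8192) = 13. Both routes give
the same number, and since MAX_PTRS_PER_PTE is never smaller than PTRS_PER_PTE,
ilog2(MAX_PTRS_PER_PTE) is always at least PMD_ORDER.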

> 
> 
> +--------+--------+--------+--------+--------+--------+--------+--------+
> |63    56|55    48|47    40|39    32|31    24|23    16|15     8|7      0|
> +--------+--------+--------+--------+--------+--------+--------+--------+
>  |                 |    |               |              |
>  |                 |    |               |              v
>  |                 |    |               |            [15:0]  in-page offset
>  |                 |    |               +----------> [28:16] L3 index
>  |                 |    +--------------------------> [41:29] L2 index
>  |                 +-------------------------------> [47:42] L1 index (48-bit)
>  |                                                   [51:42] L1 index (52-bit)
>  +-------------------------------------------------> [63] TTBR0/1
>  
> while page size is 4KiB, the page table is as below,
> 
> +--------+--------+--------+--------+--------+--------+--------+--------+
> |63    56|55    48|47    40|39    32|31    24|23    16|15     8|7      0|
> +--------+--------+--------+--------+--------+--------+--------+--------+
>  |                 |         |         |         |         |
>  |                 |         |         |         |         v
>  |                 |         |         |         |   [11:0]  in-page offset
>  |                 |         |         |         +-> [20:12] L3 index
>  |                 |         |         +-----------> [29:21] L2 index
>  |                 |         +---------------------> [38:30] L1 index
>  |                 +-------------------------------> [47:39] L0 index
>  +-------------------------------------------------> [63] TTBR0/1
> 
> PMD_ORDER = L2 index bits = [29:21] = 9 = ilog2(512).
> 
> You are only correct while page size = 4KiB.
> 
> 
> 
>
Barry Song April 12, 2024, 10:29 a.m. UTC | #6
On Fri, Apr 12, 2024 at 10:25 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> On 12/04/2024 11:17, Barry Song wrote:
> > On Fri, Apr 12, 2024 at 9:56 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>
> >> On 12/04/2024 10:43, Barry Song wrote:
> >>> On Fri, Apr 12, 2024 at 9:27 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>>>
> >>>> Hi Barry,
> >>>>
> >>>> 2 remaining comments - otherwise looks good. (same comments I just made in the
> >>>> v4 conversation).
> >>>>
> >>>> On 12/04/2024 08:37, Barry Song wrote:
> >>>>> From: Barry Song <v-songbaohua@oppo.com>
> >>>>>
> >>>>> Profiling a system blindly with mTHP has become challenging due to the
> >>>>> lack of visibility into its operations.  Presenting the success rate of
> >>>>> mTHP allocations appears to be pressing need.
> >>>>>
> >>>>> Recently, I've been experiencing significant difficulty debugging
> >>>>> performance improvements and regressions without these figures.  It's
> >>>>> crucial for us to understand the true effectiveness of mTHP in real-world
> >>>>> scenarios, especially in systems with fragmented memory.
> >>>>>
> >>>>> This patch establishes the framework for per-order mTHP
> >>>>> counters. It begins by introducing the anon_fault_alloc and
> >>>>> anon_fault_fallback counters. Additionally, to maintain consistency
> >>>>> with thp_fault_fallback_charge in /proc/vmstat, this patch also tracks
> >>>>> anon_fault_fallback_charge when mem_cgroup_charge fails for mTHP.
> >>>>> Incorporating additional counters should now be straightforward as well.
> >>>>>
> >>>>> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> >>>>> Cc: Chris Li <chrisl@kernel.org>
> >>>>> Cc: David Hildenbrand <david@redhat.com>
> >>>>> Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
> >>>>> Cc: Kairui Song <kasong@tencent.com>
> >>>>> Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
> >>>>> Cc: Peter Xu <peterx@redhat.com>
> >>>>> Cc: Ryan Roberts <ryan.roberts@arm.com>
> >>>>> Cc: Suren Baghdasaryan <surenb@google.com>
> >>>>> Cc: Yosry Ahmed <yosryahmed@google.com>
> >>>>> Cc: Yu Zhao <yuzhao@google.com>
> >>>>> ---
> >>>>>  include/linux/huge_mm.h | 51 ++++++++++++++++++++++++++++++++++
> >>>>>  mm/huge_memory.c        | 61 +++++++++++++++++++++++++++++++++++++++++
> >>>>>  mm/memory.c             |  3 ++
> >>>>>  mm/page_alloc.c         |  4 +++
> >>>>>  4 files changed, 119 insertions(+)
> >>>>>
> >>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >>>>> index e896ca4760f6..c5beb54b97cb 100644
> >>>>> --- a/include/linux/huge_mm.h
> >>>>> +++ b/include/linux/huge_mm.h
> >>>>> @@ -264,6 +264,57 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
> >>>>>                                         enforce_sysfs, orders);
> >>>>>  }
> >>>>>
> >>>>> +enum mthp_stat_item {
> >>>>> +     MTHP_STAT_ANON_FAULT_ALLOC,
> >>>>> +     MTHP_STAT_ANON_FAULT_FALLBACK,
> >>>>> +     MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
> >>>>> +     __MTHP_STAT_COUNT
> >>>>> +};
> >>>>> +
> >>>>> +struct mthp_stat {
> >>>>> +     unsigned long stats[0][__MTHP_STAT_COUNT];
> >>>>> +};
> >>>>> +
> >>>>> +extern struct mthp_stat __percpu *mthp_stats;
> >>>>> +
> >>>>> +static inline void count_mthp_stat(int order, enum mthp_stat_item item)
> >>>>> +{
> >>>>> +     if (order <= 0 || order > PMD_ORDER || !mthp_stats)
> >>>>> +             return;
> >>>>> +
> >>>>> +     this_cpu_inc(mthp_stats->stats[order][item]);
> >>>>> +}
> >>>>> +
> >>>>> +static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta)
> >>>>> +{
> >>>>> +     if (order <= 0 || order > PMD_ORDER || !mthp_stats)
> >>>>> +             return;
> >>>>> +
> >>>>> +     this_cpu_add(mthp_stats->stats[order][item], delta);
> >>>>> +}
> >>>>> +
> >>>>> +/*
> >>>>> + * Fold the foreign cpu mthp stats into our own.
> >>>>> + *
> >>>>> + * This is adding to the stats on one processor
> >>>>> + * but keeps the global counts constant.
> >>>>> + */
> >>>>> +static inline void mthp_stats_fold_cpu(int cpu)
> >>>>> +{
> >>>>> +     struct mthp_stat *fold_stat;
> >>>>> +     int i, j;
> >>>>> +
> >>>>> +     if (!mthp_stats)
> >>>>> +             return;
> >>>>> +     fold_stat = per_cpu_ptr(mthp_stats, cpu);
> >>>>> +     for (i = 1; i <= PMD_ORDER; i++) {
> >>>>> +             for (j = 0; j < __MTHP_STAT_COUNT; j++) {
> >>>>> +                     count_mthp_stats(i, j, fold_stat->stats[i][j]);
> >>>>> +                     fold_stat->stats[i][j] = 0;
> >>>>> +             }
> >>>>> +     }
> >>>>> +}
> >>>>
> >>>> This is a pretty horrible hack; I'm pretty sure just summing for all *possible*
> >>>> cpus should work.
> >>>>
> >>>>> +
> >>>>>  #define transparent_hugepage_use_zero_page()                         \
> >>>>>       (transparent_hugepage_flags &                                   \
> >>>>>        (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
> >>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >>>>> index dc30139590e6..21c4ac74b484 100644
> >>>>> --- a/mm/huge_memory.c
> >>>>> +++ b/mm/huge_memory.c
> >>>>> @@ -526,6 +526,50 @@ static const struct kobj_type thpsize_ktype = {
> >>>>>       .sysfs_ops = &kobj_sysfs_ops,
> >>>>>  };
> >>>>>
> >>>>> +struct mthp_stat __percpu *mthp_stats;
> >>>>> +
> >>>>> +static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
> >>>>> +{
> >>>>> +     unsigned long sum = 0;
> >>>>> +     int cpu;
> >>>>> +
> >>>>> +     cpus_read_lock();
> >>>>> +     for_each_online_cpu(cpu) {
> >>>>> +             struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu);
> >>>>> +
> >>>>> +             sum += this->stats[order][item];
> >>>>> +     }
> >>>>> +     cpus_read_unlock();
> >>>>> +
> >>>>> +     return sum;
> >>>>> +}
> >>>>> +
> >>>>> +#define DEFINE_MTHP_STAT_ATTR(_name, _index)                                 \
> >>>>> +static ssize_t _name##_show(struct kobject *kobj,                    \
> >>>>> +                     struct kobj_attribute *attr, char *buf)         \
> >>>>> +{                                                                    \
> >>>>> +     int order = to_thpsize(kobj)->order;                            \
> >>>>> +                                                                     \
> >>>>> +     return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index));  \
> >>>>> +}                                                                    \
> >>>>> +static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
> >>>>> +
> >>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
> >>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
> >>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> >>>>> +
> >>>>> +static struct attribute *stats_attrs[] = {
> >>>>> +     &anon_fault_alloc_attr.attr,
> >>>>> +     &anon_fault_fallback_attr.attr,
> >>>>> +     &anon_fault_fallback_charge_attr.attr,
> >>>>> +     NULL,
> >>>>> +};
> >>>>> +
> >>>>> +static struct attribute_group stats_attr_group = {
> >>>>> +     .name = "stats",
> >>>>> +     .attrs = stats_attrs,
> >>>>> +};
> >>>>> +
> >>>>>  static struct thpsize *thpsize_create(int order, struct kobject *parent)
> >>>>>  {
> >>>>>       unsigned long size = (PAGE_SIZE << order) / SZ_1K;
> >>>>> @@ -549,6 +593,12 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent)
> >>>>>               return ERR_PTR(ret);
> >>>>>       }
> >>>>>
> >>>>> +     ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group);
> >>>>> +     if (ret) {
> >>>>> +             kobject_put(&thpsize->kobj);
> >>>>> +             return ERR_PTR(ret);
> >>>>> +     }
> >>>>> +
> >>>>>       thpsize->order = order;
> >>>>>       return thpsize;
> >>>>>  }
> >>>>> @@ -691,6 +741,11 @@ static int __init hugepage_init(void)
> >>>>>        */
> >>>>>       MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
> >>>>>
> >>>>> +     mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]),
> >>>>> +                     sizeof(unsigned long));
> >>>>
> >>>> Personally I think it would be cleaner to allocate statically using
> >>>> ilog2(MAX_PTRS_PER_PTE) instead of PMD_ORDER.
> >>>
> >>> Hi Ryan,
> >>>
> >>> I don't understand why MAX_PTRS_PER_PTE is the correct size. For ARM64,
> >>>
> >>> #define PMD_ORDER       (PMD_SHIFT - PAGE_SHIFT)
> >>>
> >>> #define MAX_PTRS_PER_PTE PTRS_PER_PTE
> >>>
> >>> #define PTRS_PER_PTE            (1 << (PAGE_SHIFT - 3))
> >>>
> >>> while PAGE_SIZE is 16KiB or 64KiB, PTRS_PER_PTE can be a huge number?
> >>>
> >>>
> >>> Am I missing something?
> >>
> >> PTRS_PER_PTE is the number of PTE entries in a PTE table. On arm64 its as follows:
> >>
> >> PAGE_SIZE       PAGE_SHIFT      PTRS_PER_PTE
> >> 4K              12              512
> >> 16K             14              2048
> >> 64K             16              8192
> >>
> >> So (PTRS_PER_PTE * PAGE_SIZE) = PMD_SIZE
> >>
> >> PMD_ORDER is ilog2(PMD_SIZE / PAGE_SIZE) = ilog2(PTRS_PER_PTE)
> >>
> >> MAX_PTRS_PER_PTE is just the maximum value that PTRS_PER_PTE will ever have,
> >> (and its equal to PTRS_PER_PTE except for powerpc).
> >>
> >> Pretty sure the math is correct?
> >
> > I am not convinced the math is correct :-)
> >
> > while page size is 64KiB, the page table is as below,
> > PMD_ORDER = L2 index bits = [41:29] = 13 != ilog2(8192)
>
> 1 << 13 = 8192
>
> Right? So:
>
> ilog2(8192) = 13
>
> What's wrong with that?
>
> I even checked in Python to make sure I'm not going mad:
>
> >>> import math
> >>> math.log2(8192)
> 13.0

You're correct. My mind fixated on the '16' in the line '64K 16 8192'.
I mistakenly thought ilog2(8192) equals 16. Apologies for the confusion.

>
> >
> >
> > +--------+--------+--------+--------+--------+--------+--------+--------+
> > |63    56|55    48|47    40|39    32|31    24|23    16|15     8|7      0|
> > +--------+--------+--------+--------+--------+--------+--------+--------+
> >  |                 |    |               |              |
> >  |                 |    |               |              v
> >  |                 |    |               |            [15:0]  in-page offset
> >  |                 |    |               +----------> [28:16] L3 index
> >  |                 |    +--------------------------> [41:29] L2 index
> >  |                 +-------------------------------> [47:42] L1 index (48-bit)
> >  |                                                   [51:42] L1 index (52-bit)
> >  +-------------------------------------------------> [63] TTBR0/1
> >
> > while page size is 4KiB, the page table is as below,
> >
> > +--------+--------+--------+--------+--------+--------+--------+--------+
> > |63    56|55    48|47    40|39    32|31    24|23    16|15     8|7      0|
> > +--------+--------+--------+--------+--------+--------+--------+--------+
> >  |                 |         |         |         |         |
> >  |                 |         |         |         |         v
> >  |                 |         |         |         |   [11:0]  in-page offset
> >  |                 |         |         |         +-> [20:12] L3 index
> >  |                 |         |         +-----------> [29:21] L2 index
> >  |                 |         +---------------------> [38:30] L1 index
> >  |                 +-------------------------------> [47:39] L0 index
> >  +-------------------------------------------------> [63] TTBR0/1
> >
> > PMD_ORDER = L2 index bits = [29:21] = 9 = ilog2(512).
> >
> > You are only correct while page size = 4KiB.
> >
> >
> >
> >
>
Ryan Roberts April 12, 2024, 10:38 a.m. UTC | #7
On 12/04/2024 11:29, Barry Song wrote:
> On Fri, Apr 12, 2024 at 10:25 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>
>> On 12/04/2024 11:17, Barry Song wrote:
>>> On Fri, Apr 12, 2024 at 9:56 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>>
>>>> On 12/04/2024 10:43, Barry Song wrote:
>>>>> On Fri, Apr 12, 2024 at 9:27 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>>>>
>>>>>> Hi Barry,
>>>>>>
>>>>>> 2 remaining comments - otherwise looks good. (same comments I just made in the
>>>>>> v4 conversation).
>>>>>>
>>>>>> On 12/04/2024 08:37, Barry Song wrote:
>>>>>>> From: Barry Song <v-songbaohua@oppo.com>
>>>>>>>
>>>>>>> Profiling a system blindly with mTHP has become challenging due to the
>>>>>>> lack of visibility into its operations.  Presenting the success rate of
>>>>>>> mTHP allocations appears to be pressing need.
>>>>>>>
>>>>>>> Recently, I've been experiencing significant difficulty debugging
>>>>>>> performance improvements and regressions without these figures.  It's
>>>>>>> crucial for us to understand the true effectiveness of mTHP in real-world
>>>>>>> scenarios, especially in systems with fragmented memory.
>>>>>>>
>>>>>>> This patch establishes the framework for per-order mTHP
>>>>>>> counters. It begins by introducing the anon_fault_alloc and
>>>>>>> anon_fault_fallback counters. Additionally, to maintain consistency
>>>>>>> with thp_fault_fallback_charge in /proc/vmstat, this patch also tracks
>>>>>>> anon_fault_fallback_charge when mem_cgroup_charge fails for mTHP.
>>>>>>> Incorporating additional counters should now be straightforward as well.
>>>>>>>
>>>>>>> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
>>>>>>> Cc: Chris Li <chrisl@kernel.org>
>>>>>>> Cc: David Hildenbrand <david@redhat.com>
>>>>>>> Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
>>>>>>> Cc: Kairui Song <kasong@tencent.com>
>>>>>>> Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
>>>>>>> Cc: Peter Xu <peterx@redhat.com>
>>>>>>> Cc: Ryan Roberts <ryan.roberts@arm.com>
>>>>>>> Cc: Suren Baghdasaryan <surenb@google.com>
>>>>>>> Cc: Yosry Ahmed <yosryahmed@google.com>
>>>>>>> Cc: Yu Zhao <yuzhao@google.com>
>>>>>>> ---
>>>>>>>  include/linux/huge_mm.h | 51 ++++++++++++++++++++++++++++++++++
>>>>>>>  mm/huge_memory.c        | 61 +++++++++++++++++++++++++++++++++++++++++
>>>>>>>  mm/memory.c             |  3 ++
>>>>>>>  mm/page_alloc.c         |  4 +++
>>>>>>>  4 files changed, 119 insertions(+)
>>>>>>>
>>>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>>>>> index e896ca4760f6..c5beb54b97cb 100644
>>>>>>> --- a/include/linux/huge_mm.h
>>>>>>> +++ b/include/linux/huge_mm.h
>>>>>>> @@ -264,6 +264,57 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
>>>>>>>                                         enforce_sysfs, orders);
>>>>>>>  }
>>>>>>>
>>>>>>> +enum mthp_stat_item {
>>>>>>> +     MTHP_STAT_ANON_FAULT_ALLOC,
>>>>>>> +     MTHP_STAT_ANON_FAULT_FALLBACK,
>>>>>>> +     MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
>>>>>>> +     __MTHP_STAT_COUNT
>>>>>>> +};
>>>>>>> +
>>>>>>> +struct mthp_stat {
>>>>>>> +     unsigned long stats[0][__MTHP_STAT_COUNT];
>>>>>>> +};
>>>>>>> +
>>>>>>> +extern struct mthp_stat __percpu *mthp_stats;
>>>>>>> +
>>>>>>> +static inline void count_mthp_stat(int order, enum mthp_stat_item item)
>>>>>>> +{
>>>>>>> +     if (order <= 0 || order > PMD_ORDER || !mthp_stats)
>>>>>>> +             return;
>>>>>>> +
>>>>>>> +     this_cpu_inc(mthp_stats->stats[order][item]);
>>>>>>> +}
>>>>>>> +
>>>>>>> +static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta)
>>>>>>> +{
>>>>>>> +     if (order <= 0 || order > PMD_ORDER || !mthp_stats)
>>>>>>> +             return;
>>>>>>> +
>>>>>>> +     this_cpu_add(mthp_stats->stats[order][item], delta);
>>>>>>> +}
>>>>>>> +
>>>>>>> +/*
>>>>>>> + * Fold the foreign cpu mthp stats into our own.
>>>>>>> + *
>>>>>>> + * This is adding to the stats on one processor
>>>>>>> + * but keeps the global counts constant.
>>>>>>> + */
>>>>>>> +static inline void mthp_stats_fold_cpu(int cpu)
>>>>>>> +{
>>>>>>> +     struct mthp_stat *fold_stat;
>>>>>>> +     int i, j;
>>>>>>> +
>>>>>>> +     if (!mthp_stats)
>>>>>>> +             return;
>>>>>>> +     fold_stat = per_cpu_ptr(mthp_stats, cpu);
>>>>>>> +     for (i = 1; i <= PMD_ORDER; i++) {
>>>>>>> +             for (j = 0; j < __MTHP_STAT_COUNT; j++) {
>>>>>>> +                     count_mthp_stats(i, j, fold_stat->stats[i][j]);
>>>>>>> +                     fold_stat->stats[i][j] = 0;
>>>>>>> +             }
>>>>>>> +     }
>>>>>>> +}
>>>>>>
>>>>>> This is a pretty horrible hack; I'm pretty sure just summing for all *possible*
>>>>>> cpus should work.
>>>>>>
>>>>>>> +
>>>>>>>  #define transparent_hugepage_use_zero_page()                         \
>>>>>>>       (transparent_hugepage_flags &                                   \
>>>>>>>        (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
>>>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>>>>> index dc30139590e6..21c4ac74b484 100644
>>>>>>> --- a/mm/huge_memory.c
>>>>>>> +++ b/mm/huge_memory.c
>>>>>>> @@ -526,6 +526,50 @@ static const struct kobj_type thpsize_ktype = {
>>>>>>>       .sysfs_ops = &kobj_sysfs_ops,
>>>>>>>  };
>>>>>>>
>>>>>>> +struct mthp_stat __percpu *mthp_stats;
>>>>>>> +
>>>>>>> +static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
>>>>>>> +{
>>>>>>> +     unsigned long sum = 0;
>>>>>>> +     int cpu;
>>>>>>> +
>>>>>>> +     cpus_read_lock();
>>>>>>> +     for_each_online_cpu(cpu) {
>>>>>>> +             struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu);
>>>>>>> +
>>>>>>> +             sum += this->stats[order][item];
>>>>>>> +     }
>>>>>>> +     cpus_read_unlock();
>>>>>>> +
>>>>>>> +     return sum;
>>>>>>> +}
>>>>>>> +
>>>>>>> +#define DEFINE_MTHP_STAT_ATTR(_name, _index)                                 \
>>>>>>> +static ssize_t _name##_show(struct kobject *kobj,                    \
>>>>>>> +                     struct kobj_attribute *attr, char *buf)         \
>>>>>>> +{                                                                    \
>>>>>>> +     int order = to_thpsize(kobj)->order;                            \
>>>>>>> +                                                                     \
>>>>>>> +     return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index));  \
>>>>>>> +}                                                                    \
>>>>>>> +static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
>>>>>>> +
>>>>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
>>>>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
>>>>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>>>>>>> +
>>>>>>> +static struct attribute *stats_attrs[] = {
>>>>>>> +     &anon_fault_alloc_attr.attr,
>>>>>>> +     &anon_fault_fallback_attr.attr,
>>>>>>> +     &anon_fault_fallback_charge_attr.attr,
>>>>>>> +     NULL,
>>>>>>> +};
>>>>>>> +
>>>>>>> +static struct attribute_group stats_attr_group = {
>>>>>>> +     .name = "stats",
>>>>>>> +     .attrs = stats_attrs,
>>>>>>> +};
>>>>>>> +
>>>>>>>  static struct thpsize *thpsize_create(int order, struct kobject *parent)
>>>>>>>  {
>>>>>>>       unsigned long size = (PAGE_SIZE << order) / SZ_1K;
>>>>>>> @@ -549,6 +593,12 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent)
>>>>>>>               return ERR_PTR(ret);
>>>>>>>       }
>>>>>>>
>>>>>>> +     ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group);
>>>>>>> +     if (ret) {
>>>>>>> +             kobject_put(&thpsize->kobj);
>>>>>>> +             return ERR_PTR(ret);
>>>>>>> +     }
>>>>>>> +
>>>>>>>       thpsize->order = order;
>>>>>>>       return thpsize;
>>>>>>>  }
>>>>>>> @@ -691,6 +741,11 @@ static int __init hugepage_init(void)
>>>>>>>        */
>>>>>>>       MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
>>>>>>>
>>>>>>> +     mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]),
>>>>>>> +                     sizeof(unsigned long));
>>>>>>
>>>>>> Personally I think it would be cleaner to allocate statically using
>>>>>> ilog2(MAX_PTRS_PER_PTE) instead of PMD_ORDER.
>>>>>
>>>>> Hi Ryan,
>>>>>
>>>>> I don't understand why MAX_PTRS_PER_PTE is the correct size. For ARM64,
>>>>>
>>>>> #define PMD_ORDER       (PMD_SHIFT - PAGE_SHIFT)
>>>>>
>>>>> #define MAX_PTRS_PER_PTE PTRS_PER_PTE
>>>>>
>>>>> #define PTRS_PER_PTE            (1 << (PAGE_SHIFT - 3))
>>>>>
>>>>> while PAGE_SIZE is 16KiB or 64KiB, PTRS_PER_PTE can be a huge number?
>>>>>
>>>>>
>>>>> Am I missing something?
>>>>
>>>> PTRS_PER_PTE is the number of PTE entries in a PTE table. On arm64 its as follows:
>>>>
>>>> PAGE_SIZE       PAGE_SHIFT      PTRS_PER_PTE
>>>> 4K              12              512
>>>> 16K             14              2048
>>>> 64K             16              8192
>>>>
>>>> So (PTRS_PER_PTE * PAGE_SIZE) = PMD_SIZE
>>>>
>>>> PMD_ORDER is ilog2(PMD_SIZE / PAGE_SIZE) = ilog2(PTRS_PER_PTE)
>>>>
>>>> MAX_PTRS_PER_PTE is just the maximum value that PTRS_PER_PTE will ever have,
>>>> (and its equal to PTRS_PER_PTE except for powerpc).
>>>>
>>>> Pretty sure the math is correct?
>>>
>>> I am not convinced the math is correct :-)
>>>
>>> while page size is 64KiB, the page table is as below,
>>> PMD_ORDER = L2 index bits = [41:29] = 13 != ilog2(8192)
>>
>> 1 << 13 = 8192
>>
>> Right? So:
>>
>> ilog2(8192) = 13
>>
>> What's wrong with that?
>>
>> I even checked in Python to make sure I'm not going mad:
>>
>>>>> import math
>>>>> math.log2(8192)
>> 13.0
> 
> You're correct. My mind fixated on the '16' in the line '64K 16 8192'.
> I mistakenly thought ilog2(8192) equals 16. Apologies for the confusion.

No worries! We got there in the end :)

Of course my suggestion relies on being able to get a compile-time constant from
ilog2(MAX_PTRS_PER_PTE). I think that should work, right?
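A minimal userspace sketch of that property (not the kernel macro itself; the
identifiers below are local stand-ins): a ternary chain over a constant
argument, which is essentially what const_ilog2() boils down to, is an integer
constant expression, so it can size a static array the way
stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT] needs.

/*
 * Userspace sketch only: covers just the three arm64 PTRS_PER_PTE
 * values; the kernel's const_ilog2() handles the full 64-bit range.
 */
#include <stdio.h>

#define CONST_ILOG2(n)			\
	((n) == 8192 ? 13 :		\
	 (n) == 2048 ? 11 :		\
	 (n) ==  512 ?  9 : -1)

#define MAX_PTRS_PER_PTE	8192	/* e.g. arm64 with 64K pages */
#define MTHP_STAT_COUNT		3

/* Legal at file scope only because CONST_ILOG2(8192) folds to 13. */
static unsigned long stats[CONST_ILOG2(MAX_PTRS_PER_PTE) + 1][MTHP_STAT_COUNT];

int main(void)
{
	printf("orders covered: 0..%d, rows = %zu\n",
	       CONST_ILOG2(MAX_PTRS_PER_PTE),
	       sizeof(stats) / sizeof(stats[0]));
	return 0;
}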

> 
>>
>>>
>>>
>>> +--------+--------+--------+--------+--------+--------+--------+--------+
>>> |63    56|55    48|47    40|39    32|31    24|23    16|15     8|7      0|
>>> +--------+--------+--------+--------+--------+--------+--------+--------+
>>>  |                 |    |               |              |
>>>  |                 |    |               |              v
>>>  |                 |    |               |            [15:0]  in-page offset
>>>  |                 |    |               +----------> [28:16] L3 index
>>>  |                 |    +--------------------------> [41:29] L2 index
>>>  |                 +-------------------------------> [47:42] L1 index (48-bit)
>>>  |                                                   [51:42] L1 index (52-bit)
>>>  +-------------------------------------------------> [63] TTBR0/1
>>>
>>> while page size is 4KiB, the page table is as below,
>>>
>>> +--------+--------+--------+--------+--------+--------+--------+--------+
>>> |63    56|55    48|47    40|39    32|31    24|23    16|15     8|7      0|
>>> +--------+--------+--------+--------+--------+--------+--------+--------+
>>>  |                 |         |         |         |         |
>>>  |                 |         |         |         |         v
>>>  |                 |         |         |         |   [11:0]  in-page offset
>>>  |                 |         |         |         +-> [20:12] L3 index
>>>  |                 |         |         +-----------> [29:21] L2 index
>>>  |                 |         +---------------------> [38:30] L1 index
>>>  |                 +-------------------------------> [47:39] L0 index
>>>  +-------------------------------------------------> [63] TTBR0/1
>>>
>>> PMD_ORDER = L2 index bits = [29:21] = 9 = ilog2(512).
>>>
>>> You are only correct while page size = 4KiB.
>>>
>>>
>>>
>>>
>>
Barry Song April 12, 2024, 10:53 a.m. UTC | #8
On Fri, Apr 12, 2024 at 10:38 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> On 12/04/2024 11:29, Barry Song wrote:
> > On Fri, Apr 12, 2024 at 10:25 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>
> >> On 12/04/2024 11:17, Barry Song wrote:
> >>> On Fri, Apr 12, 2024 at 9:56 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>>>
> >>>> On 12/04/2024 10:43, Barry Song wrote:
> >>>>> On Fri, Apr 12, 2024 at 9:27 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>>>>>
> >>>>>> Hi Barry,
> >>>>>>
> >>>>>> 2 remaining comments - otherwise looks good. (same comments I just made in the
> >>>>>> v4 conversation).
> >>>>>>
> >>>>>> On 12/04/2024 08:37, Barry Song wrote:
> >>>>>>> From: Barry Song <v-songbaohua@oppo.com>
> >>>>>>>
> >>>>>>> Profiling a system blindly with mTHP has become challenging due to the
> >>>>>>> lack of visibility into its operations.  Presenting the success rate of
> >>>>>>> mTHP allocations appears to be pressing need.
> >>>>>>>
> >>>>>>> Recently, I've been experiencing significant difficulty debugging
> >>>>>>> performance improvements and regressions without these figures.  It's
> >>>>>>> crucial for us to understand the true effectiveness of mTHP in real-world
> >>>>>>> scenarios, especially in systems with fragmented memory.
> >>>>>>>
> >>>>>>> This patch establishes the framework for per-order mTHP
> >>>>>>> counters. It begins by introducing the anon_fault_alloc and
> >>>>>>> anon_fault_fallback counters. Additionally, to maintain consistency
> >>>>>>> with thp_fault_fallback_charge in /proc/vmstat, this patch also tracks
> >>>>>>> anon_fault_fallback_charge when mem_cgroup_charge fails for mTHP.
> >>>>>>> Incorporating additional counters should now be straightforward as well.
> >>>>>>>
> >>>>>>> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> >>>>>>> Cc: Chris Li <chrisl@kernel.org>
> >>>>>>> Cc: David Hildenbrand <david@redhat.com>
> >>>>>>> Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
> >>>>>>> Cc: Kairui Song <kasong@tencent.com>
> >>>>>>> Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
> >>>>>>> Cc: Peter Xu <peterx@redhat.com>
> >>>>>>> Cc: Ryan Roberts <ryan.roberts@arm.com>
> >>>>>>> Cc: Suren Baghdasaryan <surenb@google.com>
> >>>>>>> Cc: Yosry Ahmed <yosryahmed@google.com>
> >>>>>>> Cc: Yu Zhao <yuzhao@google.com>
> >>>>>>> ---
> >>>>>>>  include/linux/huge_mm.h | 51 ++++++++++++++++++++++++++++++++++
> >>>>>>>  mm/huge_memory.c        | 61 +++++++++++++++++++++++++++++++++++++++++
> >>>>>>>  mm/memory.c             |  3 ++
> >>>>>>>  mm/page_alloc.c         |  4 +++
> >>>>>>>  4 files changed, 119 insertions(+)
> >>>>>>>
> >>>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >>>>>>> index e896ca4760f6..c5beb54b97cb 100644
> >>>>>>> --- a/include/linux/huge_mm.h
> >>>>>>> +++ b/include/linux/huge_mm.h
> >>>>>>> @@ -264,6 +264,57 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
> >>>>>>>                                         enforce_sysfs, orders);
> >>>>>>>  }
> >>>>>>>
> >>>>>>> +enum mthp_stat_item {
> >>>>>>> +     MTHP_STAT_ANON_FAULT_ALLOC,
> >>>>>>> +     MTHP_STAT_ANON_FAULT_FALLBACK,
> >>>>>>> +     MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
> >>>>>>> +     __MTHP_STAT_COUNT
> >>>>>>> +};
> >>>>>>> +
> >>>>>>> +struct mthp_stat {
> >>>>>>> +     unsigned long stats[0][__MTHP_STAT_COUNT];
> >>>>>>> +};
> >>>>>>> +
> >>>>>>> +extern struct mthp_stat __percpu *mthp_stats;
> >>>>>>> +
> >>>>>>> +static inline void count_mthp_stat(int order, enum mthp_stat_item item)
> >>>>>>> +{
> >>>>>>> +     if (order <= 0 || order > PMD_ORDER || !mthp_stats)
> >>>>>>> +             return;
> >>>>>>> +
> >>>>>>> +     this_cpu_inc(mthp_stats->stats[order][item]);
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta)
> >>>>>>> +{
> >>>>>>> +     if (order <= 0 || order > PMD_ORDER || !mthp_stats)
> >>>>>>> +             return;
> >>>>>>> +
> >>>>>>> +     this_cpu_add(mthp_stats->stats[order][item], delta);
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +/*
> >>>>>>> + * Fold the foreign cpu mthp stats into our own.
> >>>>>>> + *
> >>>>>>> + * This is adding to the stats on one processor
> >>>>>>> + * but keeps the global counts constant.
> >>>>>>> + */
> >>>>>>> +static inline void mthp_stats_fold_cpu(int cpu)
> >>>>>>> +{
> >>>>>>> +     struct mthp_stat *fold_stat;
> >>>>>>> +     int i, j;
> >>>>>>> +
> >>>>>>> +     if (!mthp_stats)
> >>>>>>> +             return;
> >>>>>>> +     fold_stat = per_cpu_ptr(mthp_stats, cpu);
> >>>>>>> +     for (i = 1; i <= PMD_ORDER; i++) {
> >>>>>>> +             for (j = 0; j < __MTHP_STAT_COUNT; j++) {
> >>>>>>> +                     count_mthp_stats(i, j, fold_stat->stats[i][j]);
> >>>>>>> +                     fold_stat->stats[i][j] = 0;
> >>>>>>> +             }
> >>>>>>> +     }
> >>>>>>> +}
> >>>>>>
> >>>>>> This is a pretty horrible hack; I'm pretty sure just summing for all *possible*
> >>>>>> cpus should work.
> >>>>>>
> >>>>>>> +
> >>>>>>>  #define transparent_hugepage_use_zero_page()                         \
> >>>>>>>       (transparent_hugepage_flags &                                   \
> >>>>>>>        (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
> >>>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >>>>>>> index dc30139590e6..21c4ac74b484 100644
> >>>>>>> --- a/mm/huge_memory.c
> >>>>>>> +++ b/mm/huge_memory.c
> >>>>>>> @@ -526,6 +526,50 @@ static const struct kobj_type thpsize_ktype = {
> >>>>>>>       .sysfs_ops = &kobj_sysfs_ops,
> >>>>>>>  };
> >>>>>>>
> >>>>>>> +struct mthp_stat __percpu *mthp_stats;
> >>>>>>> +
> >>>>>>> +static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
> >>>>>>> +{
> >>>>>>> +     unsigned long sum = 0;
> >>>>>>> +     int cpu;
> >>>>>>> +
> >>>>>>> +     cpus_read_lock();
> >>>>>>> +     for_each_online_cpu(cpu) {
> >>>>>>> +             struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu);
> >>>>>>> +
> >>>>>>> +             sum += this->stats[order][item];
> >>>>>>> +     }
> >>>>>>> +     cpus_read_unlock();
> >>>>>>> +
> >>>>>>> +     return sum;
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +#define DEFINE_MTHP_STAT_ATTR(_name, _index)                                 \
> >>>>>>> +static ssize_t _name##_show(struct kobject *kobj,                    \
> >>>>>>> +                     struct kobj_attribute *attr, char *buf)         \
> >>>>>>> +{                                                                    \
> >>>>>>> +     int order = to_thpsize(kobj)->order;                            \
> >>>>>>> +                                                                     \
> >>>>>>> +     return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index));  \
> >>>>>>> +}                                                                    \
> >>>>>>> +static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
> >>>>>>> +
> >>>>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
> >>>>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
> >>>>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> >>>>>>> +
> >>>>>>> +static struct attribute *stats_attrs[] = {
> >>>>>>> +     &anon_fault_alloc_attr.attr,
> >>>>>>> +     &anon_fault_fallback_attr.attr,
> >>>>>>> +     &anon_fault_fallback_charge_attr.attr,
> >>>>>>> +     NULL,
> >>>>>>> +};
> >>>>>>> +
> >>>>>>> +static struct attribute_group stats_attr_group = {
> >>>>>>> +     .name = "stats",
> >>>>>>> +     .attrs = stats_attrs,
> >>>>>>> +};
> >>>>>>> +
> >>>>>>>  static struct thpsize *thpsize_create(int order, struct kobject *parent)
> >>>>>>>  {
> >>>>>>>       unsigned long size = (PAGE_SIZE << order) / SZ_1K;
> >>>>>>> @@ -549,6 +593,12 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent)
> >>>>>>>               return ERR_PTR(ret);
> >>>>>>>       }
> >>>>>>>
> >>>>>>> +     ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group);
> >>>>>>> +     if (ret) {
> >>>>>>> +             kobject_put(&thpsize->kobj);
> >>>>>>> +             return ERR_PTR(ret);
> >>>>>>> +     }
> >>>>>>> +
> >>>>>>>       thpsize->order = order;
> >>>>>>>       return thpsize;
> >>>>>>>  }
> >>>>>>> @@ -691,6 +741,11 @@ static int __init hugepage_init(void)
> >>>>>>>        */
> >>>>>>>       MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
> >>>>>>>
> >>>>>>> +     mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]),
> >>>>>>> +                     sizeof(unsigned long));
> >>>>>>
> >>>>>> Personally I think it would be cleaner to allocate statically using
> >>>>>> ilog2(MAX_PTRS_PER_PTE) instead of PMD_ORDER.
> >>>>>
> >>>>> Hi Ryan,
> >>>>>
> >>>>> I don't understand why MAX_PTRS_PER_PTE is the correct size. For ARM64,
> >>>>>
> >>>>> #define PMD_ORDER       (PMD_SHIFT - PAGE_SHIFT)
> >>>>>
> >>>>> #define MAX_PTRS_PER_PTE PTRS_PER_PTE
> >>>>>
> >>>>> #define PTRS_PER_PTE            (1 << (PAGE_SHIFT - 3))
> >>>>>
> >>>>> while PAGE_SIZE is 16KiB or 64KiB, PTRS_PER_PTE can be a huge number?
> >>>>>
> >>>>>
> >>>>> Am I missing something?
> >>>>
> >>>> PTRS_PER_PTE is the number of PTE entries in a PTE table. On arm64 its as follows:
> >>>>
> >>>> PAGE_SIZE       PAGE_SHIFT      PTRS_PER_PTE
> >>>> 4K              12              512
> >>>> 16K             14              2048
> >>>> 64K             16              8192
> >>>>
> >>>> So (PTRS_PER_PTE * PAGE_SIZE) = PMD_SIZE
> >>>>
> >>>> PMD_ORDER is ilog2(PMD_SIZE / PAGE_SIZE) = ilog2(PTRS_PER_PTE)
> >>>>
> >>>> MAX_PTRS_PER_PTE is just the maximum value that PTRS_PER_PTE will ever have,
> >>>> (and its equal to PTRS_PER_PTE except for powerpc).
> >>>>
> >>>> Pretty sure the math is correct?
> >>>
> >>> I am not convinced the math is correct :-)
> >>>
> >>> while page size is 64KiB, the page table is as below,
> >>> PMD_ORDER = L2 index bits = [41:29] = 13 != ilog2(8192)
> >>
> >> 1 << 13 = 8192
> >>
> >> Right? So:
> >>
> >> ilog2(8192) = 13
> >>
> >> What's wrong with that?
> >>
> >> I even checked in Python to make sure I'm not going mad:
> >>
> >>>>> import math
> >>>>> math.log2(8192)
> >> 13.0
> >
> > You're correct. My mind fixated on the '16' in the line '64K 16 8192'.
> > I mistakenly thought ilog2(8192) equals 16. Apologies for the confusion.
>
> No worries! We got there in the end :)
>
> Of course my suggestion relies on being able to get a compile-time constant from
> ilog2(MAX_PTRS_PER_PTE). I think that should work, right?

I guess so; ilog2() can detect a compile-time constant, and otherwise it will
find the last (most-significant) bit set.
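A userspace stand-in for that non-constant path (assuming GCC/Clang builtins;
this is not the kernel implementation): when the argument is not a compile-time
constant, ilog2(n) reduces to fls(n) - 1, i.e. the index of the
most-significant set bit.

#include <stdio.h>

/* fls(n) - 1 for n != 0, via a GCC/Clang count-leading-zeros builtin. */
static int ilog2_runtime(unsigned long n)
{
	return (int)(sizeof(unsigned long) * 8) - 1 - __builtin_clzl(n);
}

int main(void)
{
	/* PTRS_PER_PTE for arm64 4K/16K/64K pages */
	unsigned long ptrs_per_pte[] = { 512, 2048, 8192 };
	int i;

	for (i = 0; i < 3; i++)
		printf("ilog2(%lu) = %d\n", ptrs_per_pte[i],
		       ilog2_runtime(ptrs_per_pte[i]));
	return 0;
}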

I've implemented the following change, and all the builds passed. I'm
currently running tests.

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index c5beb54b97cb..d4fdb2641070 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -272,47 +272,17 @@ enum mthp_stat_item {
 };

 struct mthp_stat {
-       unsigned long stats[0][__MTHP_STAT_COUNT];
+       unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT];
 };

-extern struct mthp_stat __percpu *mthp_stats;
+DECLARE_PER_CPU(struct mthp_stat, mthp_stats);

 static inline void count_mthp_stat(int order, enum mthp_stat_item item)
 {
-       if (order <= 0 || order > PMD_ORDER || !mthp_stats)
+       if (order <= 0 || order > PMD_ORDER)
                return;

-       this_cpu_inc(mthp_stats->stats[order][item]);
-}
-
-static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta)
-{
-       if (order <= 0 || order > PMD_ORDER || !mthp_stats)
-               return;
-
-       this_cpu_add(mthp_stats->stats[order][item], delta);
-}
-
-/*
- * Fold the foreign cpu mthp stats into our own.
- *
- * This is adding to the stats on one processor
- * but keeps the global counts constant.
- */
-static inline void mthp_stats_fold_cpu(int cpu)
-{
-       struct mthp_stat *fold_stat;
-       int i, j;
-
-       if (!mthp_stats)
-               return;
-       fold_stat = per_cpu_ptr(mthp_stats, cpu);
-       for (i = 1; i <= PMD_ORDER; i++) {
-               for (j = 0; j < __MTHP_STAT_COUNT; j++) {
-                       count_mthp_stats(i, j, fold_stat->stats[i][j]);
-                       fold_stat->stats[i][j] = 0;
-               }
-       }
+       this_cpu_inc(mthp_stats.stats[order][item]);
 }

 #define transparent_hugepage_use_zero_page()                           \
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 21c4ac74b484..e88961ffc398 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -526,20 +526,18 @@ static const struct kobj_type thpsize_ktype = {
        .sysfs_ops = &kobj_sysfs_ops,
 };

-struct mthp_stat __percpu *mthp_stats;
+DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}};

 static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
 {
        unsigned long sum = 0;
        int cpu;

-       cpus_read_lock();
-       for_each_online_cpu(cpu) {
-               struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu);
+       for_each_possible_cpu(cpu) {
+               struct mthp_stat *this = &per_cpu(mthp_stats, cpu);

                sum += this->stats[order][item];
        }
-       cpus_read_unlock();

        return sum;
 }
@@ -741,11 +739,6 @@ static int __init hugepage_init(void)
         */
        MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);

-       mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]),
-                       sizeof(unsigned long));
-       if (!mthp_stats)
-               return -ENOMEM;
-
        err = hugepage_init_sysfs(&hugepage_kobj);
        if (err)
                goto err_sysfs;
@@ -780,8 +773,6 @@ static int __init hugepage_init(void)
 err_slab:
        hugepage_exit_sysfs(hugepage_kobj);
 err_sysfs:
-       free_percpu(mthp_stats);
-       mthp_stats = NULL;
        return err;
 }
 subsys_initcall(hugepage_init);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3135b5ca2457..b51becf03d1e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5840,10 +5840,6 @@ static int page_alloc_cpu_dead(unsigned int cpu)
         */
        vm_events_fold_cpu(cpu);

-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       mthp_stats_fold_cpu(cpu);
-#endif
-
        /*
         * Zero the differential counters of the dead processor
         * so that the vm statistics are consistent.


Thanks
Barry
Ryan Roberts April 12, 2024, 11:05 a.m. UTC | #9
On 12/04/2024 11:53, Barry Song wrote:
> On Fri, Apr 12, 2024 at 10:38 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>
>> On 12/04/2024 11:29, Barry Song wrote:
>>> On Fri, Apr 12, 2024 at 10:25 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>>
>>>> On 12/04/2024 11:17, Barry Song wrote:
>>>>> On Fri, Apr 12, 2024 at 9:56 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>>>>
>>>>>> On 12/04/2024 10:43, Barry Song wrote:
>>>>>>> On Fri, Apr 12, 2024 at 9:27 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>>>>>>
>>>>>>>> Hi Barry,
>>>>>>>>
>>>>>>>> 2 remaining comments - otherwise looks good. (same comments I just made in the
>>>>>>>> v4 conversation).
>>>>>>>>
>>>>>>>> On 12/04/2024 08:37, Barry Song wrote:
>>>>>>>>> From: Barry Song <v-songbaohua@oppo.com>
>>>>>>>>>
>>>>>>>>> Profiling a system blindly with mTHP has become challenging due to the
>>>>>>>>> lack of visibility into its operations.  Presenting the success rate of
>>>>>>>>> mTHP allocations appears to be pressing need.
>>>>>>>>>
>>>>>>>>> Recently, I've been experiencing significant difficulty debugging
>>>>>>>>> performance improvements and regressions without these figures.  It's
>>>>>>>>> crucial for us to understand the true effectiveness of mTHP in real-world
>>>>>>>>> scenarios, especially in systems with fragmented memory.
>>>>>>>>>
>>>>>>>>> This patch establishes the framework for per-order mTHP
>>>>>>>>> counters. It begins by introducing the anon_fault_alloc and
>>>>>>>>> anon_fault_fallback counters. Additionally, to maintain consistency
>>>>>>>>> with thp_fault_fallback_charge in /proc/vmstat, this patch also tracks
>>>>>>>>> anon_fault_fallback_charge when mem_cgroup_charge fails for mTHP.
>>>>>>>>> Incorporating additional counters should now be straightforward as well.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
>>>>>>>>> Cc: Chris Li <chrisl@kernel.org>
>>>>>>>>> Cc: David Hildenbrand <david@redhat.com>
>>>>>>>>> Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
>>>>>>>>> Cc: Kairui Song <kasong@tencent.com>
>>>>>>>>> Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
>>>>>>>>> Cc: Peter Xu <peterx@redhat.com>
>>>>>>>>> Cc: Ryan Roberts <ryan.roberts@arm.com>
>>>>>>>>> Cc: Suren Baghdasaryan <surenb@google.com>
>>>>>>>>> Cc: Yosry Ahmed <yosryahmed@google.com>
>>>>>>>>> Cc: Yu Zhao <yuzhao@google.com>
>>>>>>>>> ---
>>>>>>>>>  include/linux/huge_mm.h | 51 ++++++++++++++++++++++++++++++++++
>>>>>>>>>  mm/huge_memory.c        | 61 +++++++++++++++++++++++++++++++++++++++++
>>>>>>>>>  mm/memory.c             |  3 ++
>>>>>>>>>  mm/page_alloc.c         |  4 +++
>>>>>>>>>  4 files changed, 119 insertions(+)
>>>>>>>>>
>>>>>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>>>>>>> index e896ca4760f6..c5beb54b97cb 100644
>>>>>>>>> --- a/include/linux/huge_mm.h
>>>>>>>>> +++ b/include/linux/huge_mm.h
>>>>>>>>> @@ -264,6 +264,57 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
>>>>>>>>>                                         enforce_sysfs, orders);
>>>>>>>>>  }
>>>>>>>>>
>>>>>>>>> +enum mthp_stat_item {
>>>>>>>>> +     MTHP_STAT_ANON_FAULT_ALLOC,
>>>>>>>>> +     MTHP_STAT_ANON_FAULT_FALLBACK,
>>>>>>>>> +     MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
>>>>>>>>> +     __MTHP_STAT_COUNT
>>>>>>>>> +};
>>>>>>>>> +
>>>>>>>>> +struct mthp_stat {
>>>>>>>>> +     unsigned long stats[0][__MTHP_STAT_COUNT];
>>>>>>>>> +};
>>>>>>>>> +
>>>>>>>>> +extern struct mthp_stat __percpu *mthp_stats;
>>>>>>>>> +
>>>>>>>>> +static inline void count_mthp_stat(int order, enum mthp_stat_item item)
>>>>>>>>> +{
>>>>>>>>> +     if (order <= 0 || order > PMD_ORDER || !mthp_stats)
>>>>>>>>> +             return;
>>>>>>>>> +
>>>>>>>>> +     this_cpu_inc(mthp_stats->stats[order][item]);
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta)
>>>>>>>>> +{
>>>>>>>>> +     if (order <= 0 || order > PMD_ORDER || !mthp_stats)
>>>>>>>>> +             return;
>>>>>>>>> +
>>>>>>>>> +     this_cpu_add(mthp_stats->stats[order][item], delta);
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +/*
>>>>>>>>> + * Fold the foreign cpu mthp stats into our own.
>>>>>>>>> + *
>>>>>>>>> + * This is adding to the stats on one processor
>>>>>>>>> + * but keeps the global counts constant.
>>>>>>>>> + */
>>>>>>>>> +static inline void mthp_stats_fold_cpu(int cpu)
>>>>>>>>> +{
>>>>>>>>> +     struct mthp_stat *fold_stat;
>>>>>>>>> +     int i, j;
>>>>>>>>> +
>>>>>>>>> +     if (!mthp_stats)
>>>>>>>>> +             return;
>>>>>>>>> +     fold_stat = per_cpu_ptr(mthp_stats, cpu);
>>>>>>>>> +     for (i = 1; i <= PMD_ORDER; i++) {
>>>>>>>>> +             for (j = 0; j < __MTHP_STAT_COUNT; j++) {
>>>>>>>>> +                     count_mthp_stats(i, j, fold_stat->stats[i][j]);
>>>>>>>>> +                     fold_stat->stats[i][j] = 0;
>>>>>>>>> +             }
>>>>>>>>> +     }
>>>>>>>>> +}
>>>>>>>>
>>>>>>>> This is a pretty horrible hack; I'm pretty sure just summing for all *possible*
>>>>>>>> cpus should work.
>>>>>>>>
>>>>>>>>> +
>>>>>>>>>  #define transparent_hugepage_use_zero_page()                         \
>>>>>>>>>       (transparent_hugepage_flags &                                   \
>>>>>>>>>        (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
>>>>>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>>>>>>> index dc30139590e6..21c4ac74b484 100644
>>>>>>>>> --- a/mm/huge_memory.c
>>>>>>>>> +++ b/mm/huge_memory.c
>>>>>>>>> @@ -526,6 +526,50 @@ static const struct kobj_type thpsize_ktype = {
>>>>>>>>>       .sysfs_ops = &kobj_sysfs_ops,
>>>>>>>>>  };
>>>>>>>>>
>>>>>>>>> +struct mthp_stat __percpu *mthp_stats;
>>>>>>>>> +
>>>>>>>>> +static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
>>>>>>>>> +{
>>>>>>>>> +     unsigned long sum = 0;
>>>>>>>>> +     int cpu;
>>>>>>>>> +
>>>>>>>>> +     cpus_read_lock();
>>>>>>>>> +     for_each_online_cpu(cpu) {
>>>>>>>>> +             struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu);
>>>>>>>>> +
>>>>>>>>> +             sum += this->stats[order][item];
>>>>>>>>> +     }
>>>>>>>>> +     cpus_read_unlock();
>>>>>>>>> +
>>>>>>>>> +     return sum;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +#define DEFINE_MTHP_STAT_ATTR(_name, _index)                                 \
>>>>>>>>> +static ssize_t _name##_show(struct kobject *kobj,                    \
>>>>>>>>> +                     struct kobj_attribute *attr, char *buf)         \
>>>>>>>>> +{                                                                    \
>>>>>>>>> +     int order = to_thpsize(kobj)->order;                            \
>>>>>>>>> +                                                                     \
>>>>>>>>> +     return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index));  \
>>>>>>>>> +}                                                                    \
>>>>>>>>> +static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
>>>>>>>>> +
>>>>>>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
>>>>>>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
>>>>>>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>>>>>>>>> +
>>>>>>>>> +static struct attribute *stats_attrs[] = {
>>>>>>>>> +     &anon_fault_alloc_attr.attr,
>>>>>>>>> +     &anon_fault_fallback_attr.attr,
>>>>>>>>> +     &anon_fault_fallback_charge_attr.attr,
>>>>>>>>> +     NULL,
>>>>>>>>> +};
>>>>>>>>> +
>>>>>>>>> +static struct attribute_group stats_attr_group = {
>>>>>>>>> +     .name = "stats",
>>>>>>>>> +     .attrs = stats_attrs,
>>>>>>>>> +};
>>>>>>>>> +
>>>>>>>>>  static struct thpsize *thpsize_create(int order, struct kobject *parent)
>>>>>>>>>  {
>>>>>>>>>       unsigned long size = (PAGE_SIZE << order) / SZ_1K;
>>>>>>>>> @@ -549,6 +593,12 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent)
>>>>>>>>>               return ERR_PTR(ret);
>>>>>>>>>       }
>>>>>>>>>
>>>>>>>>> +     ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group);
>>>>>>>>> +     if (ret) {
>>>>>>>>> +             kobject_put(&thpsize->kobj);
>>>>>>>>> +             return ERR_PTR(ret);
>>>>>>>>> +     }
>>>>>>>>> +
>>>>>>>>>       thpsize->order = order;
>>>>>>>>>       return thpsize;
>>>>>>>>>  }
>>>>>>>>> @@ -691,6 +741,11 @@ static int __init hugepage_init(void)
>>>>>>>>>        */
>>>>>>>>>       MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
>>>>>>>>>
>>>>>>>>> +     mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]),
>>>>>>>>> +                     sizeof(unsigned long));
>>>>>>>>
>>>>>>>> Personally I think it would be cleaner to allocate statically using
>>>>>>>> ilog2(MAX_PTRS_PER_PTE) instead of PMD_ORDER.
>>>>>>>
>>>>>>> Hi Ryan,
>>>>>>>
>>>>>>> I don't understand why MAX_PTRS_PER_PTE is the correct size. For ARM64,
>>>>>>>
>>>>>>> #define PMD_ORDER       (PMD_SHIFT - PAGE_SHIFT)
>>>>>>>
>>>>>>> #define MAX_PTRS_PER_PTE PTRS_PER_PTE
>>>>>>>
>>>>>>> #define PTRS_PER_PTE            (1 << (PAGE_SHIFT - 3))
>>>>>>>
>>>>>>> while PAGE_SIZE is 16KiB or 64KiB, PTRS_PER_PTE can be a huge number?
>>>>>>>
>>>>>>>
>>>>>>> Am I missing something?
>>>>>>
>>>>>> PTRS_PER_PTE is the number of PTE entries in a PTE table. On arm64 its as follows:
>>>>>>
>>>>>> PAGE_SIZE       PAGE_SHIFT      PTRS_PER_PTE
>>>>>> 4K              12              512
>>>>>> 16K             14              2048
>>>>>> 64K             16              8192
>>>>>>
>>>>>> So (PTRS_PER_PTE * PAGE_SIZE) = PMD_SIZE
>>>>>>
>>>>>> PMD_ORDER is ilog2(PMD_SIZE / PAGE_SIZE) = ilog2(PTRS_PER_PTE)
>>>>>>
>>>>>> MAX_PTRS_PER_PTE is just the maximum value that PTRS_PER_PTE will ever have,
>>>>>> (and its equal to PTRS_PER_PTE except for powerpc).
>>>>>>
>>>>>> Pretty sure the math is correct?
>>>>>
>>>>> I am not convinced the math is correct :-)
>>>>>
>>>>> while page size is 64KiB, the page table is as below,
>>>>> PMD_ORDER = L2 index bits = [41:29] = 13 != ilog2(8192)
>>>>
>>>> 1 << 13 = 8192
>>>>
>>>> Right? So:
>>>>
>>>> ilog2(8192) = 13
>>>>
>>>> What's wrong with that?
>>>>
>>>> I even checked in Python to make sure I'm not going mad:
>>>>
>>>>>>> import math
>>>>>>> math.log2(8192)
>>>> 13.0
>>>
>>> You're correct. My mind fixated on the '16' in the line '64K 16 8192'.
>>> I mistakenly thought ilog2(8192) equals 16. Apologies for the confusion.
>>
>> No worries! We got there in the end :)
>>
>> Of course my suggestion relies on being able to get a compile-time constant from
>> ilog2(MAX_PTRS_PER_PTE). I think that should work, right?
> 
> I guess so; ilog2() can detect a compile-time constant, and otherwise it will
> find the last (most-significant) bit set.
> 
> I've implemented the following change, and all the builds passed. I'm
> currently running tests.

LGTM - much cleaner!

> 
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index c5beb54b97cb..d4fdb2641070 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -272,47 +272,17 @@ enum mthp_stat_item {
>  };
> 
>  struct mthp_stat {
> -       unsigned long stats[0][__MTHP_STAT_COUNT];
> +       unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT];
>  };
> 
> -extern struct mthp_stat __percpu *mthp_stats;
> +DECLARE_PER_CPU(struct mthp_stat, mthp_stats);
> 
>  static inline void count_mthp_stat(int order, enum mthp_stat_item item)
>  {
> -       if (order <= 0 || order > PMD_ORDER || !mthp_stats)
> +       if (order <= 0 || order > PMD_ORDER)
>                 return;
> 
> -       this_cpu_inc(mthp_stats->stats[order][item]);
> -}
> -
> -static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta)
> -{
> -       if (order <= 0 || order > PMD_ORDER || !mthp_stats)
> -               return;
> -
> -       this_cpu_add(mthp_stats->stats[order][item], delta);
> -}
> -
> -/*
> - * Fold the foreign cpu mthp stats into our own.
> - *
> - * This is adding to the stats on one processor
> - * but keeps the global counts constant.
> - */
> -static inline void mthp_stats_fold_cpu(int cpu)
> -{
> -       struct mthp_stat *fold_stat;
> -       int i, j;
> -
> -       if (!mthp_stats)
> -               return;
> -       fold_stat = per_cpu_ptr(mthp_stats, cpu);
> -       for (i = 1; i <= PMD_ORDER; i++) {
> -               for (j = 0; j < __MTHP_STAT_COUNT; j++) {
> -                       count_mthp_stats(i, j, fold_stat->stats[i][j]);
> -                       fold_stat->stats[i][j] = 0;
> -               }
> -       }
> +       this_cpu_inc(mthp_stats.stats[order][item]);
>  }
> 
>  #define transparent_hugepage_use_zero_page()                           \
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 21c4ac74b484..e88961ffc398 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -526,20 +526,18 @@ static const struct kobj_type thpsize_ktype = {
>         .sysfs_ops = &kobj_sysfs_ops,
>  };
> 
> -struct mthp_stat __percpu *mthp_stats;
> +DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}};
> 
>  static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
>  {
>         unsigned long sum = 0;
>         int cpu;
> 
> -       cpus_read_lock();
> -       for_each_online_cpu(cpu) {
> -               struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu);
> +       for_each_possible_cpu(cpu) {
> +               struct mthp_stat *this = &per_cpu(mthp_stats, cpu);
> 
>                 sum += this->stats[order][item];
>         }
> -       cpus_read_unlock();
> 
>         return sum;
>  }
> @@ -741,11 +739,6 @@ static int __init hugepage_init(void)
>          */
>         MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
> 
> -       mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]),
> -                       sizeof(unsigned long));
> -       if (!mthp_stats)
> -               return -ENOMEM;
> -
>         err = hugepage_init_sysfs(&hugepage_kobj);
>         if (err)
>                 goto err_sysfs;
> @@ -780,8 +773,6 @@ static int __init hugepage_init(void)
>  err_slab:
>         hugepage_exit_sysfs(hugepage_kobj);
>  err_sysfs:
> -       free_percpu(mthp_stats);
> -       mthp_stats = NULL;
>         return err;
>  }
>  subsys_initcall(hugepage_init);
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 3135b5ca2457..b51becf03d1e 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5840,10 +5840,6 @@ static int page_alloc_cpu_dead(unsigned int cpu)
>          */
>         vm_events_fold_cpu(cpu);
> 
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -       mthp_stats_fold_cpu(cpu);
> -#endif
> -
>         /*
>          * Zero the differential counters of the dead processor
>          * so that the vm statistics are consistent.
> 
> 
> Thanks
> Barry

Patch

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e896ca4760f6..c5beb54b97cb 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -264,6 +264,57 @@  unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
 					  enforce_sysfs, orders);
 }
 
+enum mthp_stat_item {
+	MTHP_STAT_ANON_FAULT_ALLOC,
+	MTHP_STAT_ANON_FAULT_FALLBACK,
+	MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
+	__MTHP_STAT_COUNT
+};
+
+struct mthp_stat {
+	unsigned long stats[0][__MTHP_STAT_COUNT];
+};
+
+extern struct mthp_stat __percpu *mthp_stats;
+
+static inline void count_mthp_stat(int order, enum mthp_stat_item item)
+{
+	if (order <= 0 || order > PMD_ORDER || !mthp_stats)
+		return;
+
+	this_cpu_inc(mthp_stats->stats[order][item]);
+}
+
+static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta)
+{
+	if (order <= 0 || order > PMD_ORDER || !mthp_stats)
+		return;
+
+	this_cpu_add(mthp_stats->stats[order][item], delta);
+}
+
+/*
+ * Fold the foreign cpu mthp stats into our own.
+ *
+ * This is adding to the stats on one processor
+ * but keeps the global counts constant.
+ */
+static inline void mthp_stats_fold_cpu(int cpu)
+{
+	struct mthp_stat *fold_stat;
+	int i, j;
+
+	if (!mthp_stats)
+		return;
+	fold_stat = per_cpu_ptr(mthp_stats, cpu);
+	for (i = 1; i <= PMD_ORDER; i++) {
+		for (j = 0; j < __MTHP_STAT_COUNT; j++) {
+			count_mthp_stats(i, j, fold_stat->stats[i][j]);
+			fold_stat->stats[i][j] = 0;
+		}
+	}
+}
+
 #define transparent_hugepage_use_zero_page()				\
 	(transparent_hugepage_flags &					\
 	 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index dc30139590e6..21c4ac74b484 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -526,6 +526,50 @@  static const struct kobj_type thpsize_ktype = {
 	.sysfs_ops = &kobj_sysfs_ops,
 };
 
+struct mthp_stat __percpu *mthp_stats;
+
+static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
+{
+	unsigned long sum = 0;
+	int cpu;
+
+	cpus_read_lock();
+	for_each_online_cpu(cpu) {
+		struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu);
+
+		sum += this->stats[order][item];
+	}
+	cpus_read_unlock();
+
+	return sum;
+}
+
+#define DEFINE_MTHP_STAT_ATTR(_name, _index)					\
+static ssize_t _name##_show(struct kobject *kobj,			\
+			struct kobj_attribute *attr, char *buf)		\
+{									\
+	int order = to_thpsize(kobj)->order;				\
+									\
+	return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index));	\
+}									\
+static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
+DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
+DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+
+static struct attribute *stats_attrs[] = {
+	&anon_fault_alloc_attr.attr,
+	&anon_fault_fallback_attr.attr,
+	&anon_fault_fallback_charge_attr.attr,
+	NULL,
+};
+
+static struct attribute_group stats_attr_group = {
+	.name = "stats",
+	.attrs = stats_attrs,
+};
+
 static struct thpsize *thpsize_create(int order, struct kobject *parent)
 {
 	unsigned long size = (PAGE_SIZE << order) / SZ_1K;
@@ -549,6 +593,12 @@  static struct thpsize *thpsize_create(int order, struct kobject *parent)
 		return ERR_PTR(ret);
 	}
 
+	ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group);
+	if (ret) {
+		kobject_put(&thpsize->kobj);
+		return ERR_PTR(ret);
+	}
+
 	thpsize->order = order;
 	return thpsize;
 }
@@ -691,6 +741,11 @@  static int __init hugepage_init(void)
 	 */
 	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
 
+	mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]),
+			sizeof(unsigned long));
+	if (!mthp_stats)
+		return -ENOMEM;
+
 	err = hugepage_init_sysfs(&hugepage_kobj);
 	if (err)
 		goto err_sysfs;
@@ -725,6 +780,8 @@  static int __init hugepage_init(void)
 err_slab:
 	hugepage_exit_sysfs(hugepage_kobj);
 err_sysfs:
+	free_percpu(mthp_stats);
+	mthp_stats = NULL;
 	return err;
 }
 subsys_initcall(hugepage_init);
@@ -880,6 +937,8 @@  static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
 		folio_put(folio);
 		count_vm_event(THP_FAULT_FALLBACK);
 		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
+		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
+		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
 		return VM_FAULT_FALLBACK;
 	}
 	folio_throttle_swaprate(folio, gfp);
@@ -929,6 +988,7 @@  static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
 		mm_inc_nr_ptes(vma->vm_mm);
 		spin_unlock(vmf->ptl);
 		count_vm_event(THP_FAULT_ALLOC);
+		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
 		count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
 	}
 
@@ -1050,6 +1110,7 @@  vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
 	if (unlikely(!folio)) {
 		count_vm_event(THP_FAULT_FALLBACK);
+		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
 	}
 	return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
diff --git a/mm/memory.c b/mm/memory.c
index 649a547fe8e3..06048af7cf9a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4368,6 +4368,7 @@  static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 		folio = vma_alloc_folio(gfp, order, vma, addr, true);
 		if (folio) {
 			if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
+				count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
 				folio_put(folio);
 				goto next;
 			}
@@ -4376,6 +4377,7 @@  static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 			return folio;
 		}
 next:
+		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
 		order = next_order(&orders, order);
 	}
 
@@ -4485,6 +4487,7 @@  static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 
 	folio_ref_add(folio, nr_pages - 1);
 	add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
+	count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
 	folio_add_new_anon_rmap(folio, vma, addr);
 	folio_add_lru_vma(folio, vma);
 setpte:
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b51becf03d1e..3135b5ca2457 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5840,6 +5840,10 @@  static int page_alloc_cpu_dead(unsigned int cpu)
 	 */
 	vm_events_fold_cpu(cpu);
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	mthp_stats_fold_cpu(cpu);
+#endif
+
 	/*
 	 * Zero the differential counters of the dead processor
 	 * so that the vm statistics are consistent.
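With the stats group above in place, each hugepages-<size>kB directory gains a
stats/ subdirectory holding anon_fault_alloc, anon_fault_fallback and
anon_fault_fallback_charge. A small sketch of how the counters could be read
from userspace (the /sys/kernel/mm/transparent_hugepage/hugepages-<size>kB
base path is assumed from the existing mTHP sysfs layout, not introduced by
this patch):

#include <glob.h>
#include <stdio.h>

/* Print one per-order counter, if the attribute exists. */
static void dump(const char *dir, const char *name)
{
	char path[512], buf[64];
	FILE *f;

	snprintf(path, sizeof(path), "%s/stats/%s", dir, name);
	f = fopen(path, "r");
	if (!f)
		return;
	if (fgets(buf, sizeof(buf), f))
		printf("%-70s %s", path, buf);
	fclose(f);
}

int main(void)
{
	glob_t g;
	size_t i;

	if (glob("/sys/kernel/mm/transparent_hugepage/hugepages-*kB",
		 0, NULL, &g))
		return 1;

	for (i = 0; i < g.gl_pathc; i++) {
		dump(g.gl_pathv[i], "anon_fault_alloc");
		dump(g.gl_pathv[i], "anon_fault_fallback");
		dump(g.gl_pathv[i], "anon_fault_fallback_charge");
	}
	globfree(&g);
	return 0;
}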