@@ -304,6 +304,15 @@ To identify what applications are mapping file transparent huge pages, it
is necessary to read ``/proc/PID/smaps`` and count the FileHugeMapped fields
for each mapping.
+The utilization of transparent hugepages can be viewed by reading
+``/sys/kernel/debug/thp_utilization``. The utilization of a THP is defined
+as the ratio of non-zero-filled 4KB pages to the total number of pages in
+the THP. Each line describes one utilization bucket, labelled with the range
+of utilized 4KB pages it covers, and contains the total number of THPs in
+that bucket and the total number of zero-filled 4KB pages summed over all
+THPs in that bucket. The last two lines show the timestamp and duration,
+respectively, of the most recent scan over all of physical memory.
+
Note that reading the smaps file is expensive and reading it
frequently will incur overhead.
@@ -178,6 +178,9 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags);
+/*
+ * THP utilization helpers. thp_number_utilized_pages() counts the base pages
+ * of an anonymous THP that are not zero-filled and returns -1 when @page is
+ * NULL or not an anonymous THP. thp_utilization_bucket() maps such a count to
+ * a utilization bucket index and returns -1 when the count is out of range.
+ */
+int thp_number_utilized_pages(struct page *page);
+int thp_utilization_bucket(int num_utilized_pages);
+
void prep_transhuge_page(struct page *page);
void free_transhuge_page(struct page *page);
@@ -46,6 +46,16 @@
#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>
+/*
+ * The number of utilization buckets THPs will be grouped in
+ * under /sys/kernel/debug/thp_utilization.
+ */
+#define THP_UTIL_BUCKET_NR 10
+/*
+ * The number of hugepage-sized PFN steps (and hence candidate hugepages)
+ * examined on each periodic run of the scanner that generates
+ * /sys/kernel/debug/thp_utilization.
+ */
+#define THP_UTIL_SCAN_SIZE 256
/*
* By default, transparent hugepage support is disabled in order to avoid
* risking an increased memory footprint for applications that are not
@@ -71,6 +81,25 @@ static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;
+/* Periodic worker that scans physical memory for anonymous THPs. */
+static void thp_utilization_workfn(struct work_struct *work);
+static DECLARE_DELAYED_WORK(thp_utilization_work, thp_utilization_workfn);
+
+/* Per-bucket totals accumulated by the utilization scanner. */
+struct thp_scan_info_bucket {
+ /* Number of THPs counted in this bucket. */
+ int nr_thps;
+ /* Zero-filled base pages summed over the THPs in this bucket. */
+ int nr_zero_pages;
+};
+
+/* Scanner state: bucket totals plus the scan position and timing. */
+struct thp_scan_info {
+ struct thp_scan_info_bucket buckets[THP_UTIL_BUCKET_NR];
+ /* Zone currently being scanned. */
+ struct zone *scan_zone;
+ /* Time elapsed between the completion of the last two full passes. */
+ struct timespec64 last_scan_duration;
+ /* Time at which the last full pass over all zones completed. */
+ struct timespec64 last_scan_time;
+ /* Next PFN the scanner will examine. */
+ unsigned long pfn;
+};
+
+/* Results of the last completed pass; read by thp_utilization_show(). */
+static struct thp_scan_info thp_scan_debugfs;
+/* Working state of the pass currently in progress. */
+static struct thp_scan_info thp_scan;
+
bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
bool smaps, bool in_pf, bool enforce_sysfs)
{
@@ -485,6 +514,7 @@ static int __init hugepage_init(void)
if (err)
goto err_slab;
+ schedule_delayed_work(&thp_utilization_work, HZ);
err = register_shrinker(&huge_zero_page_shrinker, "thp-zero");
if (err)
goto err_hzp_shrinker;
@@ -599,6 +629,11 @@ static inline bool is_transparent_hugepage(struct page *page)
page[1].compound_dtor == TRANSHUGE_PAGE_DTOR;
}
+/* True only when @page is a transparent hugepage that is also anonymous. */
+static inline bool is_anon_transparent_hugepage(struct page *page)
+{
+	if (!is_transparent_hugepage(page))
+		return false;
+
+	return PageAnon(page);
+}
+
static unsigned long __thp_get_unmapped_area(struct file *filp,
unsigned long addr, unsigned long len,
loff_t off, unsigned long flags, unsigned long size)
@@ -649,6 +684,49 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
+/*
+ * thp_number_utilized_pages - count the non-zero base pages of an anon THP
+ * @page: candidate page; may be NULL
+ *
+ * Maps each base page of the folio in turn and scans it one unsigned long
+ * at a time, stopping at the first non-zero word.
+ *
+ * Return: HPAGE_PMD_NR minus the number of entirely zero-filled base pages,
+ * or -1 if @page is NULL or is not an anonymous transparent hugepage.
+ *
+ * NOTE(review): no reference or lock is taken on @page in this function, so
+ * the result presumably can race with a concurrent split or free - confirm
+ * that callers tolerate this.
+ */
+int thp_number_utilized_pages(struct page *page)
+{
+	struct folio *folio;
+	unsigned long page_offset, value;
+	int thp_nr_utilized_pages = HPAGE_PMD_NR;
+	int step_size = sizeof(unsigned long);
+	bool is_all_zeroes;
+	void *kaddr;
+	int i;
+
+	if (!page || !is_anon_transparent_hugepage(page))
+		return -1;
+
+	folio = page_folio(page);
+	for (i = 0; i < folio_nr_pages(folio); i++) {
+		/*
+		 * kmap_local_folio() takes a byte offset into the folio, not
+		 * a page index, so convert the index to select base page i.
+		 */
+		kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
+		is_all_zeroes = true;
+		for (page_offset = 0; page_offset < PAGE_SIZE; page_offset += step_size) {
+			value = *(unsigned long *)(kaddr + page_offset);
+			if (value != 0) {
+				is_all_zeroes = false;
+				break;
+			}
+		}
+		if (is_all_zeroes)
+			thp_nr_utilized_pages--;
+
+		kunmap_local(kaddr);
+	}
+	return thp_nr_utilized_pages;
+}
+
+/*
+ * thp_utilization_bucket - map a utilized-page count to a bucket index
+ * @num_utilized_pages: number of non-zero base pages in a THP
+ *
+ * Return: a bucket index in [0, THP_UTIL_BUCKET_NR - 1], or -1 when the
+ * count is negative or larger than HPAGE_PMD_NR.
+ */
+int thp_utilization_bucket(int num_utilized_pages)
+{
+	const int last_bucket = THP_UTIL_BUCKET_NR - 1;
+	int idx;
+
+	if (num_utilized_pages < 0)
+		return -1;
+	if (num_utilized_pages > HPAGE_PMD_NR)
+		return -1;
+
+	/* Scale the count onto the bucket range, rounding down. */
+	idx = num_utilized_pages * THP_UTIL_BUCKET_NR / HPAGE_PMD_NR;
+
+	/* A fully utilized THP would land one past the end; clamp it. */
+	if (idx > last_bucket)
+		idx = last_bucket;
+
+	return idx;
+}
+
static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
struct page *page, gfp_t gfp)
{
@@ -3174,6 +3252,42 @@ static int __init split_huge_pages_debugfs(void)
return 0;
}
late_initcall(split_huge_pages_debugfs);
+
+/*
+ * thp_utilization_show - seq_file show handler for thp_utilization
+ * @seqf: seq_file to print into
+ * @pos: unused
+ *
+ * Prints one line per utilization bucket (the range of utilized pages it
+ * covers, the number of THPs and the number of zero-filled pages), followed
+ * by the time and duration of the last completed scan.
+ *
+ * Return: always 0.
+ */
+static int thp_utilization_show(struct seq_file *seqf, void *pos)
+{
+	int i;
+	int start;
+	int end;
+
+	for (i = 0; i < THP_UTIL_BUCKET_NR; i++) {
+		/*
+		 * thp_utilization_bucket() places n utilized pages in bucket
+		 * n * THP_UTIL_BUCKET_NR / HPAGE_PMD_NR rounded down, so the
+		 * first count landing in bucket i is
+		 * i * HPAGE_PMD_NR / THP_UTIL_BUCKET_NR rounded up.
+		 */
+		start = (i * HPAGE_PMD_NR + THP_UTIL_BUCKET_NR - 1) /
+			THP_UTIL_BUCKET_NR;
+		/* The last bucket also holds fully utilized THPs. */
+		if (i + 1 == THP_UTIL_BUCKET_NR)
+			end = HPAGE_PMD_NR;
+		else
+			end = ((i + 1) * HPAGE_PMD_NR + THP_UTIL_BUCKET_NR - 1) /
+				THP_UTIL_BUCKET_NR - 1;
+
+		seq_printf(seqf, "Utilized[%d-%d]: %d %d\n", start, end,
+			   thp_scan_debugfs.buckets[i].nr_thps,
+			   thp_scan_debugfs.buckets[i].nr_zero_pages);
+	}
+	/* Times are printed as seconds with two decimal places. */
+	seq_printf(seqf, "Last Scan Time: %lu.%02lus\n",
+		   (unsigned long)thp_scan_debugfs.last_scan_time.tv_sec,
+		   (unsigned long)(thp_scan_debugfs.last_scan_time.tv_nsec /
+				   (NSEC_PER_SEC / 100)));
+
+	seq_printf(seqf, "Last Scan Duration: %lu.%02lus\n",
+		   (unsigned long)thp_scan_debugfs.last_scan_duration.tv_sec,
+		   (unsigned long)(thp_scan_debugfs.last_scan_duration.tv_nsec /
+				   (NSEC_PER_SEC / 100)));
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(thp_utilization);
+
+/* Create the thp_utilization file at the root of debugfs. */
+static int __init thp_utilization_debugfs(void)
+{
+	/*
+	 * The file only has a show (read) handler, so it must be readable;
+	 * the previous mode of 0200 marked it write-only.
+	 */
+	debugfs_create_file("thp_utilization", 0400, NULL, NULL,
+			    &thp_utilization_fops);
+	return 0;
+}
+late_initcall(thp_utilization_debugfs);
#endif
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
@@ -3269,3 +3383,91 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
trace_remove_migration_pmd(address, pmd_val(pmde));
}
#endif
+
+/*
+ * thp_scan_next_zone - advance the scanner to the next memory zone
+ *
+ * Moves thp_scan to the following zone and resets the scan position to that
+ * zone's first hugepage-aligned PFN. When there is no following zone, the
+ * scanner wraps to the first zone of the first online node, and the totals
+ * gathered during the finished pass are published to thp_scan_debugfs and
+ * then cleared.
+ */
+static void thp_scan_next_zone(void)
+{
+	struct timespec64 current_time;
+	int i;
+	bool update_debugfs;
+
+	/*
+	 * THP utilization worker thread has reached the end
+	 * of the memory zone. Proceed to the next zone.
+	 */
+	thp_scan.scan_zone = next_zone(thp_scan.scan_zone);
+	update_debugfs = !thp_scan.scan_zone;
+	if (update_debugfs)
+		thp_scan.scan_zone = (first_online_pgdat())->node_zones;
+
+	/*
+	 * Round the start PFN up to a hugepage boundary. PFNs count pages,
+	 * so the mask is built from HPAGE_PMD_NR rather than from the
+	 * hugepage size in bytes.
+	 */
+	thp_scan.pfn = (thp_scan.scan_zone->zone_start_pfn + HPAGE_PMD_NR - 1)
+			& ~((unsigned long)HPAGE_PMD_NR - 1);
+	if (!update_debugfs)
+		return;
+
+	/*
+	 * The worker has scanned through all of physical memory, so update
+	 * the information displayed in /sys/kernel/debug/thp_utilization.
+	 */
+	ktime_get_ts64(&current_time);
+	thp_scan_debugfs.last_scan_duration = timespec64_sub(current_time,
+						thp_scan_debugfs.last_scan_time);
+	thp_scan_debugfs.last_scan_time = current_time;
+
+	for (i = 0; i < THP_UTIL_BUCKET_NR; i++) {
+		thp_scan_debugfs.buckets[i].nr_thps = thp_scan.buckets[i].nr_thps;
+		thp_scan_debugfs.buckets[i].nr_zero_pages = thp_scan.buckets[i].nr_zero_pages;
+		thp_scan.buckets[i].nr_thps = 0;
+		thp_scan.buckets[i].nr_zero_pages = 0;
+	}
+}
+
+/*
+ * thp_util_scan - scan the next chunk of the current zone
+ * @pfn_end: first PFN past the range to scan
+ *
+ * Examines up to THP_UTIL_SCAN_SIZE PFNs, stepping by HPAGE_PMD_NR from
+ * thp_scan.pfn, and adds every anonymous THP found to the bucket totals in
+ * thp_scan. Stops early once the scan position reaches @pfn_end.
+ */
+static void thp_util_scan(unsigned long pfn_end)
+{
+	struct page *page;
+	/* PFNs are unsigned long; an int would truncate them on 64-bit. */
+	unsigned long current_pfn;
+	int bucket, num_utilized_pages;
+	int i;
+
+	/*
+	 * Scan through each memory zone in chunks of THP_UTIL_SCAN_SIZE
+	 * PFNs every second looking for anonymous THPs.
+	 */
+	for (i = 0; i < THP_UTIL_SCAN_SIZE; i++) {
+		current_pfn = thp_scan.pfn;
+		thp_scan.pfn += HPAGE_PMD_NR;
+		if (current_pfn >= pfn_end)
+			return;
+
+		if (!pfn_valid(current_pfn))
+			continue;
+
+		page = pfn_to_page(current_pfn);
+		num_utilized_pages = thp_number_utilized_pages(page);
+		/* A negative bucket means the page is not an anonymous THP. */
+		bucket = thp_utilization_bucket(num_utilized_pages);
+		if (bucket < 0)
+			continue;
+
+		thp_scan.buckets[bucket].nr_thps++;
+		thp_scan.buckets[bucket].nr_zero_pages += (HPAGE_PMD_NR - num_utilized_pages);
+	}
+}
+
+/*
+ * thp_utilization_workfn - delayed-work handler driving the THP scanner
+ * @work: unused
+ *
+ * Each run either scans the next chunk of the current zone or, when the
+ * zone is unpopulated or exhausted, moves on to the next zone. The work
+ * item then re-arms itself to run again after HZ jiffies.
+ */
+static void thp_utilization_workfn(struct work_struct *work)
+{
+	unsigned long pfn_end;
+
+	/* First run: start from the first zone of the first online node. */
+	if (!thp_scan.scan_zone)
+		thp_scan.scan_zone = (first_online_pgdat())->node_zones;
+	/*
+	 * End of the current zone, rounded up to a hugepage boundary. PFNs
+	 * count pages, so the mask is built from HPAGE_PMD_NR rather than
+	 * from the hugepage size in bytes.
+	 */
+	pfn_end = (thp_scan.scan_zone->zone_start_pfn +
+			thp_scan.scan_zone->spanned_pages + HPAGE_PMD_NR - 1)
+			& ~((unsigned long)HPAGE_PMD_NR - 1);
+	/*
+	 * If we have reached the end of the zone or end of physical memory
+	 * move on to the next zone. Otherwise, scan the next PFNs in the
+	 * current zone.
+	 */
+	if (!populated_zone(thp_scan.scan_zone) || thp_scan.pfn >= pfn_end)
+		thp_scan_next_zone();
+	else
+		thp_util_scan(pfn_end);
+
+	schedule_delayed_work(&thp_utilization_work, HZ);
+}