[RFC,v1,1/4] mm: mTHP user controls to configure pagecache large folio sizes

Message ID 20240717071257.4141363-2-ryan.roberts@arm.com
State New
Series Control folio sizes used for page cache memory

Commit Message

Ryan Roberts July 17, 2024, 7:12 a.m. UTC
Add mTHP controls to sysfs to allow user space to configure the folio
sizes that can be considered for allocation of file-backed memory:

  /sys/kernel/mm/transparent_hugepage/hugepages-*kB/file_enabled

For now, the control can be set to either `always` or `never` to enable
or disable that size. More options may be added in future.
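
For example, to restrict the page cache to 64K folios (in addition to
the implicit order-0 fallback, which is always allowed) on a system
that exposes a hugepages-64kB directory, something like the following
could be used:

  cd /sys/kernel/mm/transparent_hugepage
  # Illustrative only; the available hugepages-<size>kB directories
  # vary by system.
  for d in hugepages-*kB; do echo never > $d/file_enabled; done
  echo always > hugepages-64kB/file_enabled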

By default, at boot, all folio sizes are enabled and the algorithm used
to select a folio size remains conceptually unchanged: increase by 2
enabled orders each time a readahead marker is hit, then reduce to the
closest enabled order that fits within the bounds of the readahead
size, index alignment and EOF. So when all folio sizes are enabled,
behavior should be unchanged. When a folio size is disabled, the
algorithm will never select it.
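
For example, with 4K base pages, if only order-0 (4K), order-2 (16K)
and order-4 (64K) were enabled, hitting a readahead marker while at
order-0 would ramp straight to order-4; if the readahead size or index
alignment only permitted up to order-3, the closest enabled order,
order-2, would be selected instead.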

Systems such as Android are always under extreme memory pressure, and
as a result fragmentation often causes attempts to allocate large
folios to fail and fall back to smaller folios. By fixing the pagecache
to one large folio size (e.g. 64K) plus a fallback to small folios, a
large source of this fragmentation can be removed and 64K mTHP
allocations succeed more often, allowing the system to benefit from
improved performance on arm64 and other arches that support "contpte".

Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
---
 Documentation/admin-guide/mm/transhuge.rst | 21 +++++++++
 include/linux/huge_mm.h                    | 50 +++++++++++++---------
 mm/filemap.c                               | 15 ++++---
 mm/huge_memory.c                           | 43 +++++++++++++++++++
 mm/readahead.c                             | 43 +++++++++++++++----
 5 files changed, 138 insertions(+), 34 deletions(-)

Patch

diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index d4857e457add..9f3ed504c646 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -284,6 +284,27 @@  that THP is shared. Exceeding the number would block the collapse::
 
 A higher value may increase memory footprint for some workloads.
 
+File-Backed Hugepages
+---------------------
+
+The kernel will automatically select an appropriate THP size for file-backed
+memory from a set of allowed sizes. By default all THP sizes that the page cache
+supports are allowed, but this set can be modified with one of::
+
+	echo always >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/file_enabled
+	echo never >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/file_enabled
+
+where <size> is the hugepage size being addressed, the available sizes for which
+vary by system. ``always`` adds the hugepage size to the set of allowed sizes,
+and ``never`` removes the hugepage size from the set of allowed sizes.
+
+In some situations, constraining the allowed sizes can reduce memory
+fragmentation, resulting in fewer allocation fallbacks and improved system
+performance.
+
+Note that any changes to the allowed set of sizes only apply to future
+file-backed THP allocations.
+
 Boot parameter
 ==============
 
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 4f9109fcdded..19ced8192d39 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -114,6 +114,24 @@  extern struct kobj_attribute thpsize_shmem_enabled_attr;
 #define HPAGE_PUD_MASK	(~(HPAGE_PUD_SIZE - 1))
 #define HPAGE_PUD_SIZE	((1UL) << HPAGE_PUD_SHIFT)
 
+static inline int lowest_order(unsigned long orders)
+{
+	if (orders)
+		return __ffs(orders);
+	return -1;
+}
+
+static inline int highest_order(unsigned long orders)
+{
+	return fls_long(orders) - 1;
+}
+
+static inline int next_order(unsigned long *orders, int prev)
+{
+	*orders &= ~BIT(prev);
+	return highest_order(*orders);
+}
+
 enum mthp_stat_item {
 	MTHP_STAT_ANON_FAULT_ALLOC,
 	MTHP_STAT_ANON_FAULT_FALLBACK,
@@ -158,6 +176,12 @@  extern unsigned long transparent_hugepage_flags;
 extern unsigned long huge_anon_orders_always;
 extern unsigned long huge_anon_orders_madvise;
 extern unsigned long huge_anon_orders_inherit;
+extern unsigned long huge_file_orders_always;
+
+static inline unsigned long file_orders_always(void)
+{
+	return READ_ONCE(huge_file_orders_always);
+}
 
 static inline bool hugepage_global_enabled(void)
 {
@@ -172,17 +196,6 @@  static inline bool hugepage_global_always(void)
 			(1<<TRANSPARENT_HUGEPAGE_FLAG);
 }
 
-static inline int highest_order(unsigned long orders)
-{
-	return fls_long(orders) - 1;
-}
-
-static inline int next_order(unsigned long *orders, int prev)
-{
-	*orders &= ~BIT(prev);
-	return highest_order(*orders);
-}
-
 /*
  * Do the below checks:
  *   - For file vma, check if the linear page offset of vma is
@@ -435,6 +448,11 @@  bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
 
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+static inline unsigned long file_orders_always(void)
+{
+	return 0;
+}
+
 static inline bool folio_test_pmd_mappable(struct folio *folio)
 {
 	return false;
@@ -578,16 +596,6 @@  static inline bool thp_migration_supported(void)
 {
 	return false;
 }
-
-static inline int highest_order(unsigned long orders)
-{
-	return 0;
-}
-
-static inline int next_order(unsigned long *orders, int prev)
-{
-	return 0;
-}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 static inline int split_folio_to_list_to_order(struct folio *folio,
diff --git a/mm/filemap.c b/mm/filemap.c
index 131d514fca29..870016fcfdde 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1922,6 +1922,7 @@  struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 no_page:
 	if (!folio && (fgp_flags & FGP_CREAT)) {
 		unsigned order = FGF_GET_ORDER(fgp_flags);
+		unsigned long orders;
 		int err;
 
 		if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
@@ -1937,13 +1938,15 @@  struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 
 		if (!mapping_large_folio_support(mapping))
 			order = 0;
-		if (order > MAX_PAGECACHE_ORDER)
-			order = MAX_PAGECACHE_ORDER;
+
+		orders = file_orders_always() | BIT(0);
+		orders &= BIT(order + 1) - 1;
 		/* If we're not aligned, allocate a smaller folio */
 		if (index & ((1UL << order) - 1))
-			order = __ffs(index);
+			orders &= BIT(__ffs(index) + 1) - 1;
+		order = highest_order(orders);
 
-		do {
+		while (orders) {
 			gfp_t alloc_gfp = gfp;
 
 			err = -ENOMEM;
@@ -1962,7 +1965,9 @@  struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 				break;
 			folio_put(folio);
 			folio = NULL;
-		} while (order-- > 0);
+
+			order = next_order(&orders, order);
+		}
 
 		if (err == -EEXIST)
 			goto repeat;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 26d558e3e80f..e8fe28fe9cf9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -80,6 +80,7 @@  unsigned long huge_zero_pfn __read_mostly = ~0UL;
 unsigned long huge_anon_orders_always __read_mostly;
 unsigned long huge_anon_orders_madvise __read_mostly;
 unsigned long huge_anon_orders_inherit __read_mostly;
+unsigned long huge_file_orders_always __read_mostly;
 
 unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
 					 unsigned long vm_flags,
@@ -525,6 +526,37 @@  static ssize_t anon_enabled_store(struct kobject *kobj,
 	return ret;
 }
 
+static ssize_t file_enabled_show(struct kobject *kobj,
+				 struct kobj_attribute *attr, char *buf)
+{
+	int order = to_thpsize(kobj)->order;
+	const char *output;
+
+	if (test_bit(order, &huge_file_orders_always))
+		output = "[always] never";
+	else
+		output = "always [never]";
+
+	return sysfs_emit(buf, "%s\n", output);
+}
+
+static ssize_t file_enabled_store(struct kobject *kobj,
+				  struct kobj_attribute *attr,
+				  const char *buf, size_t count)
+{
+	int order = to_thpsize(kobj)->order;
+	ssize_t ret = count;
+
+	if (sysfs_streq(buf, "always"))
+		set_bit(order, &huge_file_orders_always);
+	else if (sysfs_streq(buf, "never"))
+		clear_bit(order, &huge_file_orders_always);
+	else
+		ret = -EINVAL;
+
+	return ret;
+}
+
 static struct kobj_attribute anon_enabled_attr =
 	__ATTR(enabled, 0644, anon_enabled_show, anon_enabled_store);
 
@@ -537,7 +569,11 @@  static const struct attribute_group anon_ctrl_attr_grp = {
 	.attrs = anon_ctrl_attrs,
 };
 
+static struct kobj_attribute file_enabled_attr =
+	__ATTR(file_enabled, 0644, file_enabled_show, file_enabled_store);
+
 static struct attribute *file_ctrl_attrs[] = {
+	&file_enabled_attr.attr,
 #ifdef CONFIG_SHMEM
 	&thpsize_shmem_enabled_attr.attr,
 #endif
@@ -712,6 +748,13 @@  static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
 	 */
 	huge_anon_orders_inherit = BIT(PMD_ORDER);
 
+	/*
+	 * For pagecache, default to enabling all orders. powerpc's PMD_ORDER
+	 * (and therefore THP_ORDERS_ALL_FILE_DEFAULT) isn't a compile-time
+	 * constant so we have to do this here.
+	 */
+	huge_file_orders_always = THP_ORDERS_ALL_FILE_DEFAULT;
+
 	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
 	if (unlikely(!*hugepage_kobj)) {
 		pr_err("failed to create transparent hugepage kobject\n");
diff --git a/mm/readahead.c b/mm/readahead.c
index 517c0be7ce66..e05f85974396 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -432,6 +432,34 @@  static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
 	return 0;
 }
 
+static int select_new_order(int old_order, int max_order, unsigned long orders)
+{
+	unsigned long hi_orders, lo_orders;
+
+	/*
+	 * Select the next order to use from the set in `orders`, while ensuring
+	 * we don't go above max_order. Prefer the second lowest allowed order
+	 * above old_order; skipping one level is intended to ramp up to large
+	 * folios quickly. If only one allowed order lies above old_order,
+	 * return that one. If no allowed order lies above old_order, return
+	 * the highest allowed order less than or equal to old_order.
+	 */
+
+	orders &= BIT(max_order + 1) - 1;
+	VM_WARN_ON(!orders);
+	hi_orders = orders & ~(BIT(old_order + 1) - 1);
+
+	if (hi_orders) {
+		old_order = lowest_order(hi_orders);
+		hi_orders &= ~BIT(old_order);
+		if (hi_orders)
+			return lowest_order(hi_orders);
+	}
+
+	lo_orders = orders & (BIT(old_order + 1) - 1);
+	return highest_order(lo_orders);
+}
+
 void page_cache_ra_order(struct readahead_control *ractl,
 		struct file_ra_state *ra, unsigned int new_order)
 {
@@ -443,17 +471,15 @@  void page_cache_ra_order(struct readahead_control *ractl,
 	unsigned int nofs;
 	int err = 0;
 	gfp_t gfp = readahead_gfp_mask(mapping);
+	unsigned long orders;
 
-	if (!mapping_large_folio_support(mapping) || ra->size < 4)
+	if (!mapping_large_folio_support(mapping))
 		goto fallback;
 
 	limit = min(limit, index + ra->size - 1);
 
-	if (new_order < MAX_PAGECACHE_ORDER)
-		new_order += 2;
-
-	new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order);
-	new_order = min_t(unsigned int, new_order, ilog2(ra->size));
+	orders = file_orders_always() | BIT(0);
+	new_order = select_new_order(new_order, ilog2(ra->size), orders);
 
 	/* See comment in page_cache_ra_unbounded() */
 	nofs = memalloc_nofs_save();
@@ -463,9 +489,10 @@  void page_cache_ra_order(struct readahead_control *ractl,
 
 		/* Align with smaller pages if needed */
 		if (index & ((1UL << order) - 1))
-			order = __ffs(index);
+			order = select_new_order(order, __ffs(index), orders);
 		/* Don't allocate pages past EOF */
-		while (index + (1UL << order) - 1 > limit)
+		while (index + (1UL << order) - 1 > limit ||
+				(BIT(order) & orders) == 0)
 			order--;
 		err = ra_alloc_folio(ractl, index, mark, order, gfp);
 		if (err)