[v2,2/3] mm: changes to split_huge_page() to free zero filled tail pages

Message ID	8d286a7029079c1da0b062fbb045317508409187.1665600372.git.alexlzhu@fb.com (mailing list archive)
State	New
Headers	show Return-Path: <owner-linux-mm@kvack.org> From: <alexlzhu@fb.com> To: <linux-mm@kvack.org>, <kernel-team@fb.com> CC: <willy@infradead.org>, <hannes@cmpxchg.org>, <riel@surriel.com>, Alexander Zhu <alexlzhu@fb.com> Subject: [PATCH v2 2/3] mm: changes to split_huge_page() to free zero filled tail pages Date: Wed, 12 Oct 2022 11:56:07 -0700 Message-ID: <8d286a7029079c1da0b062fbb045317508409187.1665600372.git.alexlzhu@fb.com> In-Reply-To: <cover.1665600372.git.alexlzhu@fb.com> References: <cover.1665600372.git.alexlzhu@fb.com> Content-Type: text/plain Content-Transfer-Encoding: quoted-printable MIME-Version: 1.0 Sender: owner-linux-mm@kvack.org Precedence: bulk
Series	THP Shrinker \| expand [v2,0/3] THP Shrinker [v2,1/3] mm: add thp_utilization metrics to debugfs [v2,2/3] mm: changes to split_huge_page() to free zero filled tail pages [v2,3/3] mm: THP low utilization shrinker

diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bd3504d11b15..3f83bbcf1333 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -428,7 +428,7 @@ int folio_mkclean(struct folio *); int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, struct vm_area_struct *vma); -void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked); +void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked, bool unmap_clean); int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 3518dba1e02f..3618b10ddec9 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -111,6 +111,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD THP_SPLIT_PUD, #endif + THP_SPLIT_FREE, + THP_SPLIT_UNMAP, + THP_SPLIT_REMAP_READONLY_ZERO_PAGE, THP_ZERO_PAGE_ALLOC, THP_ZERO_PAGE_ALLOC_FAILED, THP_SWPOUT, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 29e97df37c29..a08885228cb2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2451,7 +2451,7 @@ static void unmap_folio(struct folio *folio) try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK); } -static void remap_page(struct folio *folio, unsigned long nr) +static void remap_page(struct folio *folio, unsigned long nr, bool unmap_clean) { int i = 0; @@ -2459,7 +2459,7 @@ static void remap_page(struct folio *folio, unsigned long nr) if (!folio_test_anon(folio)) return; for (;;) { - remove_migration_ptes(folio, folio, true); + remove_migration_ptes(folio, folio, true, unmap_clean); i += folio_nr_pages(folio); if (i >= nr) break; @@ -2574,6 +2574,8 @@ static void __split_huge_page(struct page *page, struct list_head *list, struct address_space *swap_cache = NULL; unsigned long offset = 0; unsigned int nr = thp_nr_pages(head); + LIST_HEAD(pages_to_free); + int nr_pages_to_free = 0; int i; /* complete memcg works before add pages to LRU */ @@ -2636,7 +2638,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, } local_irq_enable(); - remap_page(folio, nr); + remap_page(folio, nr, PageAnon(head)); if (PageSwapCache(head)) { swp_entry_t entry = { .val = page_private(head) }; @@ -2650,6 +2652,33 @@ static void __split_huge_page(struct page *page, struct list_head *list, continue; unlock_page(subpage); + /* + * If a tail page has only two references left, one inherited + * from the isolation of its head and the other from + * lru_add_page_tail() which we are about to drop, it means this + * tail page was concurrently zapped. Then we can safely free it + * and save page reclaim or migration the trouble of trying it. + */ + if (list && page_ref_freeze(subpage, 2)) { + VM_BUG_ON_PAGE(PageLRU(subpage), subpage); + VM_BUG_ON_PAGE(PageCompound(subpage), subpage); + VM_BUG_ON_PAGE(page_mapped(subpage), subpage); + + ClearPageActive(subpage); + ClearPageUnevictable(subpage); + list_move(&subpage->lru, &pages_to_free); + nr_pages_to_free++; + continue; + } + /* + * If a tail page has only one reference left, it will be freed + * by the call to free_page_and_swap_cache below. Since zero + * subpages are no longer remapped, there will only be one + * reference left in cases outside of reclaim or migration. + */ + if (page_ref_count(subpage) == 1) + nr_pages_to_free++; + /* * Subpages may be freed if there wasn't any mapping * like if add_to_swap() is running on a lru page that @@ -2659,6 +2688,13 @@ static void __split_huge_page(struct page *page, struct list_head *list, */ free_page_and_swap_cache(subpage); } + + if (!nr_pages_to_free) + return; + + mem_cgroup_uncharge_list(&pages_to_free); + free_unref_page_list(&pages_to_free); + count_vm_events(THP_SPLIT_FREE, nr_pages_to_free); } /* Racy check whether the huge page can be split */ @@ -2830,7 +2866,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (mapping) xas_unlock(&xas); local_irq_enable(); - remap_page(folio, folio_nr_pages(folio)); + remap_page(folio, folio_nr_pages(folio), false); ret = -EBUSY; } diff --git a/mm/migrate.c b/mm/migrate.c index c228afba0963..504ea5d7fd43 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -168,13 +168,62 @@ void putback_movable_pages(struct list_head *l) } } +static bool try_to_unmap_clean(struct page_vma_mapped_walk *pvmw, struct page *page) +{ + void *addr; + bool dirty; + pte_t newpte; + + VM_BUG_ON_PAGE(PageCompound(page), page); + VM_BUG_ON_PAGE(!PageAnon(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page); + + if (PageMlocked(page) || (pvmw->vma->vm_flags & VM_LOCKED)) + return false; + + /* + * The pmd entry mapping the old thp was flushed and the pte mapping + * this subpage has been non present. Therefore, this subpage is + * inaccessible. We don't need to remap it if it contains only zeros. + */ + addr = kmap_local_page(page); + dirty = memchr_inv(addr, 0, PAGE_SIZE); + kunmap_local(addr); + + if (dirty) + return false; + + pte_clear_not_present_full(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, false); + + if (userfaultfd_armed(pvmw->vma)) { + newpte = pte_mkspecial(pfn_pte(page_to_pfn(ZERO_PAGE(pvmw->address)), + pvmw->vma->vm_page_prot)); + ptep_clear_flush(pvmw->vma, pvmw->address, pvmw->pte); + set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, newpte); + dec_mm_counter(pvmw->vma->vm_mm, MM_ANONPAGES); + count_vm_event(THP_SPLIT_REMAP_READONLY_ZERO_PAGE); + return true; + } + + dec_mm_counter(pvmw->vma->vm_mm, mm_counter(page)); + count_vm_event(THP_SPLIT_UNMAP); + return true; +} + +struct rmap_walk_arg { + struct folio *folio; + bool unmap_clean; +}; + /* * Restore a potential migration pte to a working pte entry */ static bool remove_migration_pte(struct folio *folio, - struct vm_area_struct *vma, unsigned long addr, void *old) + struct vm_area_struct *vma, unsigned long addr, void *arg) { - DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION); + struct rmap_walk_arg *rmap_walk_arg = arg; + DEFINE_FOLIO_VMA_WALK(pvmw, rmap_walk_arg->folio, vma, addr, PVMW_SYNC | PVMW_MIGRATION); while (page_vma_mapped_walk(&pvmw)) { rmap_t rmap_flags = RMAP_NONE; @@ -197,6 +246,8 @@ static bool remove_migration_pte(struct folio *folio, continue; } #endif + if (rmap_walk_arg->unmap_clean && try_to_unmap_clean(&pvmw, new)) + continue; folio_get(folio); pte = mk_pte(new, READ_ONCE(vma->vm_page_prot)); @@ -272,13 +323,20 @@ static bool remove_migration_pte(struct folio *folio, * Get rid of all migration entries and replace them by * references to the indicated page. */ -void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked) +void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked, bool unmap_clean) { + struct rmap_walk_arg rmap_walk_arg = { + .folio = src, + .unmap_clean = unmap_clean, + }; + struct rmap_walk_control rwc = { .rmap_one = remove_migration_pte, - .arg = src, + .arg = &rmap_walk_arg, }; + VM_BUG_ON_FOLIO(unmap_clean && src != dst, src); + if (locked) rmap_walk_locked(dst, &rwc); else @@ -866,7 +924,7 @@ static int writeout(struct address_space *mapping, struct folio *folio) * At this point we know that the migration attempt cannot * be successful. */ - remove_migration_ptes(folio, folio, false); + remove_migration_ptes(folio, folio, false, false); rc = mapping->a_ops->writepage(&folio->page, &wbc); @@ -1122,7 +1180,7 @@ static int __unmap_and_move(struct folio *src, struct folio *dst, if (page_was_mapped) remove_migration_ptes(src, - rc == MIGRATEPAGE_SUCCESS ? dst : src, false); + rc == MIGRATEPAGE_SUCCESS ? dst : src, false, false); out_unlock_both: folio_unlock(dst); @@ -1332,7 +1390,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (page_was_mapped) remove_migration_ptes(src, - rc == MIGRATEPAGE_SUCCESS ? dst : src, false); + rc == MIGRATEPAGE_SUCCESS ? dst : src, false, false); unlock_put_anon: folio_unlock(dst); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 5ab6ab9d2ed8..b5160c7ee229 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -426,7 +426,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate) continue; folio = page_folio(page); - remove_migration_ptes(folio, folio, false); + remove_migration_ptes(folio, folio, false, false); migrate->src[i] = 0; folio_unlock(folio); @@ -802,7 +802,7 @@ void migrate_vma_finalize(struct migrate_vma *migrate) src = page_folio(page); dst = page_folio(newpage); - remove_migration_ptes(src, dst, false); + remove_migration_ptes(src, dst, false, false); folio_unlock(src); if (is_zone_device_page(page)) diff --git a/mm/vmstat.c b/mm/vmstat.c index b2371d745e00..3d802eb6754d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1359,6 +1359,9 @@ const char * const vmstat_text[] = { #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD "thp_split_pud", #endif + "thp_split_free", + "thp_split_unmap", + "thp_split_remap_readonly_zero_page", "thp_zero_page_alloc", "thp_zero_page_alloc_failed", "thp_swpout", diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c index 76e1c36dd9e5..de227b88d43d 100644 --- a/tools/testing/selftests/vm/split_huge_page_test.c +++ b/tools/testing/selftests/vm/split_huge_page_test.c @@ -16,6 +16,9 @@ #include <sys/mount.h> #include <malloc.h> #include <stdbool.h> +#include <sys/syscall.h> /* Definition of SYS_* constants */ +#include <linux/userfaultfd.h> +#include <sys/ioctl.h> #include "vm_util.h" uint64_t pagesize; @@ -88,6 +91,113 @@ static void write_debugfs(const char *fmt, ...) } } +static char *allocate_zero_filled_hugepage(size_t len) +{ + char *result; + size_t i; + + result = memalign(pmd_pagesize, len); + if (!result) { + printf("Fail to allocate memory\n"); + exit(EXIT_FAILURE); + } + madvise(result, len, MADV_HUGEPAGE); + + for (i = 0; i < len; i++) + result[i] = (char)0; + + return result; +} + +static void verify_rss_anon_split_huge_page_all_zeroes(char *one_page, int nr_hpages, size_t len) +{ + uint64_t rss_anon_before, rss_anon_after; + size_t i; + + if (!check_huge_anon(one_page, 4, pmd_pagesize)) { + printf("No THP is allocated\n"); + exit(EXIT_FAILURE); + } + + rss_anon_before = rss_anon(); + if (!rss_anon_before) { + printf("No RssAnon is allocated before split\n"); + exit(EXIT_FAILURE); + } + /* split all THPs */ + write_debugfs(PID_FMT, getpid(), (uint64_t)one_page, + (uint64_t)one_page + len); + + for (i = 0; i < len; i++) + if (one_page[i] != (char)0) { + printf("%ld byte corrupted\n", i); + exit(EXIT_FAILURE); + } + + if (!check_huge_anon(one_page, 0, pmd_pagesize)) { + printf("Still AnonHugePages not split\n"); + exit(EXIT_FAILURE); + } + + rss_anon_after = rss_anon(); + if (rss_anon_after >= rss_anon_before) { + printf("Incorrect RssAnon value. Before: %ld After: %ld\n", + rss_anon_before, rss_anon_after); + exit(EXIT_FAILURE); + } +} + +void split_pmd_zero_pages(void) +{ + char *one_page; + int nr_hpages = 4; + size_t len = nr_hpages * pmd_pagesize; + + one_page = allocate_zero_filled_hugepage(len); + verify_rss_anon_split_huge_page_all_zeroes(one_page, nr_hpages, len); + printf("Split zero filled huge pages successful\n"); + free(one_page); +} + +void split_pmd_zero_pages_uffd(void) +{ + char *one_page; + int nr_hpages = 4; + size_t len = nr_hpages * pmd_pagesize; + long uffd; /* userfaultfd file descriptor */ + struct uffdio_api uffdio_api; + struct uffdio_register uffdio_register; + + /* Create and enable userfaultfd object. */ + + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + if (uffd == -1) { + perror("userfaultfd"); + exit(1); + } + + uffdio_api.api = UFFD_API; + uffdio_api.features = 0; + if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) { + perror("ioctl-UFFDIO_API"); + exit(1); + } + + one_page = allocate_zero_filled_hugepage(len); + + uffdio_register.range.start = (unsigned long)one_page; + uffdio_register.range.len = len; + uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) { + perror("ioctl-UFFDIO_REGISTER"); + exit(1); + } + + verify_rss_anon_split_huge_page_all_zeroes(one_page, nr_hpages, len); + printf("Split zero filled huge pages with uffd successful\n"); + free(one_page); +} + void split_pmd_thp(void) { char *one_page; @@ -121,7 +231,6 @@ void split_pmd_thp(void) exit(EXIT_FAILURE); } - if (check_huge_anon(one_page, 0, pmd_pagesize)) { printf("Still AnonHugePages not split\n"); exit(EXIT_FAILURE); @@ -301,6 +410,8 @@ int main(int argc, char **argv) pageshift = ffs(pagesize) - 1; pmd_pagesize = read_pmd_pagesize(); + split_pmd_zero_pages(); + split_pmd_zero_pages_uffd(); split_pmd_thp(); split_pte_mapped_thp(); split_file_backed_thp(); diff --git a/tools/testing/selftests/vm/vm_util.c b/tools/testing/selftests/vm/vm_util.c index f11f8adda521..72f3edc64aaf 100644 --- a/tools/testing/selftests/vm/vm_util.c +++ b/tools/testing/selftests/vm/vm_util.c @@ -6,6 +6,7 @@ #define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" #define SMAP_FILE_PATH "/proc/self/smaps" +#define STATUS_FILE_PATH "/proc/self/status" #define MAX_LINE_LENGTH 500 uint64_t pagemap_get_entry(int fd, char *start) @@ -72,6 +73,28 @@ uint64_t read_pmd_pagesize(void) return strtoul(buf, NULL, 10); } +uint64_t rss_anon(void) +{ + uint64_t rss_anon = 0; + int ret; + FILE *fp; + char buffer[MAX_LINE_LENGTH]; + + fp = fopen(STATUS_FILE_PATH, "r"); + if (!fp) + ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, STATUS_FILE_PATH); + + if (!check_for_pattern(fp, "RssAnon:", buffer, sizeof(buffer))) + goto err_out; + + if (sscanf(buffer, "RssAnon:%10ld kB", &rss_anon) != 1) + ksft_exit_fail_msg("Reading status error\n"); + +err_out: + fclose(fp); + return rss_anon; +} + bool __check_huge(void *addr, char *pattern, int nr_hpages, uint64_t hpage_size) { diff --git a/tools/testing/selftests/vm/vm_util.h b/tools/testing/selftests/vm/vm_util.h index 5c35de454e08..dd1885f66097 100644 --- a/tools/testing/selftests/vm/vm_util.h +++ b/tools/testing/selftests/vm/vm_util.h @@ -1,12 +1,15 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <stdint.h> #include <stdbool.h> +#include <stddef.h> +#include <stdio.h> uint64_t pagemap_get_entry(int fd, char *start); bool pagemap_is_softdirty(int fd, char *start); void clear_softdirty(void); bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len); uint64_t read_pmd_pagesize(void); +uint64_t rss_anon(void); bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size); bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size); bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size);

[v2,2/3] mm: changes to split_huge_page() to free zero filled tail pages

Commit Message

Patch