@@ -372,7 +372,7 @@ int folio_mkclean(struct folio *);
int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
struct vm_area_struct *vma);
-void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);
+void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked, bool unmap_clean);
int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
@@ -111,6 +111,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
THP_SPLIT_PUD,
#endif
+ THP_SPLIT_FREE,
+ THP_SPLIT_UNMAP,
+ THP_SPLIT_REMAP_READONLY_ZERO_PAGE,
THP_ZERO_PAGE_ALLOC,
THP_ZERO_PAGE_ALLOC_FAILED,
THP_SWPOUT,
@@ -2442,7 +2442,7 @@ static void unmap_page(struct page *page)
try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
}
-static void remap_page(struct folio *folio, unsigned long nr)
+static void remap_page(struct folio *folio, unsigned long nr, bool unmap_clean)
{
int i = 0;
@@ -2450,7 +2450,7 @@ static void remap_page(struct folio *folio, unsigned long nr)
if (!folio_test_anon(folio))
return;
for (;;) {
- remove_migration_ptes(folio, folio, true);
+ remove_migration_ptes(folio, folio, true, unmap_clean);
i += folio_nr_pages(folio);
if (i >= nr)
break;
@@ -2564,6 +2564,8 @@ static void __split_huge_page(struct page *page, struct list_head *list,
struct address_space *swap_cache = NULL;
unsigned long offset = 0;
unsigned int nr = thp_nr_pages(head);
+ LIST_HEAD(pages_to_free);
+ int nr_pages_to_free = 0;
int i;
/* complete memcg works before add pages to LRU */
@@ -2626,7 +2628,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
}
local_irq_enable();
- remap_page(folio, nr);
+ remap_page(folio, nr, PageAnon(head));
if (PageSwapCache(head)) {
swp_entry_t entry = { .val = page_private(head) };
@@ -2640,6 +2642,33 @@ static void __split_huge_page(struct page *page, struct list_head *list,
continue;
unlock_page(subpage);
+ /*
+ * If a tail page has only two references left, one inherited
+ * from the isolation of its head and the other from
+ * lru_add_page_tail() which we are about to drop, it means this
+ * tail page was concurrently zapped. Then we can safely free it
+ * and save page reclaim or migration the trouble of trying it.
+ */
+ if (list && page_ref_freeze(subpage, 2)) {
+ VM_BUG_ON_PAGE(PageLRU(subpage), subpage);
+ VM_BUG_ON_PAGE(PageCompound(subpage), subpage);
+ VM_BUG_ON_PAGE(page_mapped(subpage), subpage);
+
+ ClearPageActive(subpage);
+ ClearPageUnevictable(subpage);
+ list_move(&subpage->lru, &pages_to_free);
+ nr_pages_to_free++;
+ continue;
+ }
+ /*
+ * If a tail page has only one reference left, it will be freed
+ * by the call to free_page_and_swap_cache below. Since zero
+ * subpages are no longer remapped, there will only be one
+ * reference left in cases outside of reclaim or migration.
+ */
+ if (page_ref_count(subpage) == 1)
+ nr_pages_to_free++;
+
/*
* Subpages may be freed if there wasn't any mapping
* like if add_to_swap() is running on a lru page that
@@ -2649,6 +2678,13 @@ static void __split_huge_page(struct page *page, struct list_head *list,
*/
free_page_and_swap_cache(subpage);
}
+
+ if (!nr_pages_to_free)
+ return;
+
+ mem_cgroup_uncharge_list(&pages_to_free);
+ free_unref_page_list(&pages_to_free);
+ count_vm_events(THP_SPLIT_FREE, nr_pages_to_free);
}
/* Racy check whether the huge page can be split */
@@ -2811,7 +2847,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
if (mapping)
xas_unlock(&xas);
local_irq_enable();
- remap_page(folio, folio_nr_pages(folio));
+ remap_page(folio, folio_nr_pages(folio), false);
ret = -EBUSY;
}
@@ -167,13 +167,62 @@ void putback_movable_pages(struct list_head *l)
}
}
+static bool try_to_unmap_clean(struct page_vma_mapped_walk *pvmw, struct page *page)
+{
+ void *addr;
+ bool dirty;
+ pte_t newpte;
+
+ VM_BUG_ON_PAGE(PageCompound(page), page);
+ VM_BUG_ON_PAGE(!PageAnon(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
+
+ if (PageMlocked(page) || (pvmw->vma->vm_flags & VM_LOCKED)) /* mlocked: never dropped here */
+ return false;
+
+ /*
+ * Decide whether the subpage behind this non-present pte has to be
+ * mapped again at all.
+ *
+ * The pmd entry mapping the old thp was flushed and the pte mapping
+ * this subpage is non-present. Therefore, this subpage is
+ * inaccessible. We don't need to remap it if it contains only zeros.
+ *
+ * Returns false, leaving the pte untouched, when the page holds any
+ * non-zero byte. Returns true once the pte has been dealt with here,
+ * in which case remove_migration_pte() skips its own remapping of
+ * this pte.
+ */
+ addr = kmap_local_page(page);
+ dirty = memchr_inv(addr, 0, PAGE_SIZE); /* true if some byte is not zero */
+ kunmap_local(addr);
+
+ if (dirty)
+ return false;
+
+ pte_clear_not_present_full(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, false);
+
+ if (userfaultfd_armed(pvmw->vma)) { /* install a special pte for the zero page instead */
+ newpte = pte_mkspecial(pfn_pte(page_to_pfn(ZERO_PAGE(pvmw->address)),
+ pvmw->vma->vm_page_prot));
+ ptep_clear_flush(pvmw->vma, pvmw->address, pvmw->pte);
+ set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, newpte);
+ dec_mm_counter(pvmw->vma->vm_mm, MM_ANONPAGES);
+ count_vm_event(THP_SPLIT_REMAP_READONLY_ZERO_PAGE);
+ return true;
+ }
+
+ dec_mm_counter(pvmw->vma->vm_mm, mm_counter(page)); /* pte stays cleared: page no longer counted */
+ count_vm_event(THP_SPLIT_UNMAP);
+ return true;
+}
+
+struct rmap_walk_arg {
+ struct folio *folio; /* folio handed to the page table walk in remove_migration_pte() */
+ bool unmap_clean; /* when true, try_to_unmap_clean() is attempted before remapping a pte */
+};
+
/*
* Restore a potential migration pte to a working pte entry
*/
static bool remove_migration_pte(struct folio *folio,
- struct vm_area_struct *vma, unsigned long addr, void *old)
+ struct vm_area_struct *vma, unsigned long addr, void *arg)
{
- DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
+ struct rmap_walk_arg *rmap_walk_arg = arg;
+ DEFINE_FOLIO_VMA_WALK(pvmw, rmap_walk_arg->folio, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
while (page_vma_mapped_walk(&pvmw)) {
rmap_t rmap_flags = RMAP_NONE;
@@ -196,6 +245,8 @@ static bool remove_migration_pte(struct folio *folio,
continue;
}
#endif
+ if (rmap_walk_arg->unmap_clean && try_to_unmap_clean(&pvmw, new))
+ continue;
folio_get(folio);
pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
@@ -267,13 +318,20 @@ static bool remove_migration_pte(struct folio *folio,
* Get rid of all migration entries and replace them by
* references to the indicated page.
*/
-void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked)
+void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked, bool unmap_clean)
{
+ struct rmap_walk_arg rmap_walk_arg = {
+ .folio = src,
+ .unmap_clean = unmap_clean,
+ };
+
struct rmap_walk_control rwc = {
.rmap_one = remove_migration_pte,
- .arg = src,
+ .arg = &rmap_walk_arg,
};
+ VM_BUG_ON_FOLIO(unmap_clean && src != dst, src);
+
if (locked)
rmap_walk_locked(dst, &rwc);
else
@@ -849,7 +907,7 @@ static int writeout(struct address_space *mapping, struct folio *folio)
* At this point we know that the migration attempt cannot
* be successful.
*/
- remove_migration_ptes(folio, folio, false);
+ remove_migration_ptes(folio, folio, false, false);
rc = mapping->a_ops->writepage(&folio->page, &wbc);
@@ -1108,7 +1166,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
if (page_was_mapped)
remove_migration_ptes(folio,
- rc == MIGRATEPAGE_SUCCESS ? dst : folio, false);
+ rc == MIGRATEPAGE_SUCCESS ? dst : folio, false, false);
out_unlock_both:
unlock_page(newpage);
@@ -1318,7 +1376,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
if (page_was_mapped)
remove_migration_ptes(src,
- rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
+ rc == MIGRATEPAGE_SUCCESS ? dst : src, false, false);
unlock_put_anon:
unlock_page(new_hpage);
@@ -413,7 +413,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
continue;
folio = page_folio(page);
- remove_migration_ptes(folio, folio, false);
+ remove_migration_ptes(folio, folio, false, false);
migrate->src[i] = 0;
folio_unlock(folio);
@@ -789,7 +789,7 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
src = page_folio(page);
dst = page_folio(newpage);
- remove_migration_ptes(src, dst, false);
+ remove_migration_ptes(src, dst, false, false);
folio_unlock(src);
if (is_zone_device_page(page))
@@ -1370,6 +1370,9 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
"thp_split_pud",
#endif
+ "thp_split_free",
+ "thp_split_unmap",
+ "thp_split_remap_readonly_zero_page",
"thp_zero_page_alloc",
"thp_zero_page_alloc_failed",
"thp_swpout",
@@ -16,6 +16,9 @@
#include <sys/mount.h>
#include <malloc.h>
#include <stdbool.h>
+#include <sys/syscall.h> /* Definition of SYS_* constants */
+#include <linux/userfaultfd.h>
+#include <sys/ioctl.h>
#include "vm_util.h"
uint64_t pagesize;
@@ -88,6 +91,113 @@ static void write_debugfs(const char *fmt, ...)
}
}
+/*
+ * Allocate len bytes aligned to pmd_pagesize, advise the kernel to back
+ * them with huge pages, and store a zero into every byte.
+ *
+ * Prints a message and exits with EXIT_FAILURE if the allocation fails.
+ * The caller owns the returned buffer and releases it with free().
+ */
+static char *allocate_zero_filled_hugepage(size_t len)
+{
+	char *buf = memalign(pmd_pagesize, len);
+	char *cur, *end;
+
+	if (buf == NULL) {
+		printf("Fail to allocate memory\n");
+		exit(EXIT_FAILURE);
+	}
+
+	madvise(buf, len, MADV_HUGEPAGE);
+
+	/* Write each byte explicitly so that every page in the range is touched. */
+	end = buf + len;
+	for (cur = buf; cur < end; cur++)
+		*cur = (char)0;
+
+	return buf;
+}
+
+/*
+ * Split the THPs backing [one_page, one_page + len) and check the result.
+ *
+ * On entry check_huge() must report a huge page at one_page and RssAnon
+ * must be non-zero. After the split is requested through write_debugfs(),
+ * verify that every byte of the range still reads as zero, that
+ * check_huge() no longer reports a huge page at one_page, and that
+ * RssAnon is strictly smaller than before the split.
+ *
+ * Any failed check prints a message and exits with EXIT_FAILURE.
+ */
+static void verify_rss_anon_split_huge_page_all_zeroes(char *one_page, size_t len)
+{
+	uint64_t thp_size, rss_anon_before, rss_anon_after;
+	size_t i;
+
+	thp_size = check_huge(one_page);
+	if (!thp_size) {
+		printf("No THP is allocated\n");
+		exit(EXIT_FAILURE);
+	}
+
+	rss_anon_before = rss_anon();
+	if (!rss_anon_before) {
+		printf("No RssAnon is allocated before split\n");
+		exit(EXIT_FAILURE);
+	}
+
+	/* split all THPs */
+	write_debugfs(PID_FMT, getpid(), (uint64_t)one_page,
+		      (uint64_t)one_page + len);
+
+	for (i = 0; i < len; i++) {
+		if (one_page[i] != (char)0) {
+			/* size_t takes %zu; %ld is a mismatched conversion. */
+			printf("%zu byte corrupted\n", i);
+			exit(EXIT_FAILURE);
+		}
+	}
+
+	/*
+	 * uint64_t is not guaranteed to be a long, so the values below are
+	 * cast to unsigned long long and printed with %llu.
+	 */
+	thp_size = check_huge(one_page);
+	if (thp_size) {
+		printf("Still %llu kB AnonHugePages not split\n",
+		       (unsigned long long)thp_size);
+		exit(EXIT_FAILURE);
+	}
+
+	rss_anon_after = rss_anon();
+	if (rss_anon_after >= rss_anon_before) {
+		printf("Incorrect RssAnon value. Before: %llu After: %llu\n",
+		       (unsigned long long)rss_anon_before,
+		       (unsigned long long)rss_anon_after);
+		exit(EXIT_FAILURE);
+	}
+}
+
+/*
+ * Test case: allocate four pmd_pagesize worth of zero filled memory,
+ * split it, and run the zero-page/RssAnon checks on the result.
+ */
+void split_pmd_zero_pages(void)
+{
+	const size_t nr_bytes = 4 * pmd_pagesize;
+	char *zero_buf = allocate_zero_filled_hugepage(nr_bytes);
+
+	verify_rss_anon_split_huge_page_all_zeroes(zero_buf, nr_bytes);
+	printf("Split zero filled huge pages successful\n");
+	free(zero_buf);
+}
+
+/*
+ * Test case: same as split_pmd_zero_pages(), but the zero filled range
+ * is registered with a userfaultfd object (write-protect mode) before
+ * the split is requested and checked.
+ *
+ * Any setup failure is reported with perror() and ends the test with
+ * EXIT_FAILURE.
+ */
+void split_pmd_zero_pages_uffd(void)
+{
+	char *one_page;
+	size_t len = 4 * pmd_pagesize;
+	long uffd;		/* userfaultfd file descriptor */
+	struct uffdio_api uffdio_api;
+	struct uffdio_register uffdio_register;
+
+	/* Create and enable userfaultfd object. */
+	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+	if (uffd == -1) {
+		perror("userfaultfd");
+		exit(EXIT_FAILURE);
+	}
+
+	uffdio_api.api = UFFD_API;
+	uffdio_api.features = 0;
+	if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) {
+		perror("ioctl-UFFDIO_API");
+		exit(EXIT_FAILURE);
+	}
+
+	one_page = allocate_zero_filled_hugepage(len);
+
+	uffdio_register.range.start = (unsigned long)one_page;
+	uffdio_register.range.len = len;
+	uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
+	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) {
+		perror("ioctl-UFFDIO_REGISTER");
+		exit(EXIT_FAILURE);
+	}
+
+	verify_rss_anon_split_huge_page_all_zeroes(one_page, len);
+	printf("Split zero filled huge pages with uffd successful\n");
+
+	/*
+	 * Release the descriptor before handing the memory back to the
+	 * allocator; otherwise it stays open, with the range still
+	 * registered, for the remaining test cases.
+	 */
+	close(uffd);
+	free(one_page);
+}
+
void split_pmd_thp(void)
{
char *one_page;
@@ -123,7 +233,6 @@ void split_pmd_thp(void)
exit(EXIT_FAILURE);
}
-
thp_size = check_huge(one_page);
if (thp_size) {
printf("Still %ld kB AnonHugePages not split\n", thp_size);
@@ -305,6 +414,8 @@ int main(int argc, char **argv)
pageshift = ffs(pagesize) - 1;
pmd_pagesize = read_pmd_pagesize();
+ split_pmd_zero_pages();
+ split_pmd_zero_pages_uffd();
split_pmd_thp();
split_pte_mapped_thp();
split_file_backed_thp();
@@ -6,6 +6,7 @@
#define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
#define SMAP_FILE_PATH "/proc/self/smaps"
+#define STATUS_FILE_PATH "/proc/self/status"
#define MAX_LINE_LENGTH 500
uint64_t pagemap_get_entry(int fd, char *start)
@@ -72,6 +73,28 @@ uint64_t read_pmd_pagesize(void)
return strtoul(buf, NULL, 10);
}
+/*
+ * Return the value, in kB, found on the "RssAnon:" line of
+ * STATUS_FILE_PATH.
+ *
+ * Returns 0 when check_for_pattern() finds no such line. A failure to
+ * open the file or to parse the line is reported through
+ * ksft_exit_fail_msg().
+ */
+uint64_t rss_anon(void)
+{
+	unsigned long long rss_kb = 0;
+	FILE *fp;
+	char buffer[MAX_LINE_LENGTH];
+
+	fp = fopen(STATUS_FILE_PATH, "r");
+	if (!fp)
+		ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, STATUS_FILE_PATH);
+
+	if (!check_for_pattern(fp, "RssAnon:", buffer))
+		goto err_out;
+
+	/*
+	 * Scan into an unsigned long long with the matching %llu conversion:
+	 * uint64_t is not guaranteed to be a long, so "%ld" with a uint64_t
+	 * pointer was a mismatched (and signed) conversion.
+	 */
+	if (sscanf(buffer, "RssAnon:%10llu kB", &rss_kb) != 1)
+		ksft_exit_fail_msg("Reading status error\n");
+
+err_out:
+	fclose(fp);
+	return rss_kb;
+}
+
uint64_t check_huge(void *addr)
{
uint64_t thp = 0;
@@ -6,4 +6,5 @@ uint64_t pagemap_get_entry(int fd, char *start);
bool pagemap_is_softdirty(int fd, char *start);
void clear_softdirty(void);
uint64_t read_pmd_pagesize(void);
+uint64_t rss_anon(void);
uint64_t check_huge(void *addr);