
[v2,11/16] mm: multigenerational lru: aging

Message ID: 20210413065633.2782273-12-yuzhao@google.com
State: New, archived
Series: Multigenerational LRU Framework

Commit Message

Yu Zhao April 13, 2021, 6:56 a.m. UTC
The aging produces young generations. Given an lruvec, the aging walks
the mm_struct list associated with this lruvec to scan page tables for
referenced pages. Upon finding one, the aging updates the generation
number of that page to max_seq. After each round of scan, the aging
increments max_seq. The aging is due when both elements of min_seq[2]
reach max_seq-1, assuming both anon and file types are reclaimable.
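
In pseudo-code, one aging round boils down to the following (condensed
from walk_mm_list() below; aging_round() is an illustrative name, not a
function in this patch, and error handling is omitted):

static bool aging_round(struct lruvec *lruvec, struct mm_walk_args *args,
			int swappiness)
{
	struct mm_struct *mm = NULL;
	bool last;

	do {
		/* next mm_struct on the list associated with this lruvec */
		last = get_next_mm(args, swappiness, &mm);
		if (mm)
			/* scan its page tables; referenced pages have their
			 * generation numbers updated to max_seq */
			walk_mm(args, swappiness, mm);
		cond_resched();
	} while (mm);

	/* the walker that finishes the list starts a new generation */
	if (last)
		inc_max_seq(lruvec);

	return last;
}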

The aging uses the following optimizations when scanning page tables:
  1) It will not scan page tables from processes that have been
  sleeping since the last scan.
  2) It will not scan PTE tables under non-leaf PMD entries that do
  not have the accessed bit set, when
  CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG=y (see the sketch after this
  list).
  3) It will not zigzag between the PGD table and the same PMD or PTE
  table spanning multiple VMAs. In other words, it finishes all the
  VMAs within the range of the same PMD or PTE table before it
  returns to the PGD table. This optimizes workloads that have large
  numbers of tiny VMAs, especially when CONFIG_PGTABLE_LEVELS=5.
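
Optimization 2 amounts to a check of the following form in the
PMD-range walker (adapted from walk_pmd_range_unlocked() below, where
val is the PMD entry read for the current iteration):

#ifdef CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG
	/*
	 * The architecture sets the accessed bit in non-leaf PMD entries
	 * as well. If it is clear, no PTE in the table below has been
	 * referenced since the bit was last cleared, so the entire PTE
	 * table can be skipped.
	 */
	if (!pmd_young(val)) {
		args->mm_stats[MM_NONLEAF_OLD]++;
		continue;
	}
#endif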

Signed-off-by: Yu Zhao <yuzhao@google.com>
---
 mm/vmscan.c | 700 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 700 insertions(+)

Patch

diff --git a/mm/vmscan.c b/mm/vmscan.c
index d67dfd1e3930..31e1b4155677 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -50,6 +50,7 @@ 
 #include <linux/dax.h>
 #include <linux/psi.h>
 #include <linux/memory.h>
+#include <linux/pagewalk.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -4771,6 +4772,702 @@  static bool get_next_mm(struct mm_walk_args *args, int swappiness, struct mm_str
 	return last;
 }
 
+/******************************************************************************
+ *                          the aging
+ ******************************************************************************/
+
+static void update_batch_size(struct page *page, int old_gen, int new_gen,
+			      struct mm_walk_args *args)
+{
+	int file = page_is_file_lru(page);
+	int zone = page_zonenum(page);
+	int delta = thp_nr_pages(page);
+
+	VM_BUG_ON(old_gen >= MAX_NR_GENS);
+	VM_BUG_ON(new_gen >= MAX_NR_GENS);
+
+	args->batch_size++;
+
+	args->nr_pages[old_gen][file][zone] -= delta;
+	args->nr_pages[new_gen][file][zone] += delta;
+}
+
+static void reset_batch_size(struct lruvec *lruvec, struct mm_walk_args *args)
+{
+	int gen, file, zone;
+	struct lrugen *lrugen = &lruvec->evictable;
+
+	args->batch_size = 0;
+
+	spin_lock_irq(&lruvec->lru_lock);
+
+	for_each_gen_type_zone(gen, file, zone) {
+		enum lru_list lru = LRU_FILE * file;
+		int total = args->nr_pages[gen][file][zone];
+
+		if (!total)
+			continue;
+
+		args->nr_pages[gen][file][zone] = 0;
+		WRITE_ONCE(lrugen->sizes[gen][file][zone],
+			   lrugen->sizes[gen][file][zone] + total);
+
+		if (lru_gen_is_active(lruvec, gen))
+			lru += LRU_ACTIVE;
+		update_lru_size(lruvec, lru, zone, total);
+	}
+
+	spin_unlock_irq(&lruvec->lru_lock);
+}
+
+static int page_update_gen(struct page *page, int new_gen)
+{
+	int old_gen;
+	unsigned long old_flags, new_flags;
+
+	VM_BUG_ON(new_gen >= MAX_NR_GENS);
+
+	do {
+		old_flags = READ_ONCE(page->flags);
+
+		old_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+		if (old_gen < 0)
+			new_flags = old_flags | BIT(PG_referenced);
+		else
+			new_flags = (old_flags & ~(LRU_GEN_MASK | LRU_USAGE_MASK |
+				     LRU_TIER_FLAGS)) | ((new_gen + 1UL) << LRU_GEN_PGOFF);
+
+		if (old_flags == new_flags)
+			break;
+	} while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+
+	return old_gen;
+}
+
+static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *walk)
+{
+	struct vm_area_struct *vma = walk->vma;
+	struct mm_walk_args *args = walk->private;
+
+	if (!vma_is_accessible(vma) || is_vm_hugetlb_page(vma) ||
+	    (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)))
+		return true;
+
+	if (vma_is_anonymous(vma))
+		return !args->should_walk[0];
+
+	if (vma_is_shmem(vma))
+		return !args->should_walk[0] ||
+		       mapping_unevictable(vma->vm_file->f_mapping);
+
+	return !args->should_walk[1] || vma_is_dax(vma) ||
+	       vma == get_gate_vma(vma->vm_mm) ||
+	       mapping_unevictable(vma->vm_file->f_mapping);
+}
+
+/*
+ * Some userspace memory allocators create many single-page VMAs. So instead of
+ * returning to the PGD table for each such VMA, we finish at least an entire
+ * PMD table and therefore avoid many zigzags. This optimizes page table walks
+ * for workloads that have large numbers of tiny VMAs.
+ *
+ * We scan PMD tables in two passes. The first pass reaches PTE tables and
+ * doesn't take the PMD lock. The second pass clears the accessed bit on PMD
+ * entries and needs to take the PMD lock. The second pass is only done on the
+ * PMD entries on which the first pass found the accessed bit set, and they
+ * must be:
+ *   1) leaf entries mapping huge pages from the node under reclaim
+ *   2) non-leaf entries whose leaf entries only map pages from the node under
+ *   reclaim, when CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG=y.
+ */
+static bool get_next_interval(struct mm_walk *walk, unsigned long mask, unsigned long size,
+			      unsigned long *start, unsigned long *end)
+{
+	unsigned long next = round_up(*end, size);
+	struct mm_walk_args *args = walk->private;
+
+	VM_BUG_ON(mask & size);
+	VM_BUG_ON(*start != *end);
+	VM_BUG_ON(!(*end & ~mask));
+	VM_BUG_ON((*end & mask) != (next & mask));
+
+	while (walk->vma) {
+		if (next >= walk->vma->vm_end) {
+			walk->vma = walk->vma->vm_next;
+			continue;
+		}
+
+		if ((next & mask) != (walk->vma->vm_start & mask))
+			return false;
+
+		if (next <= walk->vma->vm_start &&
+		    should_skip_vma(walk->vma->vm_start, walk->vma->vm_end, walk)) {
+			walk->vma = walk->vma->vm_next;
+			continue;
+		}
+
+		args->mm_stats[MM_VMA_INTERVAL]++;
+
+		*start = max(next, walk->vma->vm_start);
+		next = (next | ~mask) + 1;
+		/* rounded-up boundaries can wrap to 0 */
+		*end = next && next < walk->vma->vm_end ? next : walk->vma->vm_end;
+
+		return true;
+	}
+
+	return false;
+}
+
+static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
+			   struct mm_walk *walk)
+{
+	int i;
+	pte_t *pte;
+	spinlock_t *ptl;
+	int remote = 0;
+	struct mm_walk_args *args = walk->private;
+	int old_gen, new_gen = lru_gen_from_seq(args->max_seq);
+
+	VM_BUG_ON(pmd_leaf(*pmd));
+
+	pte = pte_offset_map_lock(walk->mm, pmd, start & PMD_MASK, &ptl);
+	arch_enter_lazy_mmu_mode();
+restart:
+	for (i = pte_index(start); start != end; i++, start += PAGE_SIZE) {
+		struct page *page;
+		unsigned long pfn = pte_pfn(pte[i]);
+
+		if (!pte_present(pte[i]) || is_zero_pfn(pfn)) {
+			args->mm_stats[MM_LEAF_HOLE]++;
+			continue;
+		}
+
+		if (!pte_young(pte[i])) {
+			args->mm_stats[MM_LEAF_OLD]++;
+			continue;
+		}
+
+		if (pfn < args->start_pfn || pfn >= args->end_pfn) {
+			remote++;
+			args->mm_stats[MM_LEAF_OTHER_NODE]++;
+			continue;
+		}
+
+		page = compound_head(pfn_to_page(pfn));
+		if (page_to_nid(page) != args->node_id) {
+			remote++;
+			args->mm_stats[MM_LEAF_OTHER_NODE]++;
+			continue;
+		}
+
+		if (!ptep_test_and_clear_young(walk->vma, start, pte + i))
+			continue;
+
+		if (pte_dirty(pte[i]) && !PageDirty(page) &&
+		    !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page))) {
+			set_page_dirty(page);
+			args->mm_stats[MM_LEAF_DIRTY]++;
+		}
+
+		if (page_memcg_rcu(page) != args->memcg) {
+			args->mm_stats[MM_LEAF_OTHER_MEMCG]++;
+			continue;
+		}
+
+		old_gen = page_update_gen(page, new_gen);
+		if (old_gen >= 0 && old_gen != new_gen)
+			update_batch_size(page, old_gen, new_gen, args);
+		args->mm_stats[MM_LEAF_YOUNG]++;
+	}
+
+	if (i < PTRS_PER_PTE && get_next_interval(walk, PMD_MASK, PAGE_SIZE, &start, &end))
+		goto restart;
+
+	arch_leave_lazy_mmu_mode();
+	pte_unmap_unlock(pte, ptl);
+
+	return !remote;
+}
+
+static bool walk_pmd_range_unlocked(pud_t *pud, unsigned long start, unsigned long end,
+				    struct mm_walk *walk)
+{
+	int i;
+	pmd_t *pmd;
+	unsigned long next;
+	int young = 0;
+	struct mm_walk_args *args = walk->private;
+
+	VM_BUG_ON(pud_leaf(*pud));
+
+	pmd = pmd_offset(pud, start & PUD_MASK);
+restart:
+	for (i = pmd_index(start); start != end; i++, start = next) {
+		pmd_t val = pmd_read_atomic(pmd + i);
+
+		next = pmd_addr_end(start, end);
+
+		barrier();
+		if (!pmd_present(val) || is_huge_zero_pmd(val)) {
+			args->mm_stats[MM_LEAF_HOLE]++;
+			continue;
+		}
+
+		if (pmd_trans_huge(val)) {
+			unsigned long pfn = pmd_pfn(val);
+
+			if (!pmd_young(val)) {
+				args->mm_stats[MM_LEAF_OLD]++;
+				continue;
+			}
+
+			if (pfn < args->start_pfn || pfn >= args->end_pfn) {
+				args->mm_stats[MM_LEAF_OTHER_NODE]++;
+				continue;
+			}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+			young++;
+			__set_bit(i, args->bitmap);
+#endif
+			continue;
+		}
+
+#ifdef CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG
+		if (!pmd_young(val)) {
+			args->mm_stats[MM_NONLEAF_OLD]++;
+			continue;
+		}
+#endif
+
+		if (walk_pte_range(&val, start, next, walk)) {
+#ifdef CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG
+			young++;
+			__set_bit(i, args->bitmap);
+#endif
+		}
+	}
+
+	if (i < PTRS_PER_PMD && get_next_interval(walk, PUD_MASK, PMD_SIZE, &start, &end))
+		goto restart;
+
+	return young;
+}
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG)
+static void walk_pmd_range_locked(pud_t *pud, unsigned long start, unsigned long end,
+				  struct mm_walk *walk)
+{
+	int i;
+	pmd_t *pmd;
+	spinlock_t *ptl;
+	struct mm_walk_args *args = walk->private;
+	int old_gen, new_gen = lru_gen_from_seq(args->max_seq);
+
+	VM_BUG_ON(pud_leaf(*pud));
+
+	start &= PUD_MASK;
+	pmd = pmd_offset(pud, start);
+	ptl = pmd_lock(walk->mm, pmd);
+	arch_enter_lazy_mmu_mode();
+
+	for_each_set_bit(i, args->bitmap, PTRS_PER_PMD) {
+		struct page *page;
+		unsigned long pfn = pmd_pfn(pmd[i]);
+		unsigned long addr = start + PMD_SIZE * i;
+
+		if (!pmd_present(pmd[i]) || is_huge_zero_pmd(pmd[i])) {
+			args->mm_stats[MM_LEAF_HOLE]++;
+			continue;
+		}
+
+		if (!pmd_young(pmd[i])) {
+			args->mm_stats[MM_LEAF_OLD]++;
+			continue;
+		}
+
+		if (!pmd_trans_huge(pmd[i])) {
+#ifdef CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG
+			args->mm_stats[MM_NONLEAF_YOUNG]++;
+			pmdp_test_and_clear_young(walk->vma, addr, pmd + i);
+#endif
+			continue;
+		}
+
+		if (pfn < args->start_pfn || pfn >= args->end_pfn) {
+			args->mm_stats[MM_LEAF_OTHER_NODE]++;
+			continue;
+		}
+
+		page = pfn_to_page(pfn);
+		VM_BUG_ON_PAGE(PageTail(page), page);
+		if (page_to_nid(page) != args->node_id) {
+			args->mm_stats[MM_LEAF_OTHER_NODE]++;
+			continue;
+		}
+
+		if (!pmdp_test_and_clear_young(walk->vma, addr, pmd + i))
+			continue;
+
+		if (pmd_dirty(pmd[i]) && !PageDirty(page) &&
+		    !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page))) {
+			set_page_dirty(page);
+			args->mm_stats[MM_LEAF_DIRTY]++;
+		}
+
+		if (page_memcg_rcu(page) != args->memcg) {
+			args->mm_stats[MM_LEAF_OTHER_MEMCG]++;
+			continue;
+		}
+
+		old_gen = page_update_gen(page, new_gen);
+		if (old_gen >= 0 && old_gen != new_gen)
+			update_batch_size(page, old_gen, new_gen, args);
+		args->mm_stats[MM_LEAF_YOUNG]++;
+	}
+
+	arch_leave_lazy_mmu_mode();
+	spin_unlock(ptl);
+
+	memset(args->bitmap, 0, sizeof(args->bitmap));
+}
+#else
+static void walk_pmd_range_locked(pud_t *pud, unsigned long start, unsigned long end,
+				  struct mm_walk *walk)
+{
+}
+#endif
+
+static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
+			  struct mm_walk *walk)
+{
+	int i;
+	pud_t *pud;
+	unsigned long next;
+	struct mm_walk_args *args = walk->private;
+
+	VM_BUG_ON(p4d_leaf(*p4d));
+
+	pud = pud_offset(p4d, start & P4D_MASK);
+restart:
+	for (i = pud_index(start); start != end; i++, start = next) {
+		pud_t val = READ_ONCE(pud[i]);
+
+		next = pud_addr_end(start, end);
+
+		if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val)))
+			continue;
+
+		if (walk_pmd_range_unlocked(&val, start, next, walk))
+			walk_pmd_range_locked(&val, start, next, walk);
+
+		if (args->batch_size >= MAX_BATCH_SIZE) {
+			end = (start | ~PUD_MASK) + 1;
+			goto done;
+		}
+	}
+
+	if (i < PTRS_PER_PUD && get_next_interval(walk, P4D_MASK, PUD_SIZE, &start, &end))
+		goto restart;
+
+	end = round_up(end, P4D_SIZE);
+done:
+	/* rounded-up boundaries can wrap to 0 */
+	args->next_addr = end && walk->vma ? max(end, walk->vma->vm_start) : 0;
+
+	return -EAGAIN;
+}
+
+static void walk_mm(struct mm_walk_args *args, int swappiness, struct mm_struct *mm)
+{
+	static const struct mm_walk_ops mm_walk_ops = {
+		.test_walk = should_skip_vma,
+		.p4d_entry = walk_pud_range,
+	};
+
+	int err;
+	int file;
+	int nid = args->node_id;
+	struct mem_cgroup *memcg = args->memcg;
+	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+
+	args->next_addr = FIRST_USER_ADDRESS;
+	for (file = !swappiness; file < ANON_AND_FILE; file++)
+		args->should_walk[file] = lru_gen_mm_is_active(mm) ||
+					  node_isset(nid, mm->lrugen.nodes[file]);
+
+	do {
+		unsigned long start = args->next_addr;
+		unsigned long end = mm->highest_vm_end;
+
+		err = -EBUSY;
+
+		preempt_disable();
+		rcu_read_lock();
+
+#ifdef CONFIG_MEMCG
+		if (memcg && atomic_read(&memcg->moving_account)) {
+			args->mm_stats[MM_LOCK_CONTENTION]++;
+			goto contended;
+		}
+#endif
+		if (!mmap_read_trylock(mm)) {
+			args->mm_stats[MM_LOCK_CONTENTION]++;
+			goto contended;
+		}
+
+		err = walk_page_range(mm, start, end, &mm_walk_ops, args);
+
+		mmap_read_unlock(mm);
+
+		if (args->batch_size)
+			reset_batch_size(lruvec, args);
+contended:
+		rcu_read_unlock();
+		preempt_enable();
+
+		cond_resched();
+	} while (err == -EAGAIN && args->next_addr &&
+		 !mm_is_oom_victim(mm) && !mm_has_migrated(mm, memcg));
+
+	if (err == -EBUSY)
+		return;
+
+	for (file = !swappiness; file < ANON_AND_FILE; file++) {
+		if (args->should_walk[file])
+			node_clear(nid, mm->lrugen.nodes[file]);
+	}
+}
+
+static void page_inc_gen(struct page *page, struct lruvec *lruvec, bool front)
+{
+	int old_gen, new_gen;
+	unsigned long old_flags, new_flags;
+	int file = page_is_file_lru(page);
+	int zone = page_zonenum(page);
+	struct lrugen *lrugen = &lruvec->evictable;
+
+	old_gen = lru_gen_from_seq(lrugen->min_seq[file]);
+
+	do {
+		old_flags = READ_ONCE(page->flags);
+		new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+		VM_BUG_ON_PAGE(new_gen < 0, page);
+		if (new_gen >= 0 && new_gen != old_gen)
+			goto sort;
+
+		new_gen = (old_gen + 1) % MAX_NR_GENS;
+		new_flags = (old_flags & ~(LRU_GEN_MASK | LRU_USAGE_MASK | LRU_TIER_FLAGS)) |
+			    ((new_gen + 1UL) << LRU_GEN_PGOFF);
+		/* mark the page for reclaim if it's pending writeback */
+		if (front)
+			new_flags |= BIT(PG_reclaim);
+	} while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+
+	lru_gen_update_size(page, lruvec, old_gen, new_gen);
+sort:
+	if (front)
+		list_move(&page->lru, &lrugen->lists[new_gen][file][zone]);
+	else
+		list_move_tail(&page->lru, &lrugen->lists[new_gen][file][zone]);
+}
+
+static bool try_inc_min_seq(struct lruvec *lruvec, int file)
+{
+	int gen, zone;
+	bool success = false;
+	struct lrugen *lrugen = &lruvec->evictable;
+
+	VM_BUG_ON(!seq_is_valid(lruvec));
+
+	while (get_nr_gens(lruvec, file) > MIN_NR_GENS) {
+		gen = lru_gen_from_seq(lrugen->min_seq[file]);
+
+		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+			if (!list_empty(&lrugen->lists[gen][file][zone]))
+				return success;
+		}
+
+		reset_controller_pos(lruvec, gen, file);
+		WRITE_ONCE(lrugen->min_seq[file], lrugen->min_seq[file] + 1);
+
+		success = true;
+	}
+
+	return success;
+}
+
+static bool inc_min_seq(struct lruvec *lruvec, int file)
+{
+	int gen, zone;
+	int batch_size = 0;
+	struct lrugen *lrugen = &lruvec->evictable;
+
+	VM_BUG_ON(!seq_is_valid(lruvec));
+
+	if (get_nr_gens(lruvec, file) != MAX_NR_GENS)
+		return true;
+
+	gen = lru_gen_from_seq(lrugen->min_seq[file]);
+
+	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+		struct list_head *head = &lrugen->lists[gen][file][zone];
+
+		while (!list_empty(head)) {
+			struct page *page = lru_to_page(head);
+
+			VM_BUG_ON_PAGE(PageTail(page), page);
+			VM_BUG_ON_PAGE(PageUnevictable(page), page);
+			VM_BUG_ON_PAGE(PageActive(page), page);
+			VM_BUG_ON_PAGE(page_is_file_lru(page) != file, page);
+			VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
+
+			prefetchw_prev_lru_page(page, head, flags);
+
+			page_inc_gen(page, lruvec, false);
+
+			if (++batch_size == MAX_BATCH_SIZE)
+				return false;
+		}
+
+		VM_BUG_ON(lrugen->sizes[gen][file][zone]);
+	}
+
+	reset_controller_pos(lruvec, gen, file);
+	WRITE_ONCE(lrugen->min_seq[file], lrugen->min_seq[file] + 1);
+
+	return true;
+}
+
+static void inc_max_seq(struct lruvec *lruvec)
+{
+	int gen, file, zone;
+	struct lrugen *lrugen = &lruvec->evictable;
+
+	spin_lock_irq(&lruvec->lru_lock);
+
+	VM_BUG_ON(!seq_is_valid(lruvec));
+
+	for (file = 0; file < ANON_AND_FILE; file++) {
+		if (try_inc_min_seq(lruvec, file))
+			continue;
+
+		while (!inc_min_seq(lruvec, file)) {
+			spin_unlock_irq(&lruvec->lru_lock);
+			cond_resched();
+			spin_lock_irq(&lruvec->lru_lock);
+		}
+	}
+
+	gen = lru_gen_from_seq(lrugen->max_seq - 1);
+	for_each_type_zone(file, zone) {
+		enum lru_list lru = LRU_FILE * file;
+		long total = lrugen->sizes[gen][file][zone];
+
+		if (!total)
+			continue;
+
+		WARN_ON_ONCE(total != (int)total);
+
+		update_lru_size(lruvec, lru, zone, total);
+		update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -total);
+	}
+
+	gen = lru_gen_from_seq(lrugen->max_seq + 1);
+	for_each_type_zone(file, zone) {
+		VM_BUG_ON(lrugen->sizes[gen][file][zone]);
+		VM_BUG_ON(!list_empty(&lrugen->lists[gen][file][zone]));
+	}
+
+	for (file = 0; file < ANON_AND_FILE; file++)
+		reset_controller_pos(lruvec, gen, file);
+
+	WRITE_ONCE(lrugen->timestamps[gen], jiffies);
+	/* make sure all preceding modifications appear first */
+	smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
+
+	spin_unlock_irq(&lruvec->lru_lock);
+}
+
+/* Main function used by foreground, background and user-triggered aging. */
+static bool walk_mm_list(struct lruvec *lruvec, unsigned long max_seq,
+			 struct scan_control *sc, int swappiness, struct mm_walk_args *args)
+{
+	bool last;
+	bool alloc = !args;
+	struct mm_struct *mm = NULL;
+	struct lrugen *lrugen = &lruvec->evictable;
+	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+	int nid = pgdat->node_id;
+	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+
+	VM_BUG_ON(max_seq > READ_ONCE(lrugen->max_seq));
+
+	/*
+	 * For each walk of the mm_struct list of a memcg, we decrement the
+	 * priority of its lrugen. For each walk of all memcgs in kswapd, we
+	 * increment the priority of every lrugen.
+	 *
+	 * So if this lrugen has a higher priority (smaller value), it means
+	 * other concurrent reclaimers have walked its mm list, and we skip it
+	 * for this priority in order to balance the pressure on all memcgs.
+	 */
+	if (!mem_cgroup_disabled() && !cgroup_reclaim(sc) &&
+	    sc->priority > atomic_read(&lrugen->priority))
+		return false;
+
+	if (alloc) {
+		args = kvzalloc_node(sizeof(*args), GFP_KERNEL, nid);
+		if (!args)
+			return false;
+	}
+
+	args->memcg = memcg;
+	args->max_seq = max_seq;
+	args->start_pfn = pgdat->node_start_pfn;
+	args->end_pfn = pgdat_end_pfn(pgdat);
+	args->node_id = nid;
+
+	do {
+		last = get_next_mm(args, swappiness, &mm);
+		if (mm)
+			walk_mm(args, swappiness, mm);
+
+		cond_resched();
+	} while (mm);
+
+	if (alloc)
+		kvfree(args);
+
+	if (!last) {
+		/* foreground aging prefers not to wait unless "necessary" */
+		if (!current_is_kswapd() && sc->priority < DEF_PRIORITY - 2)
+			wait_event_killable(mm_list->nodes[nid].wait,
+					    max_seq < READ_ONCE(lrugen->max_seq));
+
+		return max_seq < READ_ONCE(lrugen->max_seq);
+	}
+
+	VM_BUG_ON(max_seq != READ_ONCE(lrugen->max_seq));
+
+	inc_max_seq(lruvec);
+
+	if (!mem_cgroup_disabled())
+		atomic_add_unless(&lrugen->priority, -1, 0);
+
+	/* order against inc_max_seq() */
+	smp_mb();
+	/* either we see any waiters or they will see the updated max_seq */
+	if (waitqueue_active(&mm_list->nodes[nid].wait))
+		wake_up_all(&mm_list->nodes[nid].wait);
+
+	wakeup_flusher_threads(WB_REASON_VMSCAN);
+
+	return true;
+}
+
 /******************************************************************************
  *                          state change
  ******************************************************************************/
@@ -5002,6 +5699,9 @@  static int __init init_lru_gen(void)
 	BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
 	BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
 	BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);
+	BUILD_BUG_ON(PMD_SIZE / PAGE_SIZE != PTRS_PER_PTE);
+	BUILD_BUG_ON(PUD_SIZE / PMD_SIZE != PTRS_PER_PMD);
+	BUILD_BUG_ON(P4D_SIZE / PUD_SIZE != PTRS_PER_PUD);
 
 	if (mem_cgroup_disabled()) {
 		global_mm_list = alloc_mm_list();