@@ -785,7 +785,8 @@ static int fuse_check_page(struct page *page)
1 << PG_active |
1 << PG_workingset |
1 << PG_reclaim |
- 1 << PG_waiters))) {
+ 1 << PG_waiters |
+ LRU_GEN_MASK | LRU_USAGE_MASK))) {
dump_page(page, "fuse: trying to steal weird page");
return 1;
}
@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgroup *cgrp)
css_put(&cgrp->self);
}
+extern struct mutex cgroup_mutex;
+
+static inline void cgroup_lock(void)
+{
+ mutex_lock(&cgroup_mutex);
+}
+
+static inline void cgroup_unlock(void)
+{
+ mutex_unlock(&cgroup_mutex);
+}
+
/**
* task_css_set_check - obtain a task's css_set with extra access conditions
* @task: the task to obtain css_set for
@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgroup *cgrp)
* as locks used during the cgroup_subsys::attach() methods.
*/
#ifdef CONFIG_PROVE_RCU
-extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
#define task_css_set_check(task, __c) \
rcu_dereference_check((task)->cgroups, \
@@ -707,6 +718,8 @@ struct cgroup;
static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
static inline void css_get(struct cgroup_subsys_state *css) {}
static inline void css_put(struct cgroup_subsys_state *css) {}
+static inline void cgroup_lock(void) {}
+static inline void cgroup_unlock(void) {}
static inline int cgroup_attach_task_all(struct task_struct *from,
struct task_struct *t) { return 0; }
static inline int cgroupstats_build(struct cgroupstats *stats,
@@ -1092,6 +1092,8 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH)
#define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
+#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
+#define LRU_USAGE_PGOFF (LRU_GEN_PGOFF - LRU_USAGE_WIDTH)
/*
* Define the bit shifts to access each section. For non-existent
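For orientation (an editor's sketch, not part of the patch): the two new fields are carved out of page->flags directly below the existing ones, following the *_PGOFF chain above. Bracketed fields can be zero-width depending on the config.

/*
 * high bits                                                                     low bits
 * | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | [KASAN_TAG] | LRU_GEN | LRU_USAGE | ... | FLAGS |
 */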
@@ -79,11 +79,206 @@ static __always_inline enum lru_list page_lru(struct page *page)
return lru;
}
+#ifdef CONFIG_LRU_GEN
+
+#ifdef CONFIG_LRU_GEN_ENABLED
+DECLARE_STATIC_KEY_TRUE(lru_gen_static_key);
+
+static inline bool lru_gen_enabled(void)
+{
+ return static_branch_likely(&lru_gen_static_key);
+}
+#else
+DECLARE_STATIC_KEY_FALSE(lru_gen_static_key);
+
+static inline bool lru_gen_enabled(void)
+{
+ return static_branch_unlikely(&lru_gen_static_key);
+}
+#endif
+
+/* Return an index within the sliding window that tracks MAX_NR_GENS generations. */
+static inline int lru_gen_from_seq(unsigned long seq)
+{
+ return seq % MAX_NR_GENS;
+}
+
+/* Return a proper index regardless of whether we keep a full history of stats. */
+static inline int lru_hist_from_seq(int seq)
+{
+ return seq % NR_STAT_GENS;
+}
+
+/* Convert the level of usage to a tier. See the comment on MAX_NR_TIERS. */
+static inline int lru_tier_from_usage(int usage)
+{
+ VM_BUG_ON(usage > BIT(LRU_USAGE_WIDTH));
+
+ return order_base_2(usage + 1);
+}
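To make the index mappings concrete, a small worked illustration (an editor's sketch assuming MAX_NR_GENS == 4; not part of the patch):

/*
 * lru_gen_from_seq(): seqs 7, 8, 9, 10 map to gens 3, 0, 1, 2; the value
 * stored in page->flags is gen + 1, so 0 still means "not on a
 * multigenerational lru list".
 *
 * lru_tier_from_usage(): usage 0 -> tier 0, usage 1 -> tier 1,
 * usage 2..3 -> tier 2, usage 4..7 -> tier 3, i.e. order_base_2(usage + 1).
 */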
+
+/* The youngest and the second youngest generations are counted as active. */
+static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
+{
+ unsigned long max_seq = READ_ONCE(lruvec->evictable.max_seq);
+
+ VM_BUG_ON(!max_seq);
+ VM_BUG_ON(gen >= MAX_NR_GENS);
+
+ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
+}
+
+/* Update the sizes of the multigenerational lru lists. */
+static inline void lru_gen_update_size(struct page *page, struct lruvec *lruvec,
+ int old_gen, int new_gen)
+{
+ int type = page_is_file_lru(page);
+ int zone = page_zonenum(page);
+ int delta = thp_nr_pages(page);
+ enum lru_list lru = type * LRU_FILE;
+ struct lrugen *lrugen = &lruvec->evictable;
+
+ lockdep_assert_held(&lruvec->lru_lock);
+ VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS);
+ VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS);
+ VM_BUG_ON(old_gen == -1 && new_gen == -1);
+
+ if (old_gen >= 0)
+ WRITE_ONCE(lrugen->sizes[old_gen][type][zone],
+ lrugen->sizes[old_gen][type][zone] - delta);
+ if (new_gen >= 0)
+ WRITE_ONCE(lrugen->sizes[new_gen][type][zone],
+ lrugen->sizes[new_gen][type][zone] + delta);
+
+ if (old_gen < 0) {
+ if (lru_gen_is_active(lruvec, new_gen))
+ lru += LRU_ACTIVE;
+ update_lru_size(lruvec, lru, zone, delta);
+ return;
+ }
+
+ if (new_gen < 0) {
+ if (lru_gen_is_active(lruvec, old_gen))
+ lru += LRU_ACTIVE;
+ update_lru_size(lruvec, lru, zone, -delta);
+ return;
+ }
+
+ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
+ update_lru_size(lruvec, lru, zone, -delta);
+ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
+ }
+
+ VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
+}
+
+/* Add a page to one of the multigenerational lru lists. Return true on success. */
+static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
+{
+ int gen;
+ unsigned long old_flags, new_flags;
+ int type = page_is_file_lru(page);
+ int zone = page_zonenum(page);
+ struct lrugen *lrugen = &lruvec->evictable;
+
+ if (PageUnevictable(page) || !lrugen->enabled[type])
+ return false;
+ /*
+ * If a page shouldn't be considered for eviction, i.e., a page mapped
+ * with the accessed bit set at fault time, add it to the youngest
+ * generation.
+ *
+ * If a page can't be evicted immediately, i.e., an anon page not in
+ * swap cache or a dirty page pending writeback, add it to the second
+ * oldest generation.
+ *
+ * If a page could be evicted immediately, e.g., a clean page, add it to
+ * the oldest generation.
+ */
+ if (PageActive(page))
+ gen = lru_gen_from_seq(lrugen->max_seq);
+ else if ((!type && !PageSwapCache(page)) ||
+ (PageReclaim(page) && (PageDirty(page) || PageWriteback(page))))
+ gen = lru_gen_from_seq(lrugen->min_seq[type] + 1);
+ else
+ gen = lru_gen_from_seq(lrugen->min_seq[type]);
+
+ do {
+ new_flags = old_flags = READ_ONCE(page->flags);
+ VM_BUG_ON_PAGE(new_flags & LRU_GEN_MASK, page);
+
+ new_flags &= ~(LRU_GEN_MASK | BIT(PG_active));
+ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
+ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+
+ lru_gen_update_size(page, lruvec, -1, gen);
+ if (reclaiming)
+ list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]);
+ else
+ list_add(&page->lru, &lrugen->lists[gen][type][zone]);
+
+ return true;
+}
+
+/* Delete a page from one of the multigenerational lru lists. Return true on success. */
+static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
+{
+ int gen;
+ unsigned long old_flags, new_flags;
+
+ do {
+ new_flags = old_flags = READ_ONCE(page->flags);
+ if (!(new_flags & LRU_GEN_MASK))
+ return false;
+
+ VM_BUG_ON_PAGE(PageActive(page), page);
+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
+
+ gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+
+ new_flags &= ~LRU_GEN_MASK;
+ if ((new_flags & LRU_TIER_FLAGS) != LRU_TIER_FLAGS)
+ new_flags &= ~(LRU_USAGE_MASK | LRU_TIER_FLAGS);
+ /* see the comment on PageReferenced()/PageReclaim() in shrink_page_list() */
+ if (reclaiming)
+ new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim));
+ else if (lru_gen_is_active(lruvec, gen))
+ new_flags |= BIT(PG_active);
+ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+
+ lru_gen_update_size(page, lruvec, gen, -1);
+ list_del(&page->lru);
+
+ return true;
+}
+
+#else /* CONFIG_LRU_GEN */
+
+static inline bool lru_gen_enabled(void)
+{
+ return false;
+}
+
+static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
+{
+ return false;
+}
+
+static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
+{
+ return false;
+}
+
+#endif /* CONFIG_LRU_GEN */
+
static __always_inline void add_page_to_lru_list(struct page *page,
struct lruvec *lruvec)
{
enum lru_list lru = page_lru(page);
+ if (lru_gen_add_page(page, lruvec, false))
+ return;
+
update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
list_add(&page->lru, &lruvec->lists[lru]);
}
@@ -93,6 +288,9 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page,
{
enum lru_list lru = page_lru(page);
+ if (lru_gen_add_page(page, lruvec, true))
+ return;
+
update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
list_add_tail(&page->lru, &lruvec->lists[lru]);
}
@@ -100,6 +298,9 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page,
static __always_inline void del_page_from_lru_list(struct page *page,
struct lruvec *lruvec)
{
+ if (lru_gen_del_page(page, lruvec, false))
+ return;
+
list_del(&page->lru);
update_lru_size(lruvec, page_lru(page), page_zonenum(page),
-thp_nr_pages(page));
@@ -294,6 +294,94 @@ enum lruvec_flags {
*/
};
+struct lruvec;
+
+#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
+#define LRU_USAGE_MASK ((BIT(LRU_USAGE_WIDTH) - 1) << LRU_USAGE_PGOFF)
+
+#ifdef CONFIG_LRU_GEN
+
+/*
+ * For each lruvec, evictable pages are divided into multiple generations. The
+ * youngest and the oldest generation numbers, AKA max_seq and min_seq, are
+ * monotonically increasing. The sliding window technique is used to track at
+ * most MAX_NR_GENS and at least MIN_NR_GENS generations. An offset within the
+ * window, AKA gen, indexes an array of per-type and per-zone lists for the
+ * corresponding generation. The counter in page->flags stores gen+1 while a
+ * page is on one of the multigenerational lru lists. Otherwise, it stores 0.
+ */
+#define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
+
+/*
+ * Each generation is then divided into multiple tiers. Tiers represent levels
+ * of usage from file descriptors, i.e., mark_page_accessed(). In contrast to
+ * moving across generations which requires the lru lock, moving across tiers
+ * only involves an atomic operation on page->flags and therefore has a
+ * negligible cost.
+ *
+ * The purposes of tiers are to:
+ * 1) estimate whether pages accessed multiple times via file descriptors are
+ * more active than pages accessed only via page tables by separating the two
+ * access types into upper tiers and the base tier, and comparing refault rates
+ * across tiers.
+ * 2) improve buffered I/O performance by deferring the protection of pages
+ * accessed multiple times until the eviction. That is, the protection happens
+ * in the reclaim path, not the access path.
+ *
+ * Pages accessed N times via file descriptors belong to tier order_base_2(N).
+ * The base tier may be marked by PageReferenced(). All upper tiers are marked
+ * by PageReferenced() && PageWorkingset(). Additional bits from page->flags are
+ * used to support more than one upper tier.
+ */
+#define MAX_NR_TIERS ((unsigned int)CONFIG_TIERS_PER_GEN)
+#define LRU_TIER_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
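As a sketch of how these flags encode a tier, derived from page_lru_tier() added to mm/vmscan.c later in this patch (illustrative only):

/*
 * PG_referenced only                                -> usage 0    -> tier 0
 * PG_referenced + PG_workingset, usage counter 0    -> usage 1    -> tier 1
 * PG_referenced + PG_workingset, usage counter 1..2 -> usage 2..3 -> tier 2
 * where the "usage counter" is the LRU_USAGE_MASK field in page->flags.
 */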
+
+/* Whether to keep historical stats for each generation. */
+#ifdef CONFIG_LRU_GEN_STATS
+#define NR_STAT_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
+#else
+#define NR_STAT_GENS 1U
+#endif
+
+struct lrugen {
+ /* the aging increments the max generation number */
+ unsigned long max_seq;
+ /* the eviction increments the min generation numbers */
+ unsigned long min_seq[ANON_AND_FILE];
+ /* the birth time of each generation in jiffies */
+ unsigned long timestamps[MAX_NR_GENS];
+ /* the multigenerational lru lists */
+ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+ /* the sizes of the multigenerational lru lists in pages */
+ unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+ /* to determine which type and its tiers to evict */
+ atomic_long_t refaulted[NR_STAT_GENS][ANON_AND_FILE][MAX_NR_TIERS];
+ atomic_long_t evicted[NR_STAT_GENS][ANON_AND_FILE][MAX_NR_TIERS];
+ /* the base tier isn't protected, hence the minus one */
+ unsigned long protected[NR_STAT_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1];
+ /* the exponential moving average of refaulted */
+ unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
+ /* the exponential moving average of evicted+protected */
+ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS];
+ /* whether the multigenerational lru is enabled */
+ bool enabled[ANON_AND_FILE];
+};
+
+void lru_gen_init_lrugen(struct lruvec *lruvec);
+void lru_gen_set_state(bool enable, bool main, bool swap);
+
+#else /* CONFIG_LRU_GEN */
+
+static inline void lru_gen_init_lrugen(struct lruvec *lruvec)
+{
+}
+
+static inline void lru_gen_set_state(bool enable, bool main, bool swap)
+{
+}
+
+#endif /* CONFIG_LRU_GEN */
+
struct lruvec {
struct list_head lists[NR_LRU_LISTS];
/* per lruvec lru_lock for memcg */
@@ -311,6 +399,10 @@ struct lruvec {
unsigned long refaults[ANON_AND_FILE];
/* Various lruvec state flags (enum lruvec_flags) */
unsigned long flags;
+#ifdef CONFIG_LRU_GEN
+ /* unevictable pages are on LRU_UNEVICTABLE */
+ struct lrugen evictable;
+#endif
#ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
#endif
@@ -26,6 +26,14 @@
#define ZONES_WIDTH ZONES_SHIFT
+#ifdef CONFIG_LRU_GEN
+/* LRU_GEN_WIDTH is generated from order_base_2(CONFIG_NR_LRU_GENS + 1). */
+#define LRU_USAGE_WIDTH (CONFIG_TIERS_PER_GEN - 2)
+#else
+#define LRU_GEN_WIDTH 0
+#define LRU_USAGE_WIDTH 0
+#endif
+
#ifdef CONFIG_SPARSEMEM
#include <asm/sparsemem.h>
#define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
@@ -55,7 +63,8 @@
#define SECTIONS_WIDTH 0
#endif
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_USAGE_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \
+ <= BITS_PER_LONG - NR_PAGEFLAGS
#define NODES_WIDTH NODES_SHIFT
#elif defined(CONFIG_SPARSEMEM_VMEMMAP)
#error "Vmemmap: No space for nodes field in page flags"
@@ -89,8 +98,8 @@
#define LAST_CPUPID_SHIFT 0
#endif
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \
- <= BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_USAGE_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
+ KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
#define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
#else
#define LAST_CPUPID_WIDTH 0
@@ -100,8 +109,8 @@
#define LAST_CPUPID_NOT_IN_PAGE_FLAGS
#endif
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \
- > BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_USAGE_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
+ KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
#error "Not enough bits in page flags"
#endif
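For a rough bit budget (an illustrative calculation assuming the patchset defaults CONFIG_NR_LRU_GENS=4 and CONFIG_TIERS_PER_GEN=4, which are defined elsewhere in the series): LRU_GEN_WIDTH = order_base_2(4 + 1) = 3 and LRU_USAGE_WIDTH = 4 - 2 = 2, so the new fields take 5 bits, and the checks above require them to fit alongside the zone, node, section, last-cpupid and KASAN-tag fields plus NR_PAGEFLAGS within BITS_PER_LONG.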
@@ -848,7 +848,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
1UL << PG_private | 1UL << PG_private_2 | \
1UL << PG_writeback | 1UL << PG_reserved | \
1UL << PG_slab | 1UL << PG_active | \
- 1UL << PG_unevictable | __PG_MLOCKED)
+ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK)
/*
* Flags checked when a page is prepped for return by the page allocator.
@@ -859,7 +859,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
* alloc-free cycle to prevent from reusing the page.
*/
#define PAGE_FLAGS_CHECK_AT_PREP \
- (((1UL << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON)
+ ((((1UL << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_USAGE_MASK)
#define PAGE_FLAGS_PRIVATE \
(1UL << PG_private | 1UL << PG_private_2)
@@ -22,6 +22,9 @@ int main(void)
DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
#endif
DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
+#ifdef CONFIG_LRU_GEN
+ DEFINE(LRU_GEN_WIDTH, order_base_2(CONFIG_NR_LRU_GENS + 1));
+#endif
/* End of constants */
return 0;
@@ -146,7 +146,6 @@ struct cgroup_mgctx {
#define DEFINE_CGROUP_MGCTX(name) \
struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
-extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
extern struct cgroup_subsys *cgroup_subsys[];
extern struct list_head cgroup_roots;
@@ -2390,7 +2390,8 @@ static void __split_huge_page_tail(struct page *head, int tail,
#ifdef CONFIG_64BIT
(1L << PG_arch_2) |
#endif
- (1L << PG_dirty)));
+ (1L << PG_dirty) |
+ LRU_GEN_MASK | LRU_USAGE_MASK));
/* ->mapping in first tail page is compound_mapcount */
VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layout(void)
shift = 8 * sizeof(unsigned long);
width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
+ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_USAGE_WIDTH;
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
SECTIONS_WIDTH,
NODES_WIDTH,
ZONES_WIDTH,
LAST_CPUPID_WIDTH,
KASAN_TAG_WIDTH,
+ LRU_GEN_WIDTH,
+ LRU_USAGE_WIDTH,
NR_PAGEFLAGS);
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
@@ -81,6 +81,8 @@ void lruvec_init(struct lruvec *lruvec)
for_each_lru(lru)
INIT_LIST_HEAD(&lruvec->lists[lru]);
+
+ lru_gen_init_lrugen(lruvec);
}
#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
@@ -2688,6 +2688,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
err = 0;
atomic_inc(&proc_poll_event);
wake_up_interruptible(&proc_poll_wait);
+ lru_gen_set_state(false, false, true);
out_dput:
filp_close(victim, NULL);
@@ -3343,6 +3344,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
mutex_unlock(&swapon_mutex);
atomic_inc(&proc_poll_event);
wake_up_interruptible(&proc_poll_wait);
+ lru_gen_set_state(true, false, true);
error = 0;
goto out;
@@ -49,6 +49,7 @@
#include <linux/printk.h>
#include <linux/dax.h>
#include <linux/psi.h>
+#include <linux/memory.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -2731,6 +2732,334 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
}
}
+#ifdef CONFIG_LRU_GEN
+
+/*
+ * After a page is faulted in, the aging must scan it twice before the eviction
+ * can consider it. The first scan clears the accessed bit set during the
+ * initial fault, and the second scan makes sure the page hasn't been used
+ * since the first scan.
+ */
+#define MIN_NR_GENS 2
+
+#define MAX_BATCH_SIZE 8192
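To put numbers on the two constants above (an editor's illustration, not patch text):

/*
 * get_nr_gens() == max_seq - min_seq + 1 must stay within
 * [MIN_NR_GENS, MAX_NR_GENS] (see seq_is_valid() below), so with
 * max_seq == 10, min_seq is at most 9 for either type and the generation
 * pages are faulted into is never the one the eviction starts scanning from.
 * MAX_BATCH_SIZE bounds how many pages fill_lists()/drain_lists() move during
 * one lru_lock hold before the lock is dropped and cond_resched() is called.
 */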
+
+/******************************************************************************
+ * shorthand helpers
+ ******************************************************************************/
+
+#define DEFINE_MAX_SEQ(lruvec) \
+ unsigned long max_seq = READ_ONCE((lruvec)->evictable.max_seq)
+
+#define DEFINE_MIN_SEQ(lruvec) \
+ unsigned long min_seq[ANON_AND_FILE] = { \
+ READ_ONCE((lruvec)->evictable.min_seq[0]), \
+ READ_ONCE((lruvec)->evictable.min_seq[1]), \
+ }
+
+#define for_each_type_zone(type, zone) \
+ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
+ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
+
+#define for_each_gen_type_zone(gen, type, zone) \
+ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
+ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
+ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
+
+static int page_lru_gen(struct page *page)
+{
+ unsigned long flags = READ_ONCE(page->flags);
+
+ return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+}
+
+static int page_lru_tier(struct page *page)
+{
+ int usage;
+ unsigned long flags = READ_ONCE(page->flags);
+
+ usage = (flags & LRU_TIER_FLAGS) == LRU_TIER_FLAGS ?
+ ((flags & LRU_USAGE_MASK) >> LRU_USAGE_PGOFF) + 1 : 0;
+
+ return lru_tier_from_usage(usage);
+}
+
+static int get_lo_wmark(unsigned long max_seq, unsigned long *min_seq, int swappiness)
+{
+ return max_seq - max(min_seq[!swappiness], min_seq[1]) + 1;
+}
+
+static int get_hi_wmark(unsigned long max_seq, unsigned long *min_seq, int swappiness)
+{
+ return max_seq - min(min_seq[!swappiness], min_seq[1]) + 1;
+}
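A quick worked example of the two watermarks (illustrative numbers only):

/*
 * With max_seq == 10, min_seq == { 7, 8 } and swappiness > 0:
 *   lo = 10 - max(7, 8) + 1 = 3,  hi = 10 - min(7, 8) + 1 = 4.
 * With swappiness == 0, min_seq[!swappiness] is min_seq[1], so both
 * watermarks ignore the anon min_seq: lo = hi = 10 - 8 + 1 = 3.
 */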
+
+static int get_nr_gens(struct lruvec *lruvec, int type)
+{
+ return lruvec->evictable.max_seq - lruvec->evictable.min_seq[type] + 1;
+}
+
+static int get_swappiness(struct mem_cgroup *memcg)
+{
+ return mem_cgroup_get_nr_swap_pages(memcg) >= (long)SWAP_CLUSTER_MAX ?
+ mem_cgroup_swappiness(memcg) : 0;
+}
+
+static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
+{
+ return get_nr_gens(lruvec, 0) >= MIN_NR_GENS &&
+ get_nr_gens(lruvec, 0) <= MAX_NR_GENS &&
+ get_nr_gens(lruvec, 1) >= MIN_NR_GENS &&
+ get_nr_gens(lruvec, 1) <= MAX_NR_GENS;
+}
+
+/******************************************************************************
+ * state change
+ ******************************************************************************/
+
+#ifdef CONFIG_LRU_GEN_ENABLED
+DEFINE_STATIC_KEY_TRUE(lru_gen_static_key);
+#else
+DEFINE_STATIC_KEY_FALSE(lru_gen_static_key);
+#endif
+
+static DEFINE_MUTEX(lru_gen_state_mutex);
+static int lru_gen_nr_swapfiles;
+
+static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
+{
+ int gen, type, zone;
+ enum lru_list lru;
+ struct lrugen *lrugen = &lruvec->evictable;
+
+ for_each_evictable_lru(lru) {
+ type = is_file_lru(lru);
+
+ if (lrugen->enabled[type] && !list_empty(&lruvec->lists[lru]))
+ return false;
+ }
+
+ for_each_gen_type_zone(gen, type, zone) {
+ if (!lrugen->enabled[type] && !list_empty(&lrugen->lists[gen][type][zone]))
+ return false;
+
+ VM_WARN_ON_ONCE(!lrugen->enabled[type] && lrugen->sizes[gen][type][zone]);
+ }
+
+ return true;
+}
+
+static bool fill_lists(struct lruvec *lruvec)
+{
+ enum lru_list lru;
+ int remaining = MAX_BATCH_SIZE;
+
+ for_each_evictable_lru(lru) {
+ int type = is_file_lru(lru);
+ bool active = is_active_lru(lru);
+ struct list_head *head = &lruvec->lists[lru];
+
+ if (!lruvec->evictable.enabled[type])
+ continue;
+
+ while (!list_empty(head)) {
+ bool success;
+ struct page *page = lru_to_page(head);
+
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
+ VM_BUG_ON_PAGE(PageActive(page) != active, page);
+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
+ VM_BUG_ON_PAGE(page_lru_gen(page) >= 0, page);
+
+ prefetchw_prev_lru_page(page, head, flags);
+
+ del_page_from_lru_list(page, lruvec);
+ success = lru_gen_add_page(page, lruvec, false);
+ VM_BUG_ON(!success);
+
+ if (!--remaining)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool drain_lists(struct lruvec *lruvec)
+{
+ int gen, type, zone;
+ int remaining = MAX_BATCH_SIZE;
+
+ for_each_gen_type_zone(gen, type, zone) {
+ struct list_head *head = &lruvec->evictable.lists[gen][type][zone];
+
+ if (lruvec->evictable.enabled[type])
+ continue;
+
+ while (!list_empty(head)) {
+ bool success;
+ struct page *page = lru_to_page(head);
+
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
+ VM_BUG_ON_PAGE(PageActive(page), page);
+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
+ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
+
+ prefetchw_prev_lru_page(page, head, flags);
+
+ success = lru_gen_del_page(page, lruvec, false);
+ VM_BUG_ON(!success);
+ add_page_to_lru_list(page, lruvec);
+
+ if (!--remaining)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/*
+ * For file page tracking, we enable/disable it according to the main switch.
+ * For anon page tracking, we only enable it when the main switch is on and
+ * there is at least one swapfile; we disable it when there are no swapfiles
+ * regardless of the value of the main switch. Otherwise, we will eventually
+ * reach the max size of the sliding window and have to call inc_min_seq().
+ */
+void lru_gen_set_state(bool enable, bool main, bool swap)
+{
+ struct mem_cgroup *memcg;
+
+ mem_hotplug_begin();
+ mutex_lock(&lru_gen_state_mutex);
+ cgroup_lock();
+
+ if (swap) {
+ if (enable)
+ swap = !lru_gen_nr_swapfiles++;
+ else
+ swap = !--lru_gen_nr_swapfiles;
+ }
+
+ if (main && enable != lru_gen_enabled()) {
+ if (enable)
+ static_branch_enable(&lru_gen_static_key);
+ else
+ static_branch_disable(&lru_gen_static_key);
+ } else if (!swap || !lru_gen_enabled())
+ goto unlock;
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+ struct lrugen *lrugen = &lruvec->evictable;
+
+ spin_lock_irq(&lruvec->lru_lock);
+
+ VM_BUG_ON(!seq_is_valid(lruvec));
+ VM_BUG_ON(!state_is_valid(lruvec));
+
+ lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
+ lrugen->enabled[1] = lru_gen_enabled();
+
+ while (!(enable ? fill_lists(lruvec) : drain_lists(lruvec))) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ cond_resched();
+ spin_lock_irq(&lruvec->lru_lock);
+ }
+
+ spin_unlock_irq(&lruvec->lru_lock);
+ }
+
+ cond_resched();
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+unlock:
+ cgroup_unlock();
+ mutex_unlock(&lru_gen_state_mutex);
+ mem_hotplug_done();
+}
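An editor's note on the swapfile counting above: the post-increment and pre-decrement keep swap true only on the edge transitions, so the per-memcg walk (which also requires lru_gen_enabled()) runs only for the first swapon and the last swapoff:

/*
 * first swapon:     swap = !lru_gen_nr_swapfiles++  -> !0 -> true  (fill anon lists)
 * later swapons:    swap = !lru_gen_nr_swapfiles++  -> !n -> false (skip the walk)
 * non-last swapoff: swap = !--lru_gen_nr_swapfiles  -> !n -> false (skip the walk)
 * last swapoff:     swap = !--lru_gen_nr_swapfiles  -> !0 -> true  (drain anon lists)
 */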
+
+static int __meminit __maybe_unused mem_notifier(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ struct mem_cgroup *memcg;
+ struct pglist_data *pgdat;
+ struct memory_notify *mn = arg;
+ int nid = mn->status_change_nid;
+
+ if (nid == NUMA_NO_NODE)
+ return NOTIFY_DONE;
+
+ pgdat = NODE_DATA(nid);
+
+ if (action != MEM_GOING_ONLINE)
+ return NOTIFY_DONE;
+
+ mutex_lock(&lru_gen_state_mutex);
+ cgroup_lock();
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ struct lrugen *lrugen = &lruvec->evictable;
+
+ VM_BUG_ON(!seq_is_valid(lruvec));
+ VM_BUG_ON(!state_is_valid(lruvec));
+
+ lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
+ lrugen->enabled[1] = lru_gen_enabled();
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+
+ cgroup_unlock();
+ mutex_unlock(&lru_gen_state_mutex);
+
+ return NOTIFY_DONE;
+}
+
+/******************************************************************************
+ * initialization
+ ******************************************************************************/
+
+void lru_gen_init_lrugen(struct lruvec *lruvec)
+{
+ int i;
+ int gen, type, zone;
+ struct lrugen *lrugen = &lruvec->evictable;
+
+ lrugen->max_seq = MIN_NR_GENS + 1;
+ lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
+ lrugen->enabled[1] = lru_gen_enabled();
+
+ for (i = 0; i <= MIN_NR_GENS + 1; i++)
+ lrugen->timestamps[i] = jiffies;
+
+ for_each_gen_type_zone(gen, type, zone)
+ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
+}
+
+static int __init init_lru_gen(void)
+{
+ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
+ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
+
+ if (hotplug_memory_notifier(mem_notifier, 0))
+ pr_err("lru_gen: failed to subscribe hotplug notifications\n");
+
+ return 0;
+}
+
+/*
+ * We want to run as early as possible because debug code may call mm_alloc()
+ * and mmput(). Our only dependency, mm_kobj, is initialized one stage earlier.
+ */
+arch_initcall(init_lru_gen);
+
+#endif /* CONFIG_LRU_GEN */
+
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];