@@ -437,6 +437,7 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
+ * - mem_cgroup_trylock_pages()
*
* For a kmem folio a caller should hold an rcu read lock to protect memcg
* associated with a kmem folio from being released.
@@ -498,6 +499,7 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
+ * - mem_cgroup_trylock_pages()
*
* For a kmem page a caller should hold an rcu read lock to protect memcg
* associated with a kmem page from being released.
@@ -935,6 +937,23 @@ void unlock_page_memcg(struct page *page);
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val);
+/* try to stabilize folio_memcg() for all the pages in a memcg */
+static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
+{
+ rcu_read_lock();
+
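+ /* folio_memcg() only becomes unstable while cgroup v1 is moving charges */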
+ if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account))
+ return true;
+
+ rcu_read_unlock();
+ return false;
+}
+
+static inline void mem_cgroup_unlock_pages(void)
+{
+ rcu_read_unlock();
+}
+
/* idx can be of type enum memcg_stat_item or node_stat_item */
static inline void mod_memcg_state(struct mem_cgroup *memcg,
int idx, int val)
@@ -1372,6 +1391,18 @@ static inline void folio_memcg_unlock(struct folio *folio)
{
}
+static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
+{
+ /* to match folio_memcg_rcu() */
+ rcu_read_lock();
+ return true;
+}
+
+static inline void mem_cgroup_unlock_pages(void)
+{
+ rcu_read_unlock();
+}
+
static inline void mem_cgroup_handle_over_high(void)
{
}
@@ -1588,6 +1588,11 @@ static inline unsigned long folio_pfn(struct folio *folio)
return page_to_pfn(&folio->page);
}
+static inline struct folio *pfn_folio(unsigned long pfn)
+{
+ return page_folio(pfn_to_page(pfn));
+}
+
/* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */
#ifdef CONFIG_MIGRATION
static inline bool is_pinnable_page(struct page *page)
@@ -304,6 +304,7 @@ enum lruvec_flags {
};
struct lruvec;
+struct page_vma_mapped_walk;
#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
@@ -399,6 +400,7 @@ struct lru_gen_struct {
};
void lru_gen_init_lruvec(struct lruvec *lruvec);
+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
#ifdef CONFIG_MEMCG
void lru_gen_init_memcg(struct mem_cgroup *memcg);
@@ -411,6 +413,10 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
{
}
+static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+{
+}
+
#ifdef CONFIG_MEMCG
static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
@@ -372,6 +372,7 @@ extern void lru_add_drain(void);
extern void lru_add_drain_cpu(int cpu);
extern void lru_add_drain_cpu_zone(struct zone *zone);
extern void lru_add_drain_all(void);
+extern void folio_activate(struct folio *folio);
extern void deactivate_file_page(struct page *page);
extern void deactivate_page(struct page *page);
extern void mark_page_lazyfree(struct page *page);
@@ -2744,6 +2744,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
+ * - mem_cgroup_trylock_pages()
*/
folio->memcg_data = (unsigned long)memcg;
}
@@ -73,6 +73,7 @@
#include <linux/page_idle.h>
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>
+#include <linux/mm_inline.h>
#include <asm/tlbflush.h>
@@ -819,6 +820,12 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
}
if (pvmw.pte) {
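+ /* the multi-gen LRU promotes nearby hot pages by looking around a young PTE */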
+ if (lru_gen_enabled() && pte_young(*pvmw.pte) &&
+ !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) {
+ lru_gen_look_around(&pvmw);
+ referenced++;
+ }
+
if (ptep_clear_flush_young_notify(vma, address,
pvmw.pte)) {
/*
@@ -344,7 +344,7 @@ static bool need_activate_page_drain(int cpu)
return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0;
}
-static void folio_activate(struct folio *folio)
+void folio_activate(struct folio *folio)
{
if (folio_test_lru(folio) && !folio_test_active(folio) &&
!folio_test_unevictable(folio)) {
@@ -364,7 +364,7 @@ static inline void activate_page_drain(int cpu)
{
}
-static void folio_activate(struct folio *folio)
+void folio_activate(struct folio *folio)
{
struct lruvec *lruvec;
@@ -1558,6 +1558,11 @@ static unsigned int shrink_page_list(struct list_head *page_list,
if (!sc->may_unmap && page_mapped(page))
goto keep_locked;
+ /* folio_update_gen() tried to promote this page? */
+ if (lru_gen_enabled() && !ignore_references &&
+ page_mapped(page) && PageReferenced(page))
+ goto keep_locked;
+
may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
@@ -3225,6 +3230,31 @@ static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
* the aging
******************************************************************************/
+static int folio_update_gen(struct folio *folio, int gen)
+{
+ unsigned long old_flags, new_flags;
+
+ VM_BUG_ON(gen >= MAX_NR_GENS);
+ VM_BUG_ON(!rcu_read_lock_held());
+
+ do {
+ new_flags = old_flags = READ_ONCE(folio->flags);
+
+ /* not on a multi-gen LRU list? set PG_referenced so shrink_page_list() keeps it */
+ if (!(new_flags & LRU_GEN_MASK)) {
+ new_flags |= BIT(PG_referenced);
+ continue;
+ }
+
+ new_flags &= ~LRU_GEN_MASK;
+ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
+ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS);
+ } while (new_flags != old_flags &&
+ cmpxchg(&folio->flags, old_flags, new_flags) != old_flags);
+
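+ /* return the generation this folio was in, or -1 if it was not on a list */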
+ return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+}
+
static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
unsigned long old_flags, new_flags;
@@ -3237,6 +3267,10 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
VM_BUG_ON_FOLIO(!(new_flags & LRU_GEN_MASK), folio);
new_gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+ /* folio_update_gen() has promoted this page? */
+ if (new_gen >= 0 && new_gen != old_gen)
+ return new_gen;
+
new_gen = (old_gen + 1) % MAX_NR_GENS;
new_flags &= ~LRU_GEN_MASK;
@@ -3438,6 +3472,122 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
}
+/*
+ * This function exploits spatial locality when shrink_page_list() walks the
+ * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages.
+ */
+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+{
+ int i;
+ pte_t *pte;
+ unsigned long start;
+ unsigned long end;
+ unsigned long addr;
+ struct folio *folio;
+ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
+ struct mem_cgroup *memcg = page_memcg(pvmw->page);
+ struct pglist_data *pgdat = page_pgdat(pvmw->page);
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ DEFINE_MAX_SEQ(lruvec);
+ int old_gen, new_gen = lru_gen_from_seq(max_seq);
+
+ lockdep_assert_held(pvmw->ptl);
+ VM_BUG_ON_PAGE(PageLRU(pvmw->page), pvmw->page);
+
+ start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
+ end = pmd_addr_end(pvmw->address, pvmw->vma->vm_end);
+
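+ /* clamp the scan window to MIN_LRU_BATCH pages around the young PTE */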
+ if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
+ if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
+ end = start + MIN_LRU_BATCH * PAGE_SIZE;
+ else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2)
+ start = end - MIN_LRU_BATCH * PAGE_SIZE;
+ else {
+ start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2;
+ end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2;
+ }
+ }
+
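+ /* point pte at the entry that maps the start of the window */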
+ pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE;
+
+ rcu_read_lock();
+ arch_enter_lazy_mmu_mode();
+
+ for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
+ unsigned long pfn = pte_pfn(pte[i]);
+
+ VM_BUG_ON(addr < pvmw->vma->vm_start || addr >= pvmw->vma->vm_end);
+
+ if (!pte_present(pte[i]) || is_zero_pfn(pfn))
+ continue;
+
+ if (WARN_ON_ONCE(pte_devmap(pte[i]) || pte_special(pte[i])))
+ continue;
+
+ if (!pte_young(pte[i]))
+ continue;
+
+ VM_BUG_ON(!pfn_valid(pfn));
+ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+ continue;
+
+ folio = pfn_folio(pfn);
+ if (folio_nid(folio) != pgdat->node_id)
+ continue;
+
+ if (folio_memcg_rcu(folio) != memcg)
+ continue;
+
+ if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
+ continue;
+
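+ /* propagate the dirty bit, unless this is swap-backed anon not in the swap cache */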
+ if (pte_dirty(pte[i]) && !folio_test_dirty(folio) &&
+ !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio)))
+ folio_mark_dirty(folio);
+
+ old_gen = folio_lru_gen(folio);
+ if (old_gen < 0)
+ folio_set_referenced(folio);
+ else if (old_gen != new_gen)
+ __set_bit(i, bitmap);
+ }
+
+ arch_leave_lazy_mmu_mode();
+ rcu_read_unlock();
+
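+ /* activate a small batch directly; update generations in bulk otherwise */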
+ if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
+ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
+ folio = page_folio(pte_page(pte[i]));
+ folio_activate(folio);
+ }
+ return;
+ }
+
+ /* folio_update_gen() requires stable folio_memcg() */
+ if (!mem_cgroup_trylock_pages(memcg))
+ return;
+
+ spin_lock_irq(&lruvec->lru_lock);
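+ /* re-read max_seq under the LRU lock in case the aging has advanced it */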
+ new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
+
+ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
+ folio = page_folio(pte_page(pte[i]));
+ if (folio_memcg_rcu(folio) != memcg)
+ continue;
+
+ old_gen = folio_update_gen(folio, new_gen);
+ if (old_gen < 0 || old_gen == new_gen)
+ continue;
+
+ lru_gen_update_size(lruvec, folio, old_gen, new_gen);
+ }
+
+ spin_unlock_irq(&lruvec->lru_lock);
+
+ mem_cgroup_unlock_pages();
+}
+
/******************************************************************************
* the eviction
******************************************************************************/
@@ -3471,6 +3621,11 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
return true;
}
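+ /* already promoted by folio_update_gen()? move it to its new generation */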
+ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
+ list_move(&folio->lru, &lrugen->lists[gen][type][zone]);
+ return true;
+ }
+
if (tier > tier_idx) {
int hist = lru_hist_from_seq(lrugen->min_seq[type]);