[v4,05/11] mm: multigenerational lru: protection

Message ID	20210818063107.2696454-6-yuzhao@google.com (mailing list archive)
State	New
Headers	show Return-Path: <SRS0=AujH=NJ=kvack.org=owner-linux-mm@kernel.org> DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org 3BCCD60720 Date: Wed, 18 Aug 2021 00:31:01 -0600 In-Reply-To: <20210818063107.2696454-1-yuzhao@google.com> Message-Id: <20210818063107.2696454-6-yuzhao@google.com> Mime-Version: 1.0 References: <20210818063107.2696454-1-yuzhao@google.com> Subject: [PATCH v4 05/11] mm: multigenerational lru: protection From: Yu Zhao <yuzhao@google.com> To: linux-mm@kvack.org Cc: linux-kernel@vger.kernel.org, Hillf Danton <hdanton@sina.com>, page-reclaim@google.com, Yu Zhao <yuzhao@google.com>, Konstantin Kharlamov <Hi-Angel@yandex.ru> Content-Type: text/plain; charset="UTF-8" Sender: owner-linux-mm@kvack.org Precedence: bulk
Series	Multigenerational LRU Framework \| expand [v4,00/11] Multigenerational LRU Framework [v4,01/11] mm: x86, arm64: add arch_has_hw_pte_young() [v4,02/11] mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG [v4,03/11] mm/vmscan.c: refactor shrink_node() [v4,04/11] mm: multigenerational lru: groundwork [v4,05/11] mm: multigenerational lru: protection [v4,06/11] mm: multigenerational lru: mm_struct list [v4,07/11] mm: multigenerational lru: aging [v4,08/11] mm: multigenerational lru: eviction [v4,09/11] mm: multigenerational lru: user interface [v4,10/11] mm: multigenerational lru: Kconfig [v4,11/11] mm: multigenerational lru: documentation

diff --git a/include/linux/mm.h b/include/linux/mm.h index 159b7c94e067..7a91518792ba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1778,6 +1778,25 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); + +static inline void task_enter_nonseq_fault(void) +{ + WARN_ON(current->in_nonseq_fault); + + current->in_nonseq_fault = 1; +} + +static inline void task_exit_nonseq_fault(void) +{ + WARN_ON(!current->in_nonseq_fault); + + current->in_nonseq_fault = 0; +} + +static inline bool task_in_nonseq_fault(void) +{ + return current->in_nonseq_fault; +} #else static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, @@ -1799,6 +1818,19 @@ static inline void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows) { } static inline void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) { } + +static inline void task_enter_nonseq_fault(void) +{ +} + +static inline void task_exit_nonseq_fault(void) +{ +} + +static inline bool task_in_nonseq_fault(void) +{ + return false; +} #endif static inline void unmap_shared_mapping_range(struct address_space *mapping, diff --git a/include/linux/sched.h b/include/linux/sched.h index ec8d07d88641..fd41c9c86cd1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -843,6 +843,9 @@ struct task_struct { #ifdef CONFIG_MEMCG unsigned in_user_fault:1; #endif +#ifdef CONFIG_MMU + unsigned in_nonseq_fault:1; +#endif #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif diff --git a/mm/memory.c b/mm/memory.c index 2f96179db219..fa40a5b7a7a7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4752,6 +4752,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct pt_regs *regs) { vm_fault_t ret; + bool nonseq_fault = !(vma->vm_flags & VM_SEQ_READ); __set_current_state(TASK_RUNNING); @@ -4773,11 +4774,17 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, if (flags & FAULT_FLAG_USER) mem_cgroup_enter_user_fault(); + if (nonseq_fault) + task_enter_nonseq_fault(); + if (unlikely(is_vm_hugetlb_page(vma))) ret = hugetlb_fault(vma->vm_mm, vma, address, flags); else ret = __handle_mm_fault(vma, address, flags); + if (nonseq_fault) + task_exit_nonseq_fault(); + if (flags & FAULT_FLAG_USER) { mem_cgroup_exit_user_fault(); /* diff --git a/mm/swap.c b/mm/swap.c index 19600430e536..0d3fb2ee3fd6 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -411,6 +411,43 @@ static void __lru_cache_activate_page(struct page *page) local_unlock(&lru_pvecs.lock); } +#ifdef CONFIG_LRU_GEN +static void page_inc_usage(struct page *page) +{ + unsigned long usage; + unsigned long old_flags, new_flags; + + if (PageUnevictable(page)) + return; + + /* see the comment on MAX_NR_TIERS */ + do { + new_flags = old_flags = READ_ONCE(page->flags); + + if (!(new_flags & BIT(PG_referenced))) { + new_flags |= BIT(PG_referenced); + continue; + } + + if (!(new_flags & BIT(PG_workingset))) { + new_flags |= BIT(PG_workingset); + continue; + } + + usage = new_flags & LRU_USAGE_MASK; + usage = min(usage + BIT(LRU_USAGE_PGOFF), LRU_USAGE_MASK); + + new_flags &= ~LRU_USAGE_MASK; + new_flags |= usage; + } while (new_flags != old_flags && + cmpxchg(&page->flags, old_flags, new_flags) != old_flags); +} +#else +static void page_inc_usage(struct page *page) +{ +} +#endif /* CONFIG_LRU_GEN */ + /* * Mark a page as having seen activity. * @@ -425,6 +462,11 @@ void mark_page_accessed(struct page *page) { page = compound_head(page); + if (lru_gen_enabled()) { + page_inc_usage(page); + return; + } + if (!PageReferenced(page)) { SetPageReferenced(page); } else if (PageUnevictable(page)) { @@ -468,6 +510,11 @@ void lru_cache_add(struct page *page) VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); VM_BUG_ON_PAGE(PageLRU(page), page); + /* see the comment in lru_gen_add_page() */ + if (lru_gen_enabled() && !PageUnevictable(page) && + task_in_nonseq_fault() && !(current->flags & PF_MEMALLOC)) + SetPageActive(page); + get_page(page); local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.lru_add); @@ -569,7 +616,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec) static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec) { - if (PageActive(page) && !PageUnevictable(page)) { + if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) { int nr_pages = thp_nr_pages(page); del_page_from_lru_list(page, lruvec); @@ -684,7 +731,7 @@ void deactivate_file_page(struct page *page) */ void deactivate_page(struct page *page) { - if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { + if (PageLRU(page) && !PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) { struct pagevec *pvec; local_lock(&lru_pvecs.lock); diff --git a/mm/vmscan.c b/mm/vmscan.c index a02b5ff37e31..788b4d1ce149 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1094,9 +1094,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, if (PageSwapCache(page)) { swp_entry_t swap = { .val = page_private(page) }; - mem_cgroup_swapout(page, swap); + + /* get a shadow entry before page_memcg() is cleared */ if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(page, target_memcg); + mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page, swap, shadow); xa_unlock_irqrestore(&mapping->i_pages, flags); put_swap_page(page, swap); @@ -2813,6 +2815,93 @@ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) get_nr_gens(lruvec, 1) <= MAX_NR_GENS; } +/****************************************************************************** + * refault feedback loop + ******************************************************************************/ + +/* + * A feedback loop modeled after the PID controller. Currently supports the + * proportional (P) and the integral (I) terms; the derivative (D) term can be + * added if necessary. The setpoint (SP) is the desired position; the process + * variable (PV) is the measured position. The error is the difference between + * the SP and the PV. A positive error results in a positive control output + * correction, which, in our case, is to allow eviction. + * + * The P term is the refault rate of the current generation being evicted. The I + * term is the exponential moving average of the refault rates of the previous + * generations, using the smoothing factor 1/2. + * + * Our goal is to make sure upper tiers have similar refault rates as the base + * tier. That is we try to be fair to all tiers by maintaining similar refault + * rates across them. + */ +struct controller_pos { + unsigned long refaulted; + unsigned long total; + int gain; +}; + +static void read_controller_pos(struct controller_pos *pos, struct lruvec *lruvec, + int type, int tier, int gain) +{ + struct lrugen *lrugen = &lruvec->evictable; + int hist = lru_hist_from_seq(lrugen->min_seq[type]); + + pos->refaulted = lrugen->avg_refaulted[type][tier] + + atomic_long_read(&lrugen->refaulted[hist][type][tier]); + pos->total = lrugen->avg_total[type][tier] + + atomic_long_read(&lrugen->evicted[hist][type][tier]); + if (tier) + pos->total += lrugen->protected[hist][type][tier - 1]; + pos->gain = gain; +} + +static void reset_controller_pos(struct lruvec *lruvec, int gen, int type) +{ + int tier; + int hist = lru_hist_from_seq(gen); + struct lrugen *lrugen = &lruvec->evictable; + bool carryover = gen == lru_gen_from_seq(lrugen->min_seq[type]); + + if (!carryover && NR_STAT_GENS == 1) + return; + + for (tier = 0; tier < MAX_NR_TIERS; tier++) { + if (carryover) { + unsigned long sum; + + sum = lrugen->avg_refaulted[type][tier] + + atomic_long_read(&lrugen->refaulted[hist][type][tier]); + WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); + + sum = lrugen->avg_total[type][tier] + + atomic_long_read(&lrugen->evicted[hist][type][tier]); + if (tier) + sum += lrugen->protected[hist][type][tier - 1]; + WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); + + if (NR_STAT_GENS > 1) + continue; + } + + atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); + atomic_long_set(&lrugen->evicted[hist][type][tier], 0); + if (tier) + WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); + } +} + +static bool positive_ctrl_err(struct controller_pos *sp, struct controller_pos *pv) +{ + /* + * Allow eviction if the PV has a limited number of refaulted pages or a + * lower refault rate than the SP. + */ + return pv->refaulted < SWAP_CLUSTER_MAX || + pv->refaulted * max(sp->total, 1UL) * sp->gain <= + sp->refaulted * max(pv->total, 1UL) * pv->gain; +} + /****************************************************************************** * state change ******************************************************************************/ diff --git a/mm/workingset.c b/mm/workingset.c index 5ba3e42446fa..75dbfba773a6 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -187,7 +187,6 @@ static unsigned int bucket_order __read_mostly; static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, bool workingset) { - eviction >>= bucket_order; eviction &= EVICTION_MASK; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | pgdat->node_id; @@ -212,10 +211,116 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, *memcgidp = memcgid; *pgdat = NODE_DATA(nid); - *evictionp = entry << bucket_order; + *evictionp = entry; *workingsetp = workingset; } +#ifdef CONFIG_LRU_GEN + +static int page_get_usage(struct page *page) +{ + unsigned long flags = READ_ONCE(page->flags); + + BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_USAGE_WIDTH > BITS_PER_LONG - EVICTION_SHIFT); + + /* see the comment on MAX_NR_TIERS */ + return flags & BIT(PG_workingset) ? + (flags & LRU_USAGE_MASK) >> LRU_USAGE_PGOFF : 0; +} + +/* Return a token to be stored in the shadow entry of a page being evicted. */ +static void *lru_gen_eviction(struct page *page) +{ + int hist, tier; + unsigned long token; + unsigned long min_seq; + struct lruvec *lruvec; + struct lrugen *lrugen; + int type = page_is_file_lru(page); + int usage = page_get_usage(page); + bool workingset = PageWorkingset(page); + struct mem_cgroup *memcg = page_memcg(page); + struct pglist_data *pgdat = page_pgdat(page); + + lruvec = mem_cgroup_lruvec(memcg, pgdat); + lrugen = &lruvec->evictable; + min_seq = READ_ONCE(lrugen->min_seq[type]); + token = (min_seq << LRU_USAGE_WIDTH) | usage; + + hist = lru_hist_from_seq(min_seq); + tier = lru_tier_from_usage(usage + workingset); + atomic_long_add(thp_nr_pages(page), &lrugen->evicted[hist][type][tier]); + + return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset); +} + +/* Count a refaulted page based on the token stored in its shadow entry. */ +static void lru_gen_refault(struct page *page, void *shadow) +{ + int hist, tier, usage; + int memcg_id; + bool workingset; + unsigned long token; + unsigned long min_seq; + struct lruvec *lruvec; + struct lrugen *lrugen; + struct mem_cgroup *memcg; + struct pglist_data *pgdat; + int type = page_is_file_lru(page); + + unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset); + if (page_pgdat(page) != pgdat) + return; + + rcu_read_lock(); + memcg = page_memcg_rcu(page); + if (mem_cgroup_id(memcg) != memcg_id) + goto unlock; + + usage = token & (BIT(LRU_USAGE_WIDTH) - 1); + if (usage && !workingset) + goto unlock; + + token >>= LRU_USAGE_WIDTH; + lruvec = mem_cgroup_lruvec(memcg, pgdat); + lrugen = &lruvec->evictable; + min_seq = READ_ONCE(lrugen->min_seq[type]); + if (token != (min_seq & (EVICTION_MASK >> LRU_USAGE_WIDTH))) + goto unlock; + + hist = lru_hist_from_seq(min_seq); + tier = lru_tier_from_usage(usage + workingset); + atomic_long_add(thp_nr_pages(page), &lrugen->refaulted[hist][type][tier]); + inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type); + + /* + * Tiers don't offer any protection to pages accessed via page tables. + * That's what generations do. Tiers can't fully protect pages after + * their usage has exceeded the max value. Conservatively count these + * two conditions as stalls even though they might not indicate any real + * memory pressure. + */ + if (task_in_nonseq_fault() || usage + workingset == BIT(LRU_USAGE_WIDTH)) { + SetPageWorkingset(page); + inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type); + } +unlock: + rcu_read_unlock(); +} + +#else /* CONFIG_LRU_GEN */ + +static void *lru_gen_eviction(struct page *page) +{ + return NULL; +} + +static void lru_gen_refault(struct page *page, void *shadow) +{ +} + +#endif /* CONFIG_LRU_GEN */ + /** * workingset_age_nonresident - age non-resident entries as LRU ages * @lruvec: the lruvec that was aged @@ -264,10 +369,14 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); + if (lru_gen_enabled()) + return lru_gen_eviction(page); + lruvec = mem_cgroup_lruvec(target_memcg, pgdat); /* XXX: target_memcg can be NULL, go through lruvec */ memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); eviction = atomic_long_read(&lruvec->nonresident_age); + eviction >>= bucket_order; workingset_age_nonresident(lruvec, thp_nr_pages(page)); return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); } @@ -296,7 +405,13 @@ void workingset_refault(struct page *page, void *shadow) bool workingset; int memcgid; + if (lru_gen_enabled()) { + lru_gen_refault(page, shadow); + return; + } + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); + eviction <<= bucket_order; rcu_read_lock(); /*

[v4,05/11] mm: multigenerational lru: protection

Commit Message

Patch