@@ -1778,6 +1778,25 @@ void unmap_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t nr, bool even_cows);
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows);
+
+static inline void task_enter_nonseq_fault(void)
+{
+ WARN_ON(current->in_nonseq_fault);
+
+ current->in_nonseq_fault = 1;
+}
+
+static inline void task_exit_nonseq_fault(void)
+{
+ WARN_ON(!current->in_nonseq_fault);
+
+ current->in_nonseq_fault = 0;
+}
+
+static inline bool task_in_nonseq_fault(void)
+{
+ return current->in_nonseq_fault;
+}
#else
static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
unsigned long address, unsigned int flags,
@@ -1799,6 +1818,19 @@ static inline void unmap_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t nr, bool even_cows) { }
static inline void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows) { }
+
+static inline void task_enter_nonseq_fault(void)
+{
+}
+
+static inline void task_exit_nonseq_fault(void)
+{
+}
+
+static inline bool task_in_nonseq_fault(void)
+{
+ return false;
+}
#endif
static inline void unmap_shared_mapping_range(struct address_space *mapping,
@@ -843,6 +843,9 @@ struct task_struct {
#ifdef CONFIG_MEMCG
unsigned in_user_fault:1;
#endif
+#ifdef CONFIG_MMU
+ unsigned in_nonseq_fault:1;
+#endif
#ifdef CONFIG_COMPAT_BRK
unsigned brk_randomized:1;
#endif
@@ -4752,6 +4752,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags, struct pt_regs *regs)
{
vm_fault_t ret;
+ bool nonseq_fault = !(vma->vm_flags & VM_SEQ_READ);
__set_current_state(TASK_RUNNING);
@@ -4773,11 +4774,17 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
if (flags & FAULT_FLAG_USER)
mem_cgroup_enter_user_fault();
+ if (nonseq_fault)
+ task_enter_nonseq_fault();
+
if (unlikely(is_vm_hugetlb_page(vma)))
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
else
ret = __handle_mm_fault(vma, address, flags);
+ if (nonseq_fault)
+ task_exit_nonseq_fault();
+
if (flags & FAULT_FLAG_USER) {
mem_cgroup_exit_user_fault();
/*
@@ -411,6 +411,43 @@ static void __lru_cache_activate_page(struct page *page)
local_unlock(&lru_pvecs.lock);
}
+#ifdef CONFIG_LRU_GEN
+static void page_inc_usage(struct page *page)
+{
+ unsigned long usage;
+ unsigned long old_flags, new_flags;
+
+ if (PageUnevictable(page))
+ return;
+
+ /* see the comment on MAX_NR_TIERS */
+ do {
+ new_flags = old_flags = READ_ONCE(page->flags);
+
+ if (!(new_flags & BIT(PG_referenced))) {
+ new_flags |= BIT(PG_referenced);
+ continue;
+ }
+
+ if (!(new_flags & BIT(PG_workingset))) {
+ new_flags |= BIT(PG_workingset);
+ continue;
+ }
+
+ usage = new_flags & LRU_USAGE_MASK;
+ usage = min(usage + BIT(LRU_USAGE_PGOFF), LRU_USAGE_MASK);
+
+ new_flags &= ~LRU_USAGE_MASK;
+ new_flags |= usage;
+ } while (new_flags != old_flags &&
+ cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+}
+#else
+static void page_inc_usage(struct page *page)
+{
+}
+#endif /* CONFIG_LRU_GEN */
+
/*
* Mark a page as having seen activity.
*
@@ -425,6 +462,11 @@ void mark_page_accessed(struct page *page)
{
page = compound_head(page);
+ if (lru_gen_enabled()) {
+ page_inc_usage(page);
+ return;
+ }
+
if (!PageReferenced(page)) {
SetPageReferenced(page);
} else if (PageUnevictable(page)) {
@@ -468,6 +510,11 @@ void lru_cache_add(struct page *page)
VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
VM_BUG_ON_PAGE(PageLRU(page), page);
+ /* see the comment in lru_gen_add_page() */
+ if (lru_gen_enabled() && !PageUnevictable(page) &&
+ task_in_nonseq_fault() && !(current->flags & PF_MEMALLOC))
+ SetPageActive(page);
+
get_page(page);
local_lock(&lru_pvecs.lock);
pvec = this_cpu_ptr(&lru_pvecs.lru_add);
@@ -569,7 +616,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
{
- if (PageActive(page) && !PageUnevictable(page)) {
+ if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
int nr_pages = thp_nr_pages(page);
del_page_from_lru_list(page, lruvec);
@@ -684,7 +731,7 @@ void deactivate_file_page(struct page *page)
*/
void deactivate_page(struct page *page)
{
- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ if (PageLRU(page) && !PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
struct pagevec *pvec;
local_lock(&lru_pvecs.lock);
@@ -1094,9 +1094,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page_private(page) };
- mem_cgroup_swapout(page, swap);
+
+ /* get a shadow entry before page_memcg() is cleared */
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(page, target_memcg);
+ mem_cgroup_swapout(page, swap);
__delete_from_swap_cache(page, swap, shadow);
xa_unlock_irqrestore(&mapping->i_pages, flags);
put_swap_page(page, swap);
@@ -2813,6 +2815,93 @@ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
get_nr_gens(lruvec, 1) <= MAX_NR_GENS;
}
+/******************************************************************************
+ * refault feedback loop
+ ******************************************************************************/
+
+/*
+ * A feedback loop modeled after the PID controller. Currently supports the
+ * proportional (P) and the integral (I) terms; the derivative (D) term can be
+ * added if necessary. The setpoint (SP) is the desired position; the process
+ * variable (PV) is the measured position. The error is the difference between
+ * the SP and the PV. A positive error results in a positive control output
+ * correction, which, in our case, is to allow eviction.
+ *
+ * The P term is the refault rate of the current generation being evicted. The I
+ * term is the exponential moving average of the refault rates of the previous
+ * generations, using the smoothing factor 1/2.
+ *
+ * Our goal is to make sure upper tiers have similar refault rates as the base
+ * tier. That is we try to be fair to all tiers by maintaining similar refault
+ * rates across them.
+ */
+struct controller_pos {
+ unsigned long refaulted;
+ unsigned long total;
+ int gain;
+};
+
+static void read_controller_pos(struct controller_pos *pos, struct lruvec *lruvec,
+ int type, int tier, int gain)
+{
+ struct lrugen *lrugen = &lruvec->evictable;
+ int hist = lru_hist_from_seq(lrugen->min_seq[type]);
+
+ pos->refaulted = lrugen->avg_refaulted[type][tier] +
+ atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+ pos->total = lrugen->avg_total[type][tier] +
+ atomic_long_read(&lrugen->evicted[hist][type][tier]);
+ if (tier)
+ pos->total += lrugen->protected[hist][type][tier - 1];
+ pos->gain = gain;
+}
+
+static void reset_controller_pos(struct lruvec *lruvec, int gen, int type)
+{
+ int tier;
+ int hist = lru_hist_from_seq(gen);
+ struct lrugen *lrugen = &lruvec->evictable;
+ bool carryover = gen == lru_gen_from_seq(lrugen->min_seq[type]);
+
+ if (!carryover && NR_STAT_GENS == 1)
+ return;
+
+ for (tier = 0; tier < MAX_NR_TIERS; tier++) {
+ if (carryover) {
+ unsigned long sum;
+
+ sum = lrugen->avg_refaulted[type][tier] +
+ atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+ WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
+
+ sum = lrugen->avg_total[type][tier] +
+ atomic_long_read(&lrugen->evicted[hist][type][tier]);
+ if (tier)
+ sum += lrugen->protected[hist][type][tier - 1];
+ WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
+
+ if (NR_STAT_GENS > 1)
+ continue;
+ }
+
+ atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
+ atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
+ if (tier)
+ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0);
+ }
+}
+
+static bool positive_ctrl_err(struct controller_pos *sp, struct controller_pos *pv)
+{
+ /*
+ * Allow eviction if the PV has a limited number of refaulted pages or a
+ * lower refault rate than the SP.
+ */
+ return pv->refaulted < SWAP_CLUSTER_MAX ||
+ pv->refaulted * max(sp->total, 1UL) * sp->gain <=
+ sp->refaulted * max(pv->total, 1UL) * pv->gain;
+}
+
/******************************************************************************
* state change
******************************************************************************/
@@ -187,7 +187,6 @@ static unsigned int bucket_order __read_mostly;
static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
bool workingset)
{
- eviction >>= bucket_order;
eviction &= EVICTION_MASK;
eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
@@ -212,10 +211,116 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
*memcgidp = memcgid;
*pgdat = NODE_DATA(nid);
- *evictionp = entry << bucket_order;
+ *evictionp = entry;
*workingsetp = workingset;
}
+#ifdef CONFIG_LRU_GEN
+
+static int page_get_usage(struct page *page)
+{
+ unsigned long flags = READ_ONCE(page->flags);
+
+ BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_USAGE_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
+
+ /* see the comment on MAX_NR_TIERS */
+ return flags & BIT(PG_workingset) ?
+ (flags & LRU_USAGE_MASK) >> LRU_USAGE_PGOFF : 0;
+}
+
+/* Return a token to be stored in the shadow entry of a page being evicted. */
+static void *lru_gen_eviction(struct page *page)
+{
+ int hist, tier;
+ unsigned long token;
+ unsigned long min_seq;
+ struct lruvec *lruvec;
+ struct lrugen *lrugen;
+ int type = page_is_file_lru(page);
+ int usage = page_get_usage(page);
+ bool workingset = PageWorkingset(page);
+ struct mem_cgroup *memcg = page_memcg(page);
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ lrugen = &lruvec->evictable;
+ min_seq = READ_ONCE(lrugen->min_seq[type]);
+ token = (min_seq << LRU_USAGE_WIDTH) | usage;
+
+ hist = lru_hist_from_seq(min_seq);
+ tier = lru_tier_from_usage(usage + workingset);
+ atomic_long_add(thp_nr_pages(page), &lrugen->evicted[hist][type][tier]);
+
+ return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset);
+}
+
+/* Count a refaulted page based on the token stored in its shadow entry. */
+static void lru_gen_refault(struct page *page, void *shadow)
+{
+ int hist, tier, usage;
+ int memcg_id;
+ bool workingset;
+ unsigned long token;
+ unsigned long min_seq;
+ struct lruvec *lruvec;
+ struct lrugen *lrugen;
+ struct mem_cgroup *memcg;
+ struct pglist_data *pgdat;
+ int type = page_is_file_lru(page);
+
+ unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset);
+ if (page_pgdat(page) != pgdat)
+ return;
+
+ rcu_read_lock();
+ memcg = page_memcg_rcu(page);
+ if (mem_cgroup_id(memcg) != memcg_id)
+ goto unlock;
+
+ usage = token & (BIT(LRU_USAGE_WIDTH) - 1);
+ if (usage && !workingset)
+ goto unlock;
+
+ token >>= LRU_USAGE_WIDTH;
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ lrugen = &lruvec->evictable;
+ min_seq = READ_ONCE(lrugen->min_seq[type]);
+ if (token != (min_seq & (EVICTION_MASK >> LRU_USAGE_WIDTH)))
+ goto unlock;
+
+ hist = lru_hist_from_seq(min_seq);
+ tier = lru_tier_from_usage(usage + workingset);
+ atomic_long_add(thp_nr_pages(page), &lrugen->refaulted[hist][type][tier]);
+ inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type);
+
+ /*
+ * Tiers don't offer any protection to pages accessed via page tables.
+ * That's what generations do. Tiers can't fully protect pages after
+ * their usage has exceeded the max value. Conservatively count these
+ * two conditions as stalls even though they might not indicate any real
+ * memory pressure.
+ */
+ if (task_in_nonseq_fault() || usage + workingset == BIT(LRU_USAGE_WIDTH)) {
+ SetPageWorkingset(page);
+ inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type);
+ }
+unlock:
+ rcu_read_unlock();
+}
+
+#else /* CONFIG_LRU_GEN */
+
+static void *lru_gen_eviction(struct page *page)
+{
+ return NULL;
+}
+
+static void lru_gen_refault(struct page *page, void *shadow)
+{
+}
+
+#endif /* CONFIG_LRU_GEN */
+
/**
* workingset_age_nonresident - age non-resident entries as LRU ages
* @lruvec: the lruvec that was aged
@@ -264,10 +369,14 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
+ if (lru_gen_enabled())
+ return lru_gen_eviction(page);
+
lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
/* XXX: target_memcg can be NULL, go through lruvec */
memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
eviction = atomic_long_read(&lruvec->nonresident_age);
+ eviction >>= bucket_order;
workingset_age_nonresident(lruvec, thp_nr_pages(page));
return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
}
@@ -296,7 +405,13 @@ void workingset_refault(struct page *page, void *shadow)
bool workingset;
int memcgid;
+ if (lru_gen_enabled()) {
+ lru_gen_refault(page, shadow);
+ return;
+ }
+
unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
+ eviction <<= bucket_order;
rcu_read_lock();
/*