@@ -350,10 +350,8 @@ static inline void folio_set_swap_entry(struct folio *folio, swp_entry_t entry)
/* linux/mm/workingset.c */
bool workingset_test_recent(void *shadow, bool file, bool *workingset);
-void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages);
void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg);
void workingset_refault(struct folio *folio, void *shadow);
-void workingset_activation(struct folio *folio);
/* Only track the nodes of mappings with shadow entries */
void workingset_update_node(struct xa_node *node);
@@ -482,7 +482,6 @@ void folio_mark_accessed(struct folio *folio)
else
__lru_cache_activate_folio(folio);
folio_clear_referenced(folio);
- workingset_activation(folio);
}
if (folio_test_idle(folio))
folio_clear_idle(folio);
@@ -2539,8 +2539,6 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec,
lruvec_add_folio(lruvec, folio);
nr_pages = folio_nr_pages(folio);
nr_moved += nr_pages;
- if (folio_test_active(folio))
- workingset_age_nonresident(lruvec, nr_pages);
}
/*
@@ -180,9 +180,10 @@
*/
#define WORKINGSET_SHIFT 1
-#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \
+#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \
WORKINGSET_SHIFT + NODES_SHIFT + \
MEM_CGROUP_ID_SHIFT)
+#define EVICTION_BITS (BITS_PER_LONG - (EVICTION_SHIFT))
#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
/*
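
For orientation (not part of the patch): a minimal userspace sketch of how
the shadow-entry bit budget works out. NODES_SHIFT and MEM_CGROUP_ID_SHIFT
below are assumed config values for a typical x86_64 CONFIG_MEMCG=y build;
the xarray steals one tag bit per value entry, hence BITS_PER_XA_VALUE ==
BITS_PER_LONG - 1.

#include <stdio.h>

#define BITS_PER_LONG		64
#define BITS_PER_XA_VALUE	(BITS_PER_LONG - 1)	/* one xarray tag bit */
#define WORKINGSET_SHIFT	1
#define NODES_SHIFT		6	/* assumed config value */
#define MEM_CGROUP_ID_SHIFT	16	/* assumed: CONFIG_MEMCG=y */

#define EVICTION_SHIFT	((BITS_PER_LONG - BITS_PER_XA_VALUE) + \
			 WORKINGSET_SHIFT + NODES_SHIFT + \
			 MEM_CGROUP_ID_SHIFT)
#define EVICTION_BITS	(BITS_PER_LONG - (EVICTION_SHIFT))
#define EVICTION_MASK	(~0UL >> EVICTION_SHIFT)

int main(void)
{
	/* 24 bits of metadata leave a 40-bit bucketed eviction timestamp */
	printf("shift=%d bits=%d mask=%#lx\n",
	       EVICTION_SHIFT, EVICTION_BITS, EVICTION_MASK);
	return 0;
}
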
@@ -226,8 +227,105 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
*workingsetp = workingset;
}
-#ifdef CONFIG_LRU_GEN
+/*
+ * Get a bucketed, truncated reading of the lruvec's nonresident_age
+ * at eviction time, for packing into the shadow entry.
+ */
+static inline unsigned long lru_eviction(struct lruvec *lruvec,
+ int bits, int bucket_order)
+{
+ unsigned long eviction = atomic_long_read(&lruvec->nonresident_age);
+
+ eviction >>= bucket_order;
+ eviction &= ~0UL >> (BITS_PER_LONG - bits);
+
+ return eviction;
+}
+
+/*
+ * Calculate the refault distance and test whether it is small enough
+ * for the refault to count as a recent (workingset) access.
+ */
+static bool lru_refault(struct mem_cgroup *memcg,
+ struct lruvec *lruvec,
+ unsigned long eviction,
+ int bits, int bucket_order)
+{
+ unsigned long refault, distance;
+ unsigned long active, inactive;
+
+ eviction <<= bucket_order;
+ refault = atomic_long_read(&lruvec->nonresident_age);
+
+ /*
+ * The unsigned subtraction here gives an accurate distance
+ * across nonresident_age overflows in most cases. There is a
+ * special case: usually, shadow entries have a short lifetime
+ * and are either refaulted or reclaimed along with the inode
+ * before they get too old. But it is not impossible for the
+ * nonresident_age to lap a shadow entry in the field, which
+ * can then result in a false small refault distance, leading
+ * to a false activation should this old entry actually
+ * refault again. However, earlier kernels used to deactivate
+ * unconditionally with *every* reclaim invocation for the
+ * longest time, so the occasional inappropriate activation
+ * leading to pressure on the active list is not a problem.
+ */
+ distance = (refault - eviction) & (~0UL >> (BITS_PER_LONG - bits));
+
+ active = lruvec_page_state(lruvec, NR_ACTIVE_FILE);
+ inactive = lruvec_page_state(lruvec, NR_INACTIVE_FILE);
+ if (mem_cgroup_get_nr_swap_pages(memcg) > 0) {
+ active += lruvec_page_state(lruvec, NR_ACTIVE_ANON);
+ inactive += lruvec_page_state(lruvec, NR_INACTIVE_ANON);
+ }
+
+	/*
+	 * When there are already enough active pages, be less aggressive
+	 * about activating pages: challenging an already established
+	 * workingset with a one-time refaulted page may not be a good
+	 * idea, especially as the gap between the active workingset and
+	 * the inactive queue grows larger. The threshold tightens as the
+	 * lists diverge; e.g. with active at 8x inactive (a 3-bit fls
+	 * delta), only distances below inactive/4 qualify.
+	 */
+ if (active > inactive)
+ return distance < inactive >> (1 + (fls_long(active) - fls_long(inactive)) / 2);
+
+ /*
+ * Compare the distance to the existing workingset size. We
+ * don't activate pages that couldn't stay resident even if
+ * all the memory was available to the workingset. Whether
+ * workingset competition needs to consider anon or not depends
+ * on having free swap space.
+ */
+ return distance < active;
+}
+
+/**
+ * workingset_age_nonresident - age non-resident entries as LRU ages
+ * @lruvec: the lruvec that was aged
+ * @nr_pages: the number of pages to count
+ *
+ * As in-memory pages are aged, non-resident pages need to be aged as
+ * well, in order for the refault distances later on to be comparable
+ * to the in-memory dimensions. This function allows reclaim and LRU
+ * operations to drive the non-resident aging along in parallel.
+ */
+static void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
+{
+ /*
+ * Reclaiming a cgroup means reclaiming all its children in a
+ * round-robin fashion. That means that each cgroup has an LRU
+ * order that is composed of the LRU orders of its child
+ * cgroups; and every page has an LRU position not just in the
+ * cgroup that owns it, but in all of that group's ancestors.
+ *
+ * So when the physical inactive list of a leaf cgroup ages,
+ * the virtual inactive lists of all its parents, including
+ * the root cgroup's, age as well.
+ */
+ do {
+ atomic_long_add(nr_pages, &lruvec->nonresident_age);
+ } while ((lruvec = parent_lruvec(lruvec)));
+}
+#ifdef CONFIG_LRU_GEN
static void *lru_gen_eviction(struct folio *folio)
{
int hist;
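
The wraparound behavior described by the comment in lru_refault() can be
demonstrated standalone. This is an illustrative sketch, not kernel code:
8-bit timestamps stand in for the EVICTION_BITS-wide field, and a 64-bit
unsigned long is assumed.

#include <stdio.h>

/* Same arithmetic as lru_refault(): unsigned subtraction, masked to the
 * timestamp width, stays accurate across one counter overflow.
 */
static unsigned long distance(unsigned long refault, unsigned long eviction,
			      int bits)
{
	return (refault - eviction) & (~0UL >> (64 - bits));
}

int main(void)
{
	/* No wrap: age advanced from 100 to 130 -> distance 30 */
	printf("%lu\n", distance(130, 100, 8));
	/* Wrapped: evicted at 250, counter passed 255 and reached 20 -> 26 */
	printf("%lu\n", distance(20, 250, 8));
	return 0;
}
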
@@ -342,34 +440,6 @@ static void lru_gen_refault(struct folio *folio, void *shadow)
#endif /* CONFIG_LRU_GEN */
-/**
- * workingset_age_nonresident - age non-resident entries as LRU ages
- * @lruvec: the lruvec that was aged
- * @nr_pages: the number of pages to count
- *
- * As in-memory pages are aged, non-resident pages need to be aged as
- * well, in order for the refault distances later on to be comparable
- * to the in-memory dimensions. This function allows reclaim and LRU
- * operations to drive the non-resident aging along in parallel.
- */
-void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
-{
- /*
- * Reclaiming a cgroup means reclaiming all its children in a
- * round-robin fashion. That means that each cgroup has an LRU
- * order that is composed of the LRU orders of its child
- * cgroups; and every page has an LRU position not just in the
- * cgroup that owns it, but in all of that group's ancestors.
- *
- * So when the physical inactive list of a leaf cgroup ages,
- * the virtual inactive lists of all its parents, including
- * the root cgroup's, age as well.
- */
- do {
- atomic_long_add(nr_pages, &lruvec->nonresident_age);
- } while ((lruvec = parent_lruvec(lruvec)));
-}
-
/**
* workingset_eviction - note the eviction of a folio from memory
* @target_memcg: the cgroup that is causing the reclaim
@@ -396,11 +466,11 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
/* XXX: target_memcg can be NULL, go through lruvec */
memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
- eviction = atomic_long_read(&lruvec->nonresident_age);
- eviction >>= bucket_order;
+
+ eviction = lru_eviction(lruvec, EVICTION_BITS, bucket_order);
workingset_age_nonresident(lruvec, folio_nr_pages(folio));
return pack_shadow(memcgid, pgdat, eviction,
- folio_test_workingset(folio));
+ folio_test_workingset(folio));
}
/**
@@ -418,9 +488,6 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset)
{
struct mem_cgroup *eviction_memcg;
struct lruvec *eviction_lruvec;
- unsigned long refault_distance;
- unsigned long workingset_size;
- unsigned long refault;
int memcgid;
struct pglist_data *pgdat;
unsigned long eviction;
@@ -429,7 +496,6 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset)
return lru_gen_test_recent(shadow, file, &eviction_lruvec, &eviction, workingset);
unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
- eviction <<= bucket_order;
/*
* Look up the memcg associated with the stored ID. It might
@@ -450,50 +516,10 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset)
eviction_memcg = mem_cgroup_from_id(memcgid);
if (!mem_cgroup_disabled() && !eviction_memcg)
return false;
-
eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
- refault = atomic_long_read(&eviction_lruvec->nonresident_age);
- /*
- * Calculate the refault distance
- *
- * The unsigned subtraction here gives an accurate distance
- * across nonresident_age overflows in most cases. There is a
- * special case: usually, shadow entries have a short lifetime
- * and are either refaulted or reclaimed along with the inode
- * before they get too old. But it is not impossible for the
- * nonresident_age to lap a shadow entry in the field, which
- * can then result in a false small refault distance, leading
- * to a false activation should this old entry actually
- * refault again. However, earlier kernels used to deactivate
- * unconditionally with *every* reclaim invocation for the
- * longest time, so the occasional inappropriate activation
- * leading to pressure on the active list is not a problem.
- */
- refault_distance = (refault - eviction) & EVICTION_MASK;
-
- /*
- * Compare the distance to the existing workingset size. We
- * don't activate pages that couldn't stay resident even if
- * all the memory was available to the workingset. Whether
- * workingset competition needs to consider anon or not depends
- * on having free swap space.
- */
- workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
- if (!file) {
- workingset_size += lruvec_page_state(eviction_lruvec,
- NR_INACTIVE_FILE);
- }
- if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) {
- workingset_size += lruvec_page_state(eviction_lruvec,
- NR_ACTIVE_ANON);
- if (file) {
- workingset_size += lruvec_page_state(eviction_lruvec,
- NR_INACTIVE_ANON);
- }
- }
-
- return refault_distance <= workingset_size;
+ return lru_refault(eviction_memcg, eviction_lruvec, eviction,
+ EVICTION_BITS, bucket_order);
}
/**
@@ -543,7 +569,6 @@ void workingset_refault(struct folio *folio, void *shadow)
goto out;
folio_set_active(folio);
- workingset_age_nonresident(lruvec, nr);
mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr);
/* Folio was active prior to eviction */
@@ -560,30 +585,6 @@ void workingset_refault(struct folio *folio, void *shadow)
rcu_read_unlock();
}
-/**
- * workingset_activation - note a page activation
- * @folio: Folio that is being activated.
- */
-void workingset_activation(struct folio *folio)
-{
- struct mem_cgroup *memcg;
-
- rcu_read_lock();
- /*
- * Filter non-memcg pages here, e.g. unmap can call
- * mark_page_accessed() on VDSO pages.
- *
- * XXX: See workingset_refault() - this should return
- * root_mem_cgroup even for !CONFIG_MEMCG.
- */
- memcg = folio_memcg_rcu(folio);
- if (!mem_cgroup_disabled() && !memcg)
- goto out;
- workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
-out:
- rcu_read_unlock();
-}
-
/*
* Shadow entries reflect the share of the working set that does not
* fit into memory, so their number depends on the access pattern of
@@ -777,7 +778,6 @@ static struct lock_class_key shadow_nodes_key;
static int __init workingset_init(void)
{
- unsigned int timestamp_bits;
unsigned int max_order;
int ret;
@@ -789,12 +789,11 @@ static int __init workingset_init(void)
* some more pages at runtime, so keep working with up to
* double the initial memory by using totalram_pages as-is.
*/
- timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
max_order = fls_long(totalram_pages() - 1);
- if (max_order > timestamp_bits)
- bucket_order = max_order - timestamp_bits;
+ if (max_order > EVICTION_BITS)
+ bucket_order = max_order - EVICTION_BITS;
pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
- timestamp_bits, max_order, bucket_order);
+ EVICTION_BITS, max_order, bucket_order);
ret = prealloc_shrinker(&workingset_shadow_shrinker, "mm-shadow");
if (ret)
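
A worked example of the bucket_order calculation, under assumed values: a
32-bit config where EVICTION_BITS comes to 14 (NODES_SHIFT = 0,
MEM_CGROUP_ID_SHIFT = 16, WORKINGSET_SHIFT = 1, plus one xarray tag bit,
so EVICTION_SHIFT = 18) and 4GB of RAM, i.e. 2^20 4K pages.

#include <stdio.h>

/* Minimal stand-in for the kernel's fls_long() */
static unsigned int fls_long(unsigned long x)
{
	return x ? 8 * sizeof(long) - __builtin_clzl(x) : 0;
}

int main(void)
{
	unsigned int eviction_bits = 14;		/* assumed, see above */
	unsigned long totalram_pages = 1UL << 20;	/* 4GB of 4K pages */
	unsigned int max_order = fls_long(totalram_pages - 1);	/* == 20 */
	unsigned int bucket_order = 0;

	if (max_order > eviction_bits)
		bucket_order = max_order - eviction_bits;

	/* bucket_order == 6: the timestamp ticks once per 64 aged pages */
	printf("max_order=%u bucket_order=%u\n", max_order, bucket_order);
	return 0;
}
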