@@ -2467,6 +2467,69 @@ static inline void calculate_pressure_balance(struct scan_control *sc,
*denominator = ap + fp;
}
+static unsigned long apply_proportional_protection(struct mem_cgroup *memcg,
+ struct scan_control *sc, unsigned long scan)
+{
+ unsigned long min, low;
+
+ mem_cgroup_protection(sc->target_mem_cgroup, memcg, &min, &low);
+
+ if (min || low) {
+ /*
+ * Scale a cgroup's reclaim pressure by proportioning
+ * its current usage to its memory.low or memory.min
+ * setting.
+ *
+ * This is important, as otherwise scanning aggression
+ * becomes extremely binary -- from nothing as we
+ * approach the memory protection threshold, to totally
+ * nominal as we exceed it. This results in requiring
+ * setting extremely liberal protection thresholds. It
+ * also means we simply get no protection at all if we
+ * set it too low, which is not ideal.
+ *
+ * If there is any protection in place, we reduce scan
+ * pressure by how much of the total memory used is
+ * within protection thresholds.
+ *
+ * There is one special case: in the first reclaim pass,
+ * we skip over all groups that are within their low
+ * protection. If that fails to reclaim enough pages to
+ * satisfy the reclaim goal, we come back and override
+ * the best-effort low protection. However, we still
+ * ideally want to honor how well-behaved groups are in
+ * that case instead of simply punishing them all
+ * equally. As such, we reclaim them based on how much
+ * memory they are using, reducing the scan pressure
+ * again by how much of the total memory used is under
+ * hard protection.
+ */
+ unsigned long cgroup_size = mem_cgroup_size(memcg);
+ unsigned long protection;
+
+ /* memory.low scaling, make sure we retry before OOM */
+ if (!sc->memcg_low_reclaim && low > min) {
+ protection = low;
+ sc->memcg_low_skipped = 1;
+ } else {
+ protection = min;
+ }
+
+ /* Avoid TOCTOU with earlier protection check */
+ cgroup_size = max(cgroup_size, protection);
+
+ scan -= scan * protection / (cgroup_size + 1);
+
+ /*
+ * Minimally target SWAP_CLUSTER_MAX pages to keep
+ * reclaim moving forwards, avoiding decrementing
+ * sc->priority further than desirable.
+ */
+ scan = max(scan, SWAP_CLUSTER_MAX);
+ }
+ return scan;
+}
+
/*
* Determine how aggressively the anon and file LRU lists should be
* scanned.
@@ -2537,70 +2600,10 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
for_each_evictable_lru(lru) {
bool file = is_file_lru(lru);
unsigned long lruvec_size;
- unsigned long low, min;
unsigned long scan;
lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
- mem_cgroup_protection(sc->target_mem_cgroup, memcg,
- &min, &low);
-
- if (min || low) {
- /*
- * Scale a cgroup's reclaim pressure by proportioning
- * its current usage to its memory.low or memory.min
- * setting.
- *
- * This is important, as otherwise scanning aggression
- * becomes extremely binary -- from nothing as we
- * approach the memory protection threshold, to totally
- * nominal as we exceed it. This results in requiring
- * setting extremely liberal protection thresholds. It
- * also means we simply get no protection at all if we
- * set it too low, which is not ideal.
- *
- * If there is any protection in place, we reduce scan
- * pressure by how much of the total memory used is
- * within protection thresholds.
- *
- * There is one special case: in the first reclaim pass,
- * we skip over all groups that are within their low
- * protection. If that fails to reclaim enough pages to
- * satisfy the reclaim goal, we come back and override
- * the best-effort low protection. However, we still
- * ideally want to honor how well-behaved groups are in
- * that case instead of simply punishing them all
- * equally. As such, we reclaim them based on how much
- * memory they are using, reducing the scan pressure
- * again by how much of the total memory used is under
- * hard protection.
- */
- unsigned long cgroup_size = mem_cgroup_size(memcg);
- unsigned long protection;
-
- /* memory.low scaling, make sure we retry before OOM */
- if (!sc->memcg_low_reclaim && low > min) {
- protection = low;
- sc->memcg_low_skipped = 1;
- } else {
- protection = min;
- }
-
- /* Avoid TOCTOU with earlier protection check */
- cgroup_size = max(cgroup_size, protection);
-
- scan = lruvec_size - lruvec_size * protection /
- (cgroup_size + 1);
-
- /*
- * Minimally target SWAP_CLUSTER_MAX pages to keep
- * reclaim moving forwards, avoiding decrementing
- * sc->priority further than desirable.
- */
- scan = max(scan, SWAP_CLUSTER_MAX);
- } else {
- scan = lruvec_size;
- }
-
+ scan = apply_proportional_protection(memcg, sc, lruvec_size);
scan >>= sc->priority;
/*
@@ -4521,8 +4524,9 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
return true;
}
-static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
- int type, int tier, struct list_head *list)
+static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
+ struct scan_control *sc, int type, int tier,
+ struct list_head *list)
{
int i;
int gen;
@@ -4531,7 +4535,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
int scanned = 0;
int isolated = 0;
int skipped = 0;
- int remaining = MAX_LRU_BATCH;
+ int remaining = min(nr_to_scan, MAX_LRU_BATCH);
struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
@@ -4642,7 +4646,8 @@ static int get_type_to_scan(struct lruvec *lruvec, int swappiness)
return positive_ctrl_err(&sp, &pv);
}
-static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+static int isolate_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
+ struct scan_control *sc, int swappiness,
int *type_scanned, struct list_head *list)
{
int i;
@@ -4654,7 +4659,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
*type_scanned = type;
- scanned = scan_folios(lruvec, sc, type, tier, list);
+ scanned = scan_folios(nr_to_scan, lruvec, sc, type, tier, list);
if (scanned)
return scanned;
@@ -4664,7 +4669,8 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
return 0;
}
-static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
+static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
+ struct scan_control *sc, int swappiness)
{
int type;
int scanned;
@@ -4683,7 +4689,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
spin_lock_irq(&lruvec->lru_lock);
- scanned = isolate_folios(lruvec, sc, swappiness, &type, &list);
+ scanned = isolate_folios(nr_to_scan, lruvec, sc, swappiness, &type, &list);
scanned += try_to_inc_min_seq(lruvec, swappiness);
@@ -4804,6 +4810,8 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int s
if (nr_to_scan && !mem_cgroup_online(memcg))
return nr_to_scan;
+ nr_to_scan = apply_proportional_protection(memcg, sc, nr_to_scan);
+
/* try to get away with not aging at the default priority */
if (!success || sc->priority == DEF_PRIORITY)
return nr_to_scan >> sc->priority;
@@ -4856,7 +4864,7 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
if (nr_to_scan <= 0)
break;
- delta = evict_folios(lruvec, sc, swappiness);
+ delta = evict_folios(nr_to_scan, lruvec, sc, swappiness);
if (!delta)
break;
@@ -5477,7 +5485,7 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co
if (sc->nr_reclaimed >= nr_to_reclaim)
return 0;
- if (!evict_folios(lruvec, sc, swappiness))
+ if (!evict_folios(MAX_LRU_BATCH, lruvec, sc, swappiness))
return 0;
cond_resched();
The scan implementation for MGLRU was missing proportional reclaim
pressure for memcg, which contradicts the description in
Documentation/admin-guide/cgroup-v2.rst (memory.{low,min} section).

This issue was revealed by the LTP memcontrol03 [1] test case. The
following example output from a local test environment with no NUMA
shows that, prior to this patch, proportional protection was not
working:

* Without this patch (MGLRU enabled):

  $ sudo LTP_SINGLE_FS_TYPE=xfs ./memcontrol03
  ...
  memcontrol03.c:214: TPASS: Expect: (A/B/C memory.current=25964544) ~= 34603008
  memcontrol03.c:216: TPASS: Expect: (A/B/D memory.current=26038272) ~= 17825792
  ...

* With this patch (MGLRU enabled):

  $ sudo LTP_SINGLE_FS_TYPE=xfs ./memcontrol03
  ...
  memcontrol03.c:214: TPASS: Expect: (A/B/C memory.current=29327360) ~= 34603008
  memcontrol03.c:216: TPASS: Expect: (A/B/D memory.current=23748608) ~= 17825792
  ...

* When MGLRU is disabled:

  $ sudo LTP_SINGLE_FS_TYPE=xfs ./memcontrol03
  ...
  memcontrol03.c:214: TPASS: Expect: (A/B/C memory.current=28819456) ~= 34603008
  memcontrol03.c:216: TPASS: Expect: (A/B/D memory.current=24018944) ~= 17825792
  ...

Note that the test reports TPASS in all three cases only because its
pass criteria are lenient. Without this patch, the two cgroups end up
with nearly identical residual memory (~25 MiB each) despite their
different protections, whereas with this patch, or with MGLRU disabled,
the split moves toward the expected 33 MiB / 17 MiB ratio. The
remaining deviation from the expected values is due to the relatively
small memory usage compared to the >> DEF_PRIORITY adjustment.

Factor the proportioning logic out into a new function and have MGLRU
reuse it.

[1] https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/controllers/memcg/memcontrol03.c

Signed-off-by: Koichiro Den <koichiro.den@canonical.com>
---
 mm/vmscan.c | 148 +++++++++++++++++++++++++++-------------------------
 1 file changed, 78 insertions(+), 70 deletions(-)
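
For reference, here is a minimal userspace sketch (not kernel code) of
the arithmetic that apply_proportional_protection() performs. The clamp
and the SWAP_CLUSTER_MAX floor mirror the patch; scale_scan() is a
stand-in name, and the usage/protection figures in main() are
hypothetical, chosen only to illustrate the scaling:

	#include <stdio.h>

	#define SWAP_CLUSTER_MAX 32UL	/* pages, as in the kernel */

	/* Userspace model of the proportional scaling; sizes in pages */
	static unsigned long scale_scan(unsigned long scan,
					unsigned long protection,
					unsigned long cgroup_size)
	{
		/*
		 * Mirror the TOCTOU clamp: usage may have dropped below
		 * the protection threshold since it was last checked.
		 */
		if (cgroup_size < protection)
			cgroup_size = protection;

		/*
		 * Reduce pressure by the protected share of current
		 * usage; the +1 keeps the divisor nonzero when both
		 * values are 0.
		 */
		scan -= scan * protection / (cgroup_size + 1);

		/* Keep reclaim moving forwards */
		return scan < SWAP_CLUSTER_MAX ? SWAP_CLUSTER_MAX : scan;
	}

	int main(void)
	{
		/*
		 * Hypothetical: 100 MiB usage, 50 MiB protected, a
		 * 1024-page scan target (4 KiB pages) -> roughly half
		 * the pressure survives.
		 */
		printf("%lu\n", scale_scan(1024, 50UL << 8, 100UL << 8));  /* 513 */

		/*
		 * Fully protected cgroup during the low override pass
		 * -> clamped to the SWAP_CLUSTER_MAX floor.
		 */
		printf("%lu\n", scale_scan(1024, 100UL << 8, 100UL << 8)); /* 32 */
		return 0;
	}

Note the floor in the second case: even a fully protected cgroup is
still scanned at SWAP_CLUSTER_MAX once the best-effort low protection
is overridden, which is what keeps reclaim making forward progress
instead of decrementing sc->priority further than desirable.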