diff mbox series

[v9,11/14] mm: multi-gen LRU: thrashing prevention

Message ID 20220309021230.721028-12-yuzhao@google.com (mailing list archive)
State New, archived
Headers show
Series Multi-Gen LRU Framework | expand

Commit Message

Yu Zhao March 9, 2022, 2:12 a.m. UTC
Add /sys/kernel/mm/lru_gen/min_ttl_ms for thrashing prevention, as
requested by many desktop users [1].

When set to value N, it prevents the working set of N milliseconds
from getting evicted. The OOM killer is triggered if this working set
cannot be kept in memory. Based on the average human detectable lag
(~100ms), N=1000 usually eliminates intolerable lags due to thrashing.
Larger values like N=3000 make lags less noticeable at the risk of
premature OOM kills.

Compared with the size-based approach, e.g., [2], this time-based
approach has the following advantages:
1. It is easier to configure because it is agnostic to applications
   and memory sizes.
2. It is more reliable because it is directly wired to the OOM killer.

[1] https://lore.kernel.org/lkml/Ydza%2FzXKY9ATRoh6@google.com/
[2] https://lore.kernel.org/lkml/20211130201652.2218636d@mail.inbox.lv/

Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
---
 include/linux/mmzone.h |  2 ++
 mm/vmscan.c            | 69 +++++++++++++++++++++++++++++++++++++++---
 2 files changed, 67 insertions(+), 4 deletions(-)

Comments

Barry Song March 22, 2022, 7:22 a.m. UTC | #1
On Wed, Mar 9, 2022 at 3:48 PM Yu Zhao <yuzhao@google.com> wrote:
>
> Add /sys/kernel/mm/lru_gen/min_ttl_ms for thrashing prevention, as
> requested by many desktop users [1].
>
> When set to value N, it prevents the working set of N milliseconds
> from getting evicted. The OOM killer is triggered if this working set
> cannot be kept in memory. Based on the average human detectable lag
> (~100ms), N=1000 usually eliminates intolerable lags due to thrashing.
> Larger values like N=3000 make lags less noticeable at the risk of
> premature OOM kills.
>
> Compared with the size-based approach, e.g., [2], this time-based
> approach has the following advantages:
> 1. It is easier to configure because it is agnostic to applications
>    and memory sizes.
> 2. It is more reliable because it is directly wired to the OOM killer.
>

How are userspace OOM daemons like Android lmkd and systemd-oomd supposed
to work with this time-based OOM killer?
Should only one of min_ttl_ms and a userspace daemon be enabled, or should
both be enabled at the same time?

> [1] https://lore.kernel.org/lkml/Ydza%2FzXKY9ATRoh6@google.com/
> [2] https://lore.kernel.org/lkml/20211130201652.2218636d@mail.inbox.lv/
>
> Signed-off-by: Yu Zhao <yuzhao@google.com>
> Acked-by: Brian Geffon <bgeffon@google.com>
> Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
> Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
> Acked-by: Steven Barrett <steven@liquorix.net>
> Acked-by: Suleiman Souhlal <suleiman@google.com>
> Tested-by: Daniel Byrne <djbyrne@mtu.edu>
> Tested-by: Donald Carr <d@chaos-reins.com>
> Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
> Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
> Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
> Tested-by: Sofia Trinh <sofia.trinh@edi.works>
> Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
> ---
>  include/linux/mmzone.h |  2 ++
>  mm/vmscan.c            | 69 +++++++++++++++++++++++++++++++++++++++---
>  2 files changed, 67 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 116c9237e401..f98f9ce50e67 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -403,6 +403,8 @@ struct lru_gen_struct {
>         unsigned long max_seq;
>         /* the eviction increments the oldest generation numbers */
>         unsigned long min_seq[ANON_AND_FILE];
> +       /* the birth time of each generation in jiffies */
> +       unsigned long timestamps[MAX_NR_GENS];
>         /* the multi-gen LRU lists */
>         struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
>         /* the sizes of the above lists */
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 55cc7d6b018b..6aa083b8bb26 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -4229,6 +4229,7 @@ static void inc_max_seq(struct lruvec *lruvec)
>         for (type = 0; type < ANON_AND_FILE; type++)
>                 reset_ctrl_pos(lruvec, type, false);
>
> +       WRITE_ONCE(lrugen->timestamps[next], jiffies);
>         /* make sure preceding modifications appear */
>         smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
>
> @@ -4340,7 +4341,8 @@ static long get_nr_evictable(struct lruvec *lruvec, unsigned long max_seq,
>         return total > 0 ? total : 0;
>  }
>
> -static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
> +static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc,
> +                      unsigned long min_ttl)
>  {
>         bool need_aging;
>         long nr_to_scan;
> @@ -4349,14 +4351,22 @@ static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
>         DEFINE_MAX_SEQ(lruvec);
>         DEFINE_MIN_SEQ(lruvec);
>
> +       if (min_ttl) {
> +               int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
> +               unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
> +
> +               if (time_is_after_jiffies(birth + min_ttl))
> +                       return false;
> +       }
> +
>         mem_cgroup_calculate_protection(NULL, memcg);
>
>         if (mem_cgroup_below_min(memcg))
> -               return;
> +               return false;
>
>         nr_to_scan = get_nr_evictable(lruvec, max_seq, min_seq, swappiness, &need_aging);
>         if (!nr_to_scan)
> -               return;
> +               return false;
>
>         nr_to_scan >>= sc->priority;
>
> @@ -4365,11 +4375,18 @@ static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
>
>         if (nr_to_scan && need_aging && (!mem_cgroup_below_low(memcg) || sc->memcg_low_reclaim))
>                 try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false);
> +
> +       return true;
>  }
>
> +/* to protect the working set of the last N jiffies */
> +static unsigned long lru_gen_min_ttl __read_mostly;
> +
>  static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
>  {
>         struct mem_cgroup *memcg;
> +       bool success = false;
> +       unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
>
>         VM_BUG_ON(!current_is_kswapd());
>
> @@ -4395,12 +4412,29 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
>         do {
>                 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
>
> -               age_lruvec(lruvec, sc);
> +               if (age_lruvec(lruvec, sc, min_ttl))
> +                       success = true;
>
>                 cond_resched();
>         } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
>
>         current->reclaim_state->mm_walk = NULL;
> +
> +       /*
> +        * The main goal is to OOM kill if every generation from all memcgs is
> +        * younger than min_ttl. However, another theoretical possibility is all
> +        * memcgs are either below min or empty.
> +        */
> +       if (!success && mutex_trylock(&oom_lock)) {
> +               struct oom_control oc = {
> +                       .gfp_mask = sc->gfp_mask,
> +                       .order = sc->order,
> +               };
> +
> +               out_of_memory(&oc);
> +
> +               mutex_unlock(&oom_lock);
> +       }
>  }
>
>  /*
> @@ -5112,6 +5146,28 @@ static void lru_gen_change_state(bool enable)
>   *                          sysfs interface
>   ******************************************************************************/
>
> +static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
> +{
> +       return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
> +}
> +
> +static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
> +                            const char *buf, size_t len)
> +{
> +       unsigned int msecs;
> +
> +       if (kstrtouint(buf, 0, &msecs))
> +               return -EINVAL;
> +
> +       WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs));
> +
> +       return len;
> +}
> +
> +static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR(
> +       min_ttl_ms, 0644, show_min_ttl, store_min_ttl
> +);
> +
>  static ssize_t show_enable(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
>  {
>         unsigned int caps = 0;
> @@ -5160,6 +5216,7 @@ static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
>  );
>
>  static struct attribute *lru_gen_attrs[] = {
> +       &lru_gen_min_ttl_attr.attr,
>         &lru_gen_enabled_attr.attr,
>         NULL
>  };
> @@ -5175,12 +5232,16 @@ static struct attribute_group lru_gen_attr_group = {
>
>  void lru_gen_init_lruvec(struct lruvec *lruvec)
>  {
> +       int i;
>         int gen, type, zone;
>         struct lru_gen_struct *lrugen = &lruvec->lrugen;
>
>         lrugen->max_seq = MIN_NR_GENS + 1;
>         lrugen->enabled = lru_gen_enabled();
>
> +       for (i = 0; i <= MIN_NR_GENS + 1; i++)
> +               lrugen->timestamps[i] = jiffies;
> +
>         for_each_gen_type_zone(gen, type, zone)
>                 INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
>
> --
> 2.35.1.616.g0bdcbb4464-goog
>

Thanks
Barry
Yu Zhao March 22, 2022, 8:14 a.m. UTC | #2
On Tue, Mar 22, 2022 at 1:23 AM Barry Song <21cnbao@gmail.com> wrote:
>
> On Wed, Mar 9, 2022 at 3:48 PM Yu Zhao <yuzhao@google.com> wrote:
> >
> > Add /sys/kernel/mm/lru_gen/min_ttl_ms for thrashing prevention, as
> > requested by many desktop users [1].
> >
> > When set to value N, it prevents the working set of N milliseconds
> > from getting evicted. The OOM killer is triggered if this working set
> > cannot be kept in memory. Based on the average human detectable lag
> > (~100ms), N=1000 usually eliminates intolerable lags due to thrashing.
> > Larger values like N=3000 make lags less noticeable at the risk of
> > premature OOM kills.
> >
> > Compared with the size-based approach, e.g., [2], this time-based
> > approach has the following advantages:
> > 1. It is easier to configure because it is agnostic to applications
> >    and memory sizes.
> > 2. It is more reliable because it is directly wired to the OOM killer.
> >
>
> how are userspace oom daemons like android lmkd, systemd-oomd supposed
> to work with this time-based oom killer?
> only one of min_ttl_ms and userspace daemon should be enabled? or both
> should be enabled at the same time?

Generally we just need one. lmkd and oomd are more flexible, but 1)
they need customization, 2) not all distros have them, and 3) they might
get stuck in direct reclaim as well.

The last remark is not just a theoretical problem:
a) We had many servers under such heavy (global) memory pressure
that 200+ direct reclaimers on each CPU competed for resources and
userspace livelocked for 2 hours. Eventually hardware watchdogs kicked
in.
b) On Chromebooks we have something similar to lmkd, and we still
frequently observe crashes due to heavy memory pressure, meaning that
some Chrome tabs were stuck in direct reclaim for 120 seconds
(hung_task_timeout_secs=120).
diff mbox series

Patch

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 116c9237e401..f98f9ce50e67 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -403,6 +403,8 @@  struct lru_gen_struct {
 	unsigned long max_seq;
 	/* the eviction increments the oldest generation numbers */
 	unsigned long min_seq[ANON_AND_FILE];
+	/* the birth time of each generation in jiffies */
+	unsigned long timestamps[MAX_NR_GENS];
 	/* the multi-gen LRU lists */
 	struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
 	/* the sizes of the above lists */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 55cc7d6b018b..6aa083b8bb26 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4229,6 +4229,7 @@  static void inc_max_seq(struct lruvec *lruvec)
 	for (type = 0; type < ANON_AND_FILE; type++)
 		reset_ctrl_pos(lruvec, type, false);
 
+	WRITE_ONCE(lrugen->timestamps[next], jiffies);
 	/* make sure preceding modifications appear */
 	smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
 
@@ -4340,7 +4341,8 @@  static long get_nr_evictable(struct lruvec *lruvec, unsigned long max_seq,
 	return total > 0 ? total : 0;
 }
 
-static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc,
+		       unsigned long min_ttl)
 {
 	bool need_aging;
 	long nr_to_scan;
@@ -4349,14 +4351,22 @@  static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 	DEFINE_MAX_SEQ(lruvec);
 	DEFINE_MIN_SEQ(lruvec);
 
+	if (min_ttl) {
+		int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+		unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
+
+		if (time_is_after_jiffies(birth + min_ttl))
+			return false;
+	}
+
 	mem_cgroup_calculate_protection(NULL, memcg);
 
 	if (mem_cgroup_below_min(memcg))
-		return;
+		return false;
 
 	nr_to_scan = get_nr_evictable(lruvec, max_seq, min_seq, swappiness, &need_aging);
 	if (!nr_to_scan)
-		return;
+		return false;
 
 	nr_to_scan >>= sc->priority;
 
@@ -4365,11 +4375,18 @@  static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 
 	if (nr_to_scan && need_aging && (!mem_cgroup_below_low(memcg) || sc->memcg_low_reclaim))
 		try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false);
+
+	return true;
 }
 
+/* to protect the working set of the last N jiffies */
+static unsigned long lru_gen_min_ttl __read_mostly;
+
 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
 {
 	struct mem_cgroup *memcg;
+	bool success = false;
+	unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
 
 	VM_BUG_ON(!current_is_kswapd());
 
@@ -4395,12 +4412,29 @@  static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
 	do {
 		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
 
-		age_lruvec(lruvec, sc);
+		if (age_lruvec(lruvec, sc, min_ttl))
+			success = true;
 
 		cond_resched();
 	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
 
 	current->reclaim_state->mm_walk = NULL;
+
+	/*
+	 * The main goal is to OOM kill if every generation from all memcgs is
+	 * younger than min_ttl. However, another theoretical possibility is all
+	 * memcgs are either below min or empty.
+	 */
+	if (!success && mutex_trylock(&oom_lock)) {
+		struct oom_control oc = {
+			.gfp_mask = sc->gfp_mask,
+			.order = sc->order,
+		};
+
+		out_of_memory(&oc);
+
+		mutex_unlock(&oom_lock);
+	}
 }
 
 /*
@@ -5112,6 +5146,28 @@  static void lru_gen_change_state(bool enable)
  *                          sysfs interface
  ******************************************************************************/
 
+static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
+}
+
+static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
+			     const char *buf, size_t len)
+{
+	unsigned int msecs;
+
+	if (kstrtouint(buf, 0, &msecs))
+		return -EINVAL;
+
+	WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs));
+
+	return len;
+}
+
+static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR(
+	min_ttl_ms, 0644, show_min_ttl, store_min_ttl
+);
+
 static ssize_t show_enable(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
 {
 	unsigned int caps = 0;
@@ -5160,6 +5216,7 @@  static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
 );
 
 static struct attribute *lru_gen_attrs[] = {
+	&lru_gen_min_ttl_attr.attr,
 	&lru_gen_enabled_attr.attr,
 	NULL
 };
@@ -5175,12 +5232,16 @@  static struct attribute_group lru_gen_attr_group = {
 
 void lru_gen_init_lruvec(struct lruvec *lruvec)
 {
+	int i;
 	int gen, type, zone;
 	struct lru_gen_struct *lrugen = &lruvec->lrugen;
 
 	lrugen->max_seq = MIN_NR_GENS + 1;
 	lrugen->enabled = lru_gen_enabled();
 
+	for (i = 0; i <= MIN_NR_GENS + 1; i++)
+		lrugen->timestamps[i] = jiffies;
+
 	for_each_gen_type_zone(gen, type, zone)
 		INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);