diff mbox series

[v2,2/4] mm/oom: handle remote ooms

Message ID 20211110211951.3730787-3-almasrymina@google.com (mailing list archive)
State New
Headers show
Series [v2,1/4] mm/shmem: support deterministic charging of tmpfs | expand

Commit Message

Mina Almasry Nov. 10, 2021, 9:19 p.m. UTC
On remote ooms (OOMs due to remote charging), the oom-killer will attempt
to find a task to kill in the memcg under oom. If the oom-killer
is unable to find one, it should simply return ENOMEM to the
allocating process.

If we're in the pagefault path, where we're unable to return ENOMEM to the
allocating process, we kill the allocating process instead.

Signed-off-by: Mina Almasry <almasrymina@google.com>

Cc: Michal Hocko <mhocko@suse.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Greg Thelen <gthelen@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Hugh Dickins <hughd@google.com>
CC: Roman Gushchin <guro@fb.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: riel@surriel.com
Cc: linux-mm@kvack.org
Cc: linux-fsdevel@vger.kernel.org
Cc: cgroups@vger.kernel.org

---

Changes in v2:
- Moved the remote oom handling as Roman requested.
- Used mem_cgroup_from_task(current) instead of grabbing the memcg from
current->mm

---
 include/linux/memcontrol.h |  6 ++++++
 mm/memcontrol.c            | 29 +++++++++++++++++++++++++++++
 mm/oom_kill.c              | 22 ++++++++++++++++++++++
 3 files changed, 57 insertions(+)

--
2.34.0.rc0.344.g81b53c2807-goog

Comments

kernel test robot Nov. 11, 2021, 5:15 p.m. UTC | #1
Hi Mina,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on hnaz-mm/master]

url:    https://github.com/0day-ci/linux/commits/Mina-Almasry/mm-shmem-support-deterministic-charging-of-tmpfs/20211111-062122
base:   https://github.com/hnaz/linux-mm master
config: um-i386_defconfig (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce (this is a W=1 build):
        # https://github.com/0day-ci/linux/commit/452a4d110272eb39892e4be30526411c7a5cb2e3
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Mina-Almasry/mm-shmem-support-deterministic-charging-of-tmpfs/20211111-062122
        git checkout 452a4d110272eb39892e4be30526411c7a5cb2e3
        # save the attached .config to linux build tree
        mkdir build_dir
        make W=1 O=build_dir ARCH=um SUBARCH=i386 SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   mm/oom_kill.c: In function 'out_of_memory':
>> mm/oom_kill.c:1116:15: error: 'struct task_struct' has no member named 'in_user_fault'
    1116 |    if (current->in_user_fault &&
         |               ^~


vim +1116 mm/oom_kill.c

  1044	
  1045	/**
  1046	 * out_of_memory - kill the "best" process when we run out of memory
  1047	 * @oc: pointer to struct oom_control
  1048	 *
  1049	 * If we run out of memory, we have the choice between either
  1050	 * killing a random task (bad), letting the system crash (worse)
  1051	 * OR try to be smart about which process to kill. Note that we
  1052	 * don't have to be perfect here, we just have to be good.
  1053	 */
  1054	bool out_of_memory(struct oom_control *oc)
  1055	{
  1056		unsigned long freed = 0;
  1057	
  1058		if (oom_killer_disabled)
  1059			return false;
  1060	
  1061		if (!is_memcg_oom(oc)) {
  1062			blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
  1063			if (freed > 0)
  1064				/* Got some memory back in the last second. */
  1065				return true;
  1066		}
  1067	
  1068		/*
  1069		 * If current has a pending SIGKILL or is exiting, then automatically
  1070		 * select it.  The goal is to allow it to allocate so that it may
  1071		 * quickly exit and free its memory.
  1072		 */
  1073		if (task_will_free_mem(current)) {
  1074			mark_oom_victim(current);
  1075			wake_oom_reaper(current);
  1076			return true;
  1077		}
  1078	
  1079		/*
  1080		 * The OOM killer does not compensate for IO-less reclaim.
  1081		 * pagefault_out_of_memory lost its gfp context so we have to
  1082		 * make sure exclude 0 mask - all other users should have at least
  1083		 * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to
  1084		 * invoke the OOM killer even if it is a GFP_NOFS allocation.
  1085		 */
  1086		if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
  1087			return true;
  1088	
  1089		/*
  1090		 * Check if there were limitations on the allocation (only relevant for
  1091		 * NUMA and memcg) that may require different handling.
  1092		 */
  1093		oc->constraint = constrained_alloc(oc);
  1094		if (oc->constraint != CONSTRAINT_MEMORY_POLICY)
  1095			oc->nodemask = NULL;
  1096		check_panic_on_oom(oc);
  1097	
  1098		if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
  1099		    current->mm && !oom_unkillable_task(current) &&
  1100		    oom_cpuset_eligible(current, oc) &&
  1101		    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
  1102			get_task_struct(current);
  1103			oc->chosen = current;
  1104			oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
  1105			return true;
  1106		}
  1107	
  1108		select_bad_process(oc);
  1109		/* Found nothing?!?! */
  1110		if (!oc->chosen) {
  1111			if (is_remote_oom(oc->memcg)) {
  1112				/*
  1113				 * For remote ooms in userfaults, we have no choice but
  1114				 * to kill the allocating process.
  1115				 */
> 1116				if (current->in_user_fault &&
  1117				    !oom_unkillable_task(current)) {
  1118					get_task_struct(current);
  1119					oc->chosen = current;
  1120					oom_kill_process(
  1121						oc,
  1122						"Out of memory (Killing remote allocating task)");
  1123					return true;
  1124				}
  1125	
  1126				/*
  1127				 * For remote ooms in non-userfaults, simply return
  1128				 * ENOMEM to the caller.
  1129				 */
  1130				return false;
  1131			}
  1132	
  1133			dump_header(oc, NULL);
  1134			pr_warn("Out of memory and no killable processes...\n");
  1135			/*
  1136			 * If we got here due to an actual allocation at the
  1137			 * system level, we cannot survive this and will enter
  1138			 * an endless loop in the allocator. Bail out now.
  1139			 */
  1140			if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))
  1141				panic("System is deadlocked on memory\n");
  1142		}
  1143		if (oc->chosen && oc->chosen != (void *)-1UL)
  1144			oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
  1145					 "Memory cgroup out of memory");
  1146		return !!oc->chosen;
  1147	}
  1148	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
diff mbox series

Patch

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 866904afd3563..ae4686abd4d32 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -937,6 +937,7 @@  struct mem_cgroup *mem_cgroup_get_from_path(const char *path);
  * it.
  */
 int mem_cgroup_get_name_from_sb(struct super_block *sb, char *buf, size_t len);
+bool is_remote_oom(struct mem_cgroup *memcg_under_oom);

 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 		int zid, int nr_pages);
@@ -1270,6 +1271,11 @@  static inline int mem_cgroup_get_name_from_sb(struct super_block *sb, char *buf,
 	return 0;
 }

+static inline bool is_remote_oom(struct mem_cgroup *memcg_under_oom)
+{
+	return false;
+}
+
 static inline int mem_cgroup_swapin_charge_page(struct page *page,
 			struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b3d8f52a63d17..8019c396bfdd9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2664,6 +2664,35 @@  int mem_cgroup_get_name_from_sb(struct super_block *sb, char *buf, size_t len)
 	return ret < 0 ? ret : 0;
 }

+/*
+ * Returns true if current's memcg is neither memcg_under_oom nor one of its
+ * descendants, false otherwise. This is used by the oom-killer to detect
+ * ooms due to remote charging.
+ */
+bool is_remote_oom(struct mem_cgroup *memcg_under_oom)
+{
+	struct mem_cgroup *current_memcg;
+	bool is_remote_oom;
+
+	if (!memcg_under_oom)
+		return false;
+
+	rcu_read_lock();
+	current_memcg = mem_cgroup_from_task(current);
+	if (current_memcg && !css_tryget_online(&current_memcg->css))
+		current_memcg = NULL;
+	rcu_read_unlock();
+
+	if (!current_memcg)
+		return false;
+
+	is_remote_oom =
+		!mem_cgroup_is_descendant(current_memcg, memcg_under_oom);
+	css_put(&current_memcg->css);
+
+	return is_remote_oom;
+}
+
 /*
  * Set or clear (if @memcg is NULL) charge association from file system to
  * memcg.  If @memcg != NULL, then a css reference must be held by the caller to
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 0a7e16b16b8c3..0e0097a0aed45 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1108,6 +1108,28 @@  bool out_of_memory(struct oom_control *oc)
 	select_bad_process(oc);
 	/* Found nothing?!?! */
 	if (!oc->chosen) {
+		if (is_remote_oom(oc->memcg)) {
+			/*
+			 * For remote ooms in userfaults, we have no choice but
+			 * to kill the allocating process.
+			 */
+			if (current->in_user_fault &&
+			    !oom_unkillable_task(current)) {
+				get_task_struct(current);
+				oc->chosen = current;
+				oom_kill_process(
+					oc,
+					"Out of memory (Killing remote allocating task)");
+				return true;
+			}
+
+			/*
+			 * For remote ooms in non-userfaults, simply return
+			 * ENOMEM to the caller.
+			 */
+			return false;
+		}
+
 		dump_header(oc, NULL);
 		pr_warn("Out of memory and no killable processes...\n");
 		/*