[0/8] OOM killer/reaper changes for avoiding OOM lockup problem.
diff mbox

Message ID 201807100357.w6A3vv5o062894@www262.sakura.ne.jp
State New
Headers show

Commit Message

Tetsuo Handa July 10, 2018, 3:57 a.m. UTC
> On Fri 06-07-18 11:40:07, Tetsuo Handa wrote:
> > > > > > Of course, we don't have to remove the OOM reaper kernel thread.
> > > > > 
> > > > > The thing is that the current design uses the oom_reaper only as a
> > > > > backup to get situation unstuck. Once you move all that heavy lifting
> > > > > into the oom path directly then you will have to handle all sorts of
> > > > > issues. E.g. how do you handle that a random process hitting OOM path
> > > > > has to pay the full price to tear down multi TB process? This is a lot
> > > > > of time.
> > > > 
> > > > We can add a threshold to unmap_page_range() (for direct OOM reaping threads)
> > > > which aborts after given number of pages are reclaimed. There is no need to
> > > > reclaim all pages at once if the caller is doing memory allocations. 
> > > 
> > > Yes, there is no need to reclaim all pages. OOM is after freeing _some_
> > > memory after all. But that means further complications down the unmap
> > > path. I do not really see any reason for that.
> > 
> > "I do not see reason for that" cannot become a reason direct OOM reaping has to
> > reclaim all pages at once.
> 
> We are not going to polute deep mm guts for unlikely events like oom.
> 
As far as I tested, below approach does not pollute deep mm guts. It should
achieve what David wants to do, without introducing user-visible tunable
interface.

David, can you try these patches?


From e92d8095d4e6fca1816077e3ad7eaa4c8d2c8825 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Tue, 10 Jul 2018 09:58:05 +0900
Subject: [PATCH 1/2] mm,oom: Check pending victims earlier in out_of_memory().

The "mm, oom: cgroup-aware OOM killer" patchset is trying to introduce
INFLIGHT_VICTIM in order to replace open-coded ((void *)-1UL). But
(regarding CONFIG_MMU=y case) we have a list of inflight OOM victim
threads which are connected to oom_reaper_list. Thus we can check
whether there are inflight OOM victims before starting process/memcg
list traversal. Since it is likely that only few threads are linked to
oom_reaper_list, checking all victims' OOM domain will not matter.

Thus, check whether there are inflight OOM victims before starting
process/memcg list traversal and eliminate the "abort" path.

Note that this patch could temporarily regress CONFIG_MMU=n kernels
because this patch selects same victims rather than waits for victims
if CONFIG_MMU=n. This will be fixed by the next patch.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Tejun Heo <tj@kernel.org>
---
 include/linux/memcontrol.h |   9 ++--
 include/linux/sched.h      |   2 +-
 mm/memcontrol.c            |  18 ++-----
 mm/oom_kill.c              | 130 +++++++++++++++++++++++++--------------------
 4 files changed, 81 insertions(+), 78 deletions(-)

Patch
diff mbox

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6c6fb11..a82360a 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -382,8 +382,8 @@  struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
 				   struct mem_cgroup *,
 				   struct mem_cgroup_reclaim_cookie *);
 void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
-int mem_cgroup_scan_tasks(struct mem_cgroup *,
-			  int (*)(struct task_struct *, void *), void *);
+void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
+			   void (*fn)(struct task_struct *, void *), void *arg);
 
 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
 {
@@ -850,10 +850,9 @@  static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
 {
 }
 
-static inline int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
-		int (*fn)(struct task_struct *, void *), void *arg)
+static inline void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
+		void (*fn)(struct task_struct *, void *), void *arg)
 {
-	return 0;
 }
 
 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 43731fe..3214ccf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1163,7 +1163,7 @@  struct task_struct {
 #endif
 	int				pagefault_disabled;
 #ifdef CONFIG_MMU
-	struct task_struct		*oom_reaper_list;
+	struct list_head		oom_victim_list;
 #endif
 #ifdef CONFIG_VMAP_STACK
 	struct vm_struct		*stack_vm_area;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e6f0d5e..c8a75c8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -884,17 +884,14 @@  static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
  * @arg: argument passed to @fn
  *
  * This function iterates over tasks attached to @memcg or to any of its
- * descendants and calls @fn for each task. If @fn returns a non-zero
- * value, the function breaks the iteration loop and returns the value.
- * Otherwise, it will iterate over all tasks and return 0.
+ * descendants and calls @fn for each task.
  *
  * This function must not be called for the root memory cgroup.
  */
-int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
-			  int (*fn)(struct task_struct *, void *), void *arg)
+void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
+			   void (*fn)(struct task_struct *, void *), void *arg)
 {
 	struct mem_cgroup *iter;
-	int ret = 0;
 
 	BUG_ON(memcg == root_mem_cgroup);
 
@@ -903,15 +900,10 @@  int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
 		struct task_struct *task;
 
 		css_task_iter_start(&iter->css, 0, &it);
-		while (!ret && (task = css_task_iter_next(&it)))
-			ret = fn(task, arg);
+		while ((task = css_task_iter_next(&it)))
+			fn(task, arg);
 		css_task_iter_end(&it);
-		if (ret) {
-			mem_cgroup_iter_break(memcg, iter);
-			break;
-		}
 	}
-	return ret;
 }
 
 /**
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index cd6520f..14260a6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -304,25 +304,13 @@  static enum oom_constraint constrained_alloc(struct oom_control *oc)
 	return CONSTRAINT_NONE;
 }
 
-static int oom_evaluate_task(struct task_struct *task, void *arg)
+static void oom_evaluate_task(struct task_struct *task, void *arg)
 {
 	struct oom_control *oc = arg;
 	unsigned long points;
 
 	if (oom_unkillable_task(task, NULL, oc->nodemask))
-		goto next;
-
-	/*
-	 * This task already has access to memory reserves and is being killed.
-	 * Don't allow any other task to have access to the reserves unless
-	 * the task has MMF_OOM_SKIP because chances that it would release
-	 * any memory is quite low.
-	 */
-	if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
-		if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
-			goto next;
-		goto abort;
-	}
+		return;
 
 	/*
 	 * If task is allocating a lot of memory and has been marked to be
@@ -335,29 +323,22 @@  static int oom_evaluate_task(struct task_struct *task, void *arg)
 
 	points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
 	if (!points || points < oc->chosen_points)
-		goto next;
+		return;
 
 	/* Prefer thread group leaders for display purposes */
 	if (points == oc->chosen_points && thread_group_leader(oc->chosen))
-		goto next;
+		return;
 select:
 	if (oc->chosen)
 		put_task_struct(oc->chosen);
 	get_task_struct(task);
 	oc->chosen = task;
 	oc->chosen_points = points;
-next:
-	return 0;
-abort:
-	if (oc->chosen)
-		put_task_struct(oc->chosen);
-	oc->chosen = (void *)-1UL;
-	return 1;
 }
 
 /*
  * Simple selection loop. We choose the process with the highest number of
- * 'points'. In case scan was aborted, oc->chosen is set to -1.
+ * 'points'.
  */
 static void select_bad_process(struct oom_control *oc)
 {
@@ -368,8 +349,7 @@  static void select_bad_process(struct oom_control *oc)
 
 		rcu_read_lock();
 		for_each_process(p)
-			if (oom_evaluate_task(p, oc))
-				break;
+			oom_evaluate_task(p, oc);
 		rcu_read_unlock();
 	}
 
@@ -476,9 +456,24 @@  bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
  */
 static struct task_struct *oom_reaper_th;
 static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
-static struct task_struct *oom_reaper_list;
+static LIST_HEAD(oom_victim_list);
 static DEFINE_SPINLOCK(oom_reaper_lock);
 
+/*
+ * We have to make sure not to cause premature new oom victim selection.
+ *
+ * __alloc_pages_may_oom()     oom_reap_task_mm()/exit_mmap()
+ *   mutex_trylock(&oom_lock)
+ *   get_page_from_freelist(ALLOC_WMARK_HIGH) # fails
+ *                               unmap_page_range() # frees some memory
+ *                               set_bit(MMF_OOM_SKIP)
+ *   out_of_memory()
+ *     oom_has_pending_victims()
+ *       test_bit(MMF_OOM_SKIP) # selects new oom victim
+ *   mutex_unlock(&oom_lock)
+ *
+ * Therefore, the callers hold oom_lock when calling this function.
+ */
 void __oom_reap_task_mm(struct mm_struct *mm)
 {
 	struct vm_area_struct *vma;
@@ -523,20 +518,6 @@  static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 {
 	bool ret = true;
 
-	/*
-	 * We have to make sure to not race with the victim exit path
-	 * and cause premature new oom victim selection:
-	 * oom_reap_task_mm		exit_mm
-	 *   mmget_not_zero
-	 *				  mmput
-	 *				    atomic_dec_and_test
-	 *				  exit_oom_victim
-	 *				[...]
-	 *				out_of_memory
-	 *				  select_bad_process
-	 *				    # no TIF_MEMDIE task selects new victim
-	 *  unmap_page_range # frees some memory
-	 */
 	mutex_lock(&oom_lock);
 
 	if (!down_read_trylock(&mm->mmap_sem)) {
@@ -605,14 +586,16 @@  static void oom_reap_task(struct task_struct *tsk)
 	debug_show_all_locks();
 
 done:
-	tsk->oom_reaper_list = NULL;
-
 	/*
 	 * Hide this mm from OOM killer because it has been either reaped or
 	 * somebody can't call up_write(mmap_sem).
 	 */
 	set_bit(MMF_OOM_SKIP, &mm->flags);
 
+	spin_lock(&oom_reaper_lock);
+	list_del(&tsk->oom_victim_list);
+	spin_unlock(&oom_reaper_lock);
+
 	/* Drop a reference taken by wake_oom_reaper */
 	put_task_struct(tsk);
 }
@@ -622,12 +605,13 @@  static int oom_reaper(void *unused)
 	while (true) {
 		struct task_struct *tsk = NULL;
 
-		wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
+		wait_event_freezable(oom_reaper_wait,
+				     !list_empty(&oom_victim_list));
 		spin_lock(&oom_reaper_lock);
-		if (oom_reaper_list != NULL) {
-			tsk = oom_reaper_list;
-			oom_reaper_list = tsk->oom_reaper_list;
-		}
+		if (!list_empty(&oom_victim_list))
+			tsk = list_first_entry(&oom_victim_list,
+					       struct task_struct,
+					       oom_victim_list);
 		spin_unlock(&oom_reaper_lock);
 
 		if (tsk)
@@ -639,15 +623,11 @@  static int oom_reaper(void *unused)
 
 static void wake_oom_reaper(struct task_struct *tsk)
 {
-	/* tsk is already queued? */
-	if (tsk == oom_reaper_list || tsk->oom_reaper_list)
+	if (tsk->oom_victim_list.next)
 		return;
-
 	get_task_struct(tsk);
-
 	spin_lock(&oom_reaper_lock);
-	tsk->oom_reaper_list = oom_reaper_list;
-	oom_reaper_list = tsk;
+	list_add_tail(&tsk->oom_victim_list, &oom_victim_list);
 	spin_unlock(&oom_reaper_lock);
 	trace_wake_reaper(tsk->pid);
 	wake_up(&oom_reaper_wait);
@@ -1009,6 +989,34 @@  int unregister_oom_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(unregister_oom_notifier);
 
+static bool oom_has_pending_victims(struct oom_control *oc)
+{
+#ifdef CONFIG_MMU
+	struct task_struct *p;
+
+	if (is_sysrq_oom(oc))
+		return false;
+	/*
+	 * Since oom_reap_task_mm()/exit_mmap() will set MMF_OOM_SKIP, let's
+	 * wait for pending victims until MMF_OOM_SKIP is set.
+	 */
+	spin_lock(&oom_reaper_lock);
+	list_for_each_entry(p, &oom_victim_list, oom_victim_list)
+		if (!oom_unkillable_task(p, oc->memcg, oc->nodemask) &&
+		    !test_bit(MMF_OOM_SKIP, &p->signal->oom_mm->flags))
+			break;
+	spin_unlock(&oom_reaper_lock);
+	return p != NULL;
+#else
+	/*
+	 * Since nobody except oom_kill_process() sets MMF_OOM_SKIP, waiting
+	 * for pending victims until MMF_OOM_SKIP is set is useless. Therefore,
+	 * simply let the OOM killer select pending victims again.
+	 */
+	return false;
+#endif
+}
+
 /**
  * out_of_memory - kill the "best" process when we run out of memory
  * @oc: pointer to struct oom_control
@@ -1062,6 +1070,9 @@  bool out_of_memory(struct oom_control *oc)
 		oc->nodemask = NULL;
 	check_panic_on_oom(oc, constraint);
 
+	if (oom_has_pending_victims(oc))
+		return true;
+
 	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
 	    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
 	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
@@ -1073,14 +1084,15 @@  bool out_of_memory(struct oom_control *oc)
 
 	select_bad_process(oc);
 	/* Found nothing?!?! Either we hang forever, or we panic. */
-	if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
+	if (!oc->chosen) {
+		if (is_sysrq_oom(oc) || is_memcg_oom(oc))
+			return false;
 		dump_header(oc, NULL);
 		panic("Out of memory and no killable processes...\n");
 	}
-	if (oc->chosen && oc->chosen != (void *)-1UL)
-		oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
-				 "Memory cgroup out of memory");
-	return !!oc->chosen;
+	oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
+			 "Memory cgroup out of memory");
+	return true;
 }
 
 /*
-- 
1.8.3.1



From 9dd8a94651fdc43b2edf7b4de88d6271a12a0484 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Tue, 10 Jul 2018 12:19:13 +0900
Subject: [PATCH 2/2] mm,oom: Introduce direct OOM reaping.

We are about to start getting a lot of OOM victim processes by introducing
"mm, oom: cgroup-aware OOM killer" patchset. When there are multiple OOM
victims, we should try reclaiming memory in parallel rather than
sequentially wait until conditions to be able to reclaim memory are met.
Also, we should preferentially reclaim from OOM domains where currently
allocating threads belong to. Therefore, this patch gets rid of the OOM
reaper kernel thread and introduces direct OOM reaping.

Also, David Rientjes is complaining that memcg OOM events needlessly
select more OOM victims when the OOM reaper was unable to reclaim memory.
This is because exit_mmap() is setting MMF_OOM_SKIP before calling
free_pgtables(). Therefore, this patch also introduces OOM-badness-score
based hold off approach.

This patch changes the OOM killer to wait until either __mmput() completes
or OOM badness score did not decrease for 3 seconds. This patch changes the
meaning of MMF_OOM_SKIP changes from "start selecting next OOM victim" to
"stop calling oom_reap_mm() from direct OOM reaping". In order to allow
direct OOM reaping, counter variables which remember "the current OOM
badness score", "when OOM badness score was compared for the last time",
"how many times OOM badness score did not decrease" are added to
"struct task_struct".

Also, since direct OOM reaping does not need to wait for complete teardown
of an OOM victim (which could be consuming multiple TB memory), dircet OOM
reaping tries to reclaim only up to 128 pages. This is done by calling
unmap_page_range() with splitted address ranges.

Note that four tracepoints which are used for recording the OOM reaper is
removed because the OOM reaper kernel thread is removed and direct OOM
reaping can generate too many trace logs.

As a side note, the potential regression for CONFIG_MMU=n kernels is
fixed by doing the same things except oom_reap_mm(), for there is now
no difference between CONFIG_MMU=y kernels and CONFIG_MMU=n kernels
except that CONFIG_MMU=n kernels cannot implement oom_reap_mm(). But
CONFIG_MMU=n kernels can save memory for one kernel thread.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Tejun Heo <tj@kernel.org>
---
 include/linux/oom.h        |   3 +-
 include/linux/sched.h      |   5 +-
 include/trace/events/oom.h |  64 --------
 kernel/fork.c              |   2 +
 mm/mmap.c                  |  17 +--
 mm/oom_kill.c              | 363 ++++++++++++++++++++-------------------------
 6 files changed, 173 insertions(+), 281 deletions(-)

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 6adac11..abfc405 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -95,7 +95,7 @@  static inline int check_stable_address_space(struct mm_struct *mm)
 	return 0;
 }
 
-void __oom_reap_task_mm(struct mm_struct *mm);
+extern bool oom_reap_mm(struct mm_struct *mm, const bool exiting);
 
 extern unsigned long oom_badness(struct task_struct *p,
 		struct mem_cgroup *memcg, const nodemask_t *nodemask,
@@ -104,6 +104,7 @@  extern unsigned long oom_badness(struct task_struct *p,
 extern bool out_of_memory(struct oom_control *oc);
 
 extern void exit_oom_victim(void);
+extern void exit_oom_mm(struct mm_struct *mm);
 
 extern int register_oom_notifier(struct notifier_block *nb);
 extern int unregister_oom_notifier(struct notifier_block *nb);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3214ccf..be29949 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1162,9 +1162,10 @@  struct task_struct {
 	unsigned long			task_state_change;
 #endif
 	int				pagefault_disabled;
-#ifdef CONFIG_MMU
 	struct list_head		oom_victim_list;
-#endif
+	unsigned long			last_oom_compared;
+	unsigned long			last_oom_score;
+	unsigned char			oom_reap_stall_count;
 #ifdef CONFIG_VMAP_STACK
 	struct vm_struct		*stack_vm_area;
 #endif
diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h
index 26a11e4..5228fc2 100644
--- a/include/trace/events/oom.h
+++ b/include/trace/events/oom.h
@@ -87,70 +87,6 @@ 
 	TP_printk("pid=%d", __entry->pid)
 );
 
-TRACE_EVENT(wake_reaper,
-	TP_PROTO(int pid),
-
-	TP_ARGS(pid),
-
-	TP_STRUCT__entry(
-		__field(int, pid)
-	),
-
-	TP_fast_assign(
-		__entry->pid = pid;
-	),
-
-	TP_printk("pid=%d", __entry->pid)
-);
-
-TRACE_EVENT(start_task_reaping,
-	TP_PROTO(int pid),
-
-	TP_ARGS(pid),
-
-	TP_STRUCT__entry(
-		__field(int, pid)
-	),
-
-	TP_fast_assign(
-		__entry->pid = pid;
-	),
-
-	TP_printk("pid=%d", __entry->pid)
-);
-
-TRACE_EVENT(finish_task_reaping,
-	TP_PROTO(int pid),
-
-	TP_ARGS(pid),
-
-	TP_STRUCT__entry(
-		__field(int, pid)
-	),
-
-	TP_fast_assign(
-		__entry->pid = pid;
-	),
-
-	TP_printk("pid=%d", __entry->pid)
-);
-
-TRACE_EVENT(skip_task_reaping,
-	TP_PROTO(int pid),
-
-	TP_ARGS(pid),
-
-	TP_STRUCT__entry(
-		__field(int, pid)
-	),
-
-	TP_fast_assign(
-		__entry->pid = pid;
-	),
-
-	TP_printk("pid=%d", __entry->pid)
-);
-
 #ifdef CONFIG_COMPACTION
 TRACE_EVENT(compact_retry,
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 9440d61..63a0bc6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -977,6 +977,8 @@  static inline void __mmput(struct mm_struct *mm)
 	}
 	if (mm->binfmt)
 		module_put(mm->binfmt->module);
+	if (unlikely(mm_is_oom_victim(mm)))
+		exit_oom_mm(mm);
 	mmdrop(mm);
 }
 
diff --git a/mm/mmap.c b/mm/mmap.c
index d1eb87e..b0bf9c9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3059,24 +3059,19 @@  void exit_mmap(struct mm_struct *mm)
 	if (unlikely(mm_is_oom_victim(mm))) {
 		/*
 		 * Manually reap the mm to free as much memory as possible.
-		 * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard
-		 * this mm from further consideration.  Taking mm->mmap_sem for
-		 * write after setting MMF_OOM_SKIP will guarantee that the oom
-		 * reaper will not run on this mm again after mmap_sem is
-		 * dropped.
+		 * Then, tell oom_has_pending_victims() to stop calling
+		 * oom_reap_mm(), by taking mm->mmap_sem for write after
+		 * setting MMF_OOM_SKIP.
 		 *
 		 * Nothing can be holding mm->mmap_sem here and the above call
 		 * to mmu_notifier_release(mm) ensures mmu notifier callbacks in
-		 * __oom_reap_task_mm() will not block.
+		 * oom_reap_mm() will not block.
 		 *
 		 * This needs to be done before calling munlock_vma_pages_all(),
-		 * which clears VM_LOCKED, otherwise the oom reaper cannot
+		 * which clears VM_LOCKED, otherwise oom_reap_mm() cannot
 		 * reliably test it.
 		 */
-		mutex_lock(&oom_lock);
-		__oom_reap_task_mm(mm);
-		mutex_unlock(&oom_lock);
-
+		oom_reap_mm(mm, true);
 		set_bit(MMF_OOM_SKIP, &mm->flags);
 		down_write(&mm->mmap_sem);
 		up_write(&mm->mmap_sem);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 14260a6..11ea43f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -49,6 +49,12 @@ 
 #define CREATE_TRACE_POINTS
 #include <trace/events/oom.h>
 
+static inline unsigned long oom_victim_mm_score(struct mm_struct *mm)
+{
+	return get_mm_rss(mm) + get_mm_counter(mm, MM_SWAPENTS) +
+		mm_pgtables_bytes(mm) / PAGE_SIZE;
+}
+
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks = 1;
@@ -198,6 +204,9 @@  unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	long points;
 	long adj;
 
+	if (tsk_is_oom_victim(p))
+		return 0;
+
 	if (oom_unkillable_task(p, memcg, nodemask))
 		return 0;
 
@@ -207,13 +216,10 @@  unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 
 	/*
 	 * Do not even consider tasks which are explicitly marked oom
-	 * unkillable or have been already oom reaped or the are in
-	 * the middle of vfork
+	 * unkillable or are in the middle of vfork
 	 */
 	adj = (long)p->signal->oom_score_adj;
-	if (adj == OOM_SCORE_ADJ_MIN ||
-			test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
-			in_vfork(p)) {
+	if (adj == OOM_SCORE_ADJ_MIN || in_vfork(p)) {
 		task_unlock(p);
 		return 0;
 	}
@@ -222,8 +228,7 @@  unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	 * The baseline for the badness score is the proportion of RAM that each
 	 * task's rss, pagetable and swap space use.
 	 */
-	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
-		mm_pgtables_bytes(p->mm) / PAGE_SIZE;
+	points = oom_victim_mm_score(p->mm);
 	task_unlock(p);
 
 	/* Normalize to oom_score_adj units */
@@ -431,6 +436,29 @@  static void dump_header(struct oom_control *oc, struct task_struct *p)
 
 #define K(x) ((x) << (PAGE_SHIFT-10))
 
+static bool victim_mm_stalling(struct task_struct *p, struct mm_struct *mm)
+{
+	unsigned long score;
+
+	if (time_before(jiffies, p->last_oom_compared + HZ))
+		return false;
+	score = oom_victim_mm_score(mm);
+	if (score < p->last_oom_score)
+		p->oom_reap_stall_count = 0;
+	else
+		p->oom_reap_stall_count++;
+	p->last_oom_score = oom_victim_mm_score(mm);
+	p->last_oom_compared = jiffies;
+	if (p->oom_reap_stall_count < 3)
+		return false;
+	pr_info("Gave up waiting for process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
+		task_pid_nr(p), p->comm, K(mm->total_vm),
+		K(get_mm_counter(mm, MM_ANONPAGES)),
+		K(get_mm_counter(mm, MM_FILEPAGES)),
+		K(get_mm_counter(mm, MM_SHMEMPAGES)));
+	return true;
+}
+
 /*
  * task->mm can be NULL if the task is the exited group leader.  So to
  * determine whether the task is using a particular mm, we examine all the
@@ -449,34 +477,59 @@  bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
 	return false;
 }
 
-#ifdef CONFIG_MMU
-/*
- * OOM Reaper kernel thread which tries to reap the memory used by the OOM
- * victim (if that is possible) to help the OOM killer to move on.
- */
-static struct task_struct *oom_reaper_th;
-static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
 static LIST_HEAD(oom_victim_list);
-static DEFINE_SPINLOCK(oom_reaper_lock);
 
-/*
- * We have to make sure not to cause premature new oom victim selection.
- *
- * __alloc_pages_may_oom()     oom_reap_task_mm()/exit_mmap()
- *   mutex_trylock(&oom_lock)
- *   get_page_from_freelist(ALLOC_WMARK_HIGH) # fails
- *                               unmap_page_range() # frees some memory
- *                               set_bit(MMF_OOM_SKIP)
- *   out_of_memory()
- *     oom_has_pending_victims()
- *       test_bit(MMF_OOM_SKIP) # selects new oom victim
- *   mutex_unlock(&oom_lock)
- *
- * Therefore, the callers hold oom_lock when calling this function.
- */
-void __oom_reap_task_mm(struct mm_struct *mm)
+#ifdef CONFIG_MMU
+static bool __oom_reap_vma(struct mm_struct *mm, struct vm_area_struct *vma,
+			   const bool exiting)
+{
+	struct mmu_gather tlb;
+	unsigned long start = vma->vm_start;
+	const unsigned long end = vma->vm_end;
+	unsigned long score;
+	unsigned long len;
+
+	if (exiting) {
+		/*
+		 * Try to reclaim all memory, for current thread is in
+		 * exit_mmap() without oom_lock held.
+		 */
+		tlb_gather_mmu(&tlb, mm, start, end);
+		mmu_notifier_invalidate_range_start(mm, start, end);
+		unmap_page_range(&tlb, vma, start, end, NULL);
+		mmu_notifier_invalidate_range_end(mm, start, end);
+		tlb_finish_mmu(&tlb, start, end);
+		return false;
+	}
+	/*
+	 * Try to reclaim only small amount of memory, for current thread is
+	 * doing memory allocation request with oom_lock held.
+	 */
+	score = oom_victim_mm_score(mm);
+	while (start < end) {
+		if (end - start > 128 * PAGE_SIZE)
+			len = start + 128 * PAGE_SIZE;
+		else
+			len = end;
+		tlb_gather_mmu(&tlb, mm, start, len);
+		mmu_notifier_invalidate_range_start(mm, start, len);
+		unmap_page_range(&tlb, vma, start, len, NULL);
+		mmu_notifier_invalidate_range_end(mm, start, len);
+		tlb_finish_mmu(&tlb, start, len);
+		start += len;
+		len = oom_victim_mm_score(mm);
+		if (len < score) {
+			pr_info("Direct OOM reaping = %lu\n", score - len);
+			return true;
+		}
+	}
+	return false;
+}
+
+bool oom_reap_mm(struct mm_struct *mm, const bool exiting)
 {
 	struct vm_area_struct *vma;
+	bool ret = false;
 
 	/*
 	 * Tell all users of get_user/copy_from_user etc... that the content
@@ -501,149 +554,14 @@  void __oom_reap_task_mm(struct mm_struct *mm)
 		 * count elevated without a good reason.
 		 */
 		if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
-			const unsigned long start = vma->vm_start;
-			const unsigned long end = vma->vm_end;
-			struct mmu_gather tlb;
-
-			tlb_gather_mmu(&tlb, mm, start, end);
-			mmu_notifier_invalidate_range_start(mm, start, end);
-			unmap_page_range(&tlb, vma, start, end, NULL);
-			mmu_notifier_invalidate_range_end(mm, start, end);
-			tlb_finish_mmu(&tlb, start, end);
+			ret = __oom_reap_vma(mm, vma, exiting);
+			if (ret)
+				break;
 		}
 	}
-}
-
-static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
-{
-	bool ret = true;
-
-	mutex_lock(&oom_lock);
-
-	if (!down_read_trylock(&mm->mmap_sem)) {
-		ret = false;
-		trace_skip_task_reaping(tsk->pid);
-		goto unlock_oom;
-	}
-
-	/*
-	 * If the mm has invalidate_{start,end}() notifiers that could block,
-	 * sleep to give the oom victim some more time.
-	 * TODO: we really want to get rid of this ugly hack and make sure that
-	 * notifiers cannot block for unbounded amount of time
-	 */
-	if (mm_has_blockable_invalidate_notifiers(mm)) {
-		up_read(&mm->mmap_sem);
-		schedule_timeout_idle(HZ);
-		goto unlock_oom;
-	}
-
-	/*
-	 * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
-	 * work on the mm anymore. The check for MMF_OOM_SKIP must run
-	 * under mmap_sem for reading because it serializes against the
-	 * down_write();up_write() cycle in exit_mmap().
-	 */
-	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
-		up_read(&mm->mmap_sem);
-		trace_skip_task_reaping(tsk->pid);
-		goto unlock_oom;
-	}
-
-	trace_start_task_reaping(tsk->pid);
-
-	__oom_reap_task_mm(mm);
-
-	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
-			task_pid_nr(tsk), tsk->comm,
-			K(get_mm_counter(mm, MM_ANONPAGES)),
-			K(get_mm_counter(mm, MM_FILEPAGES)),
-			K(get_mm_counter(mm, MM_SHMEMPAGES)));
-	up_read(&mm->mmap_sem);
-
-	trace_finish_task_reaping(tsk->pid);
-unlock_oom:
-	mutex_unlock(&oom_lock);
 	return ret;
 }
-
-#define MAX_OOM_REAP_RETRIES 10
-static void oom_reap_task(struct task_struct *tsk)
-{
-	int attempts = 0;
-	struct mm_struct *mm = tsk->signal->oom_mm;
-
-	/* Retry the down_read_trylock(mmap_sem) a few times */
-	while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
-		schedule_timeout_idle(HZ/10);
-
-	if (attempts <= MAX_OOM_REAP_RETRIES ||
-	    test_bit(MMF_OOM_SKIP, &mm->flags))
-		goto done;
-
-	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
-		task_pid_nr(tsk), tsk->comm);
-	debug_show_all_locks();
-
-done:
-	/*
-	 * Hide this mm from OOM killer because it has been either reaped or
-	 * somebody can't call up_write(mmap_sem).
-	 */
-	set_bit(MMF_OOM_SKIP, &mm->flags);
-
-	spin_lock(&oom_reaper_lock);
-	list_del(&tsk->oom_victim_list);
-	spin_unlock(&oom_reaper_lock);
-
-	/* Drop a reference taken by wake_oom_reaper */
-	put_task_struct(tsk);
-}
-
-static int oom_reaper(void *unused)
-{
-	while (true) {
-		struct task_struct *tsk = NULL;
-
-		wait_event_freezable(oom_reaper_wait,
-				     !list_empty(&oom_victim_list));
-		spin_lock(&oom_reaper_lock);
-		if (!list_empty(&oom_victim_list))
-			tsk = list_first_entry(&oom_victim_list,
-					       struct task_struct,
-					       oom_victim_list);
-		spin_unlock(&oom_reaper_lock);
-
-		if (tsk)
-			oom_reap_task(tsk);
-	}
-
-	return 0;
-}
-
-static void wake_oom_reaper(struct task_struct *tsk)
-{
-	if (tsk->oom_victim_list.next)
-		return;
-	get_task_struct(tsk);
-	spin_lock(&oom_reaper_lock);
-	list_add_tail(&tsk->oom_victim_list, &oom_victim_list);
-	spin_unlock(&oom_reaper_lock);
-	trace_wake_reaper(tsk->pid);
-	wake_up(&oom_reaper_wait);
-}
-
-static int __init oom_init(void)
-{
-	oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
-	return 0;
-}
-subsys_initcall(oom_init)
-#else
-static inline void wake_oom_reaper(struct task_struct *tsk)
-{
-}
-#endif /* CONFIG_MMU */
+#endif
 
 /**
  * mark_oom_victim - mark the given task as OOM victim
@@ -668,6 +586,13 @@  static void mark_oom_victim(struct task_struct *tsk)
 	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
 		mmgrab(tsk->signal->oom_mm);
 		set_bit(MMF_OOM_VICTIM, &mm->flags);
+		/* Save current values for victim_mm_stalling() test. */
+		tsk->oom_reap_stall_count = 0;
+		tsk->last_oom_score = oom_victim_mm_score(mm);
+		tsk->last_oom_compared = jiffies;
+		get_task_struct(tsk);
+		lockdep_assert_held(&oom_lock);
+		list_add_tail(&tsk->oom_victim_list, &oom_victim_list);
 	}
 
 	/*
@@ -786,10 +711,11 @@  static bool task_will_free_mem(struct task_struct *task)
 		return false;
 
 	/*
-	 * This task has already been drained by the oom reaper so there are
-	 * only small chances it will free some more
+	 * If memory reserves granted to this task was not sufficient, allow
+	 * killing more processes after oom_has_pending_victims() completed
+	 * reaping this mm.
 	 */
-	if (test_bit(MMF_OOM_SKIP, &mm->flags))
+	if (tsk_is_oom_victim(task))
 		return false;
 
 	if (atomic_read(&mm->mm_users) <= 1)
@@ -826,7 +752,6 @@  static void oom_kill_process(struct oom_control *oc, const char *message)
 	unsigned int victim_points = 0;
 	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
 					      DEFAULT_RATELIMIT_BURST);
-	bool can_oom_reap = true;
 
 	/*
 	 * If the task is already exiting, don't alarm the sysadmin or kill
@@ -836,7 +761,6 @@  static void oom_kill_process(struct oom_control *oc, const char *message)
 	task_lock(p);
 	if (task_will_free_mem(p)) {
 		mark_oom_victim(p);
-		wake_oom_reaper(p);
 		task_unlock(p);
 		put_task_struct(p);
 		return;
@@ -925,8 +849,11 @@  static void oom_kill_process(struct oom_control *oc, const char *message)
 		if (same_thread_group(p, victim))
 			continue;
 		if (is_global_init(p)) {
-			can_oom_reap = false;
 			set_bit(MMF_OOM_SKIP, &mm->flags);
+			/*
+			 * Since oom_lock is held, unlike exit_mmap(), there is
+			 * no need to take mm->mmap_sem for write here.
+			 */
 			pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
 					task_pid_nr(victim), victim->comm,
 					task_pid_nr(p), p->comm);
@@ -942,9 +869,6 @@  static void oom_kill_process(struct oom_control *oc, const char *message)
 	}
 	rcu_read_unlock();
 
-	if (can_oom_reap)
-		wake_oom_reaper(victim);
-
 	mmdrop(mm);
 	put_task_struct(victim);
 }
@@ -989,32 +913,67 @@  int unregister_oom_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(unregister_oom_notifier);
 
+void exit_oom_mm(struct mm_struct *mm)
+{
+	struct task_struct *p, *tmp;
+
+	mutex_lock(&oom_lock);
+	list_for_each_entry_safe(p, tmp, &oom_victim_list, oom_victim_list) {
+		if (mm != p->signal->oom_mm)
+			continue;
+		list_del(&p->oom_victim_list);
+		/* Drop a reference taken by mark_oom_victim(). */
+		put_task_struct(p);
+	}
+	mutex_unlock(&oom_lock);
+}
+
 static bool oom_has_pending_victims(struct oom_control *oc)
 {
+	struct task_struct *p, *tmp;
+	bool ret = false;
+	bool gaveup = false;
+
+	lockdep_assert_held(&oom_lock);
+	list_for_each_entry_safe(p, tmp, &oom_victim_list, oom_victim_list) {
+		struct mm_struct *mm = p->signal->oom_mm;
+
+		/* Skip OOM victims which current thread cannot select. */
+		if (oom_unkillable_task(p, oc->memcg, oc->nodemask))
+			continue;
+		ret = true;
 #ifdef CONFIG_MMU
-	struct task_struct *p;
+		/*
+		 * We need to hold mmap_sem for read, in order to safely test
+		 * MMF_OOM_SKIP flag and blockable invalidate notifiers.
+		 */
+		if (down_read_trylock(&mm->mmap_sem)) {
+			bool success = false;
 
-	if (is_sysrq_oom(oc))
-		return false;
-	/*
-	 * Since oom_reap_task_mm()/exit_mmap() will set MMF_OOM_SKIP, let's
-	 * wait for pending victims until MMF_OOM_SKIP is set.
-	 */
-	spin_lock(&oom_reaper_lock);
-	list_for_each_entry(p, &oom_victim_list, oom_victim_list)
-		if (!oom_unkillable_task(p, oc->memcg, oc->nodemask) &&
-		    !test_bit(MMF_OOM_SKIP, &p->signal->oom_mm->flags))
-			break;
-	spin_unlock(&oom_reaper_lock);
-	return p != NULL;
-#else
-	/*
-	 * Since nobody except oom_kill_process() sets MMF_OOM_SKIP, waiting
-	 * for pending victims until MMF_OOM_SKIP is set is useless. Therefore,
-	 * simply let the OOM killer select pending victims again.
-	 */
-	return false;
+			if (!test_bit(MMF_OOM_SKIP, &mm->flags) &&
+			    !mm_has_blockable_invalidate_notifiers(mm))
+				success = oom_reap_mm(mm, false);
+			up_read(&mm->mmap_sem);
+			/*
+			 * Since we are doing allocation request, retry as soon
+			 * as current thread succeeded reclaiming some pages.
+			 */
+			if (success)
+				break;
+		}
 #endif
+		/* Forget if this mm didn't complete __mmput() for too long. */
+		if (!victim_mm_stalling(p, mm))
+			continue;
+		gaveup = true;
+		list_del(&p->oom_victim_list);
+		/* Drop a reference taken by mark_oom_victim(). */
+		put_task_struct(p);
+	}
+	if (gaveup)
+		debug_show_all_locks();
+
+	return ret && !is_sysrq_oom(oc);
 }
 
 /**
@@ -1048,7 +1007,6 @@  bool out_of_memory(struct oom_control *oc)
 	 */
 	if (task_will_free_mem(current)) {
 		mark_oom_victim(current);
-		wake_oom_reaper(current);
 		return true;
 	}
 
@@ -1074,8 +1032,7 @@  bool out_of_memory(struct oom_control *oc)
 		return true;
 
 	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
-	    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
-	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
+	    oom_badness(current, NULL, oc->nodemask, oc->totalpages) > 0) {
 		get_task_struct(current);
 		oc->chosen = current;
 		oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");