[2/4] mm, oom: Avoid potential RCU stall at dump_tasks().
diff mbox series

Message ID 1558519686-16057-2-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp
State New
Headers show
Series
  • [1/4] mm, oom: Remove redundant OOM score normalization at select_bad_process().
Related show

Commit Message

Tetsuo Handa May 22, 2019, 10:08 a.m. UTC
dump_tasks() calls printk() on each userspace process under RCU which
might result in RCU stall warning. I proposed introducing timeout for
dump_tasks() operation, but Michal Hocko expects that dump_tasks() is
either print all or suppress all [1]. Therefore, this patch changes to
cache all candidates at oom_evaluate_task() and then print the cached
candidates at dump_tasks().

With this patch, dump_tasks() no longer need to call printk() under RCU.
Also, dump_tasks() no longer need to check oom_unkillable_task() by
traversing all userspace processes, for select_bad_process() already
traversed all possible candidates. Also, the OOM killer no longer need to
call put_task_struct() from atomic context, and we can simplify refcount
handling for __oom_kill_process().

This patch has a user-visible side effect that oom_kill_allocating_task
path implies oom_dump_tasks == 0, for oom_evaluate_task() is not called
via select_bad_process(). But since the purpose of enabling
oom_kill_allocating_task is to kill as quick as possible (i.e. tolerate
killing more than needed) whereas the purpose of dump_tasks() which might
take minutes is to understand how the OOM killer selected an OOM victim,
not printing candidates should be acceptable when the administrator asked
the OOM killer to kill current thread.

[1] https://lore.kernel.org/linux-mm/20180906115320.GS14951@dhcp22.suse.cz/

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/sched.h |  1 +
 mm/oom_kill.c         | 44 +++++++++++++++++++-------------------------
 2 files changed, 20 insertions(+), 25 deletions(-)

Patch
diff mbox series

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1183741..f1736bf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1180,6 +1180,7 @@  struct task_struct {
 #ifdef CONFIG_MMU
 	struct task_struct		*oom_reaper_list;
 #endif
+	struct list_head                oom_candidate_list;
 #ifdef CONFIG_VMAP_STACK
 	struct vm_struct		*stack_vm_area;
 #endif
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 7534046..00b594c 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -63,6 +63,7 @@ 
  * and mark_oom_victim
  */
 DEFINE_MUTEX(oom_lock);
+static LIST_HEAD(oom_candidate_list);
 
 #ifdef CONFIG_NUMA
 /**
@@ -333,6 +334,9 @@  static int oom_evaluate_task(struct task_struct *task, void *arg)
 		goto abort;
 	}
 
+	get_task_struct(task);
+	list_add_tail(&task->oom_candidate_list, &oom_candidate_list);
+
 	/*
 	 * If task is allocating a lot of memory and has been marked to be
 	 * killed first if it triggers an oom, then select it.
@@ -350,16 +354,11 @@  static int oom_evaluate_task(struct task_struct *task, void *arg)
 	if (points == oc->chosen_points && thread_group_leader(oc->chosen))
 		goto next;
 select:
-	if (oc->chosen)
-		put_task_struct(oc->chosen);
-	get_task_struct(task);
 	oc->chosen = task;
 	oc->chosen_points = points;
 next:
 	return 0;
 abort:
-	if (oc->chosen)
-		put_task_struct(oc->chosen);
 	oc->chosen = (void *)-1UL;
 	return 1;
 }
@@ -401,11 +400,8 @@  static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 
 	pr_info("Tasks state (memory values in pages):\n");
 	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
-	rcu_read_lock();
-	for_each_process(p) {
-		if (oom_unkillable_task(p, memcg, nodemask))
-			continue;
-
+	list_for_each_entry(p, &oom_candidate_list, oom_candidate_list) {
+		cond_resched();
 		task = find_lock_task_mm(p);
 		if (!task) {
 			/*
@@ -424,7 +420,6 @@  static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 			task->signal->oom_score_adj, task->comm);
 		task_unlock(task);
 	}
-	rcu_read_unlock();
 }
 
 static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
@@ -455,7 +450,7 @@  static void dump_header(struct oom_control *oc, struct task_struct *p)
 		if (is_dump_unreclaim_slabs())
 			dump_unreclaimable_slab();
 	}
-	if (sysctl_oom_dump_tasks)
+	if (sysctl_oom_dump_tasks && !list_empty(&oom_candidate_list))
 		dump_tasks(oc->memcg, oc->nodemask);
 	if (p)
 		dump_oom_summary(oc, p);
@@ -849,17 +844,11 @@  static void __oom_kill_process(struct task_struct *victim, const char *message)
 	struct mm_struct *mm;
 	bool can_oom_reap = true;
 
-	p = find_lock_task_mm(victim);
-	if (!p) {
-		put_task_struct(victim);
-		return;
-	} else if (victim != p) {
-		get_task_struct(p);
-		put_task_struct(victim);
-		victim = p;
-	}
-
 	/* Get a reference to safely compare mm after task_unlock(victim) */
+	victim = find_lock_task_mm(victim);
+	if (!victim)
+		return;
+	get_task_struct(victim);
 	mm = victim->mm;
 	mmgrab(mm);
 
@@ -931,7 +920,6 @@  static int oom_kill_memcg_member(struct task_struct *task, void *message)
 {
 	if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
 	    !is_global_init(task)) {
-		get_task_struct(task);
 		__oom_kill_process(task, message);
 	}
 	return 0;
@@ -954,7 +942,6 @@  static void oom_kill_process(struct oom_control *oc, const char *message)
 		mark_oom_victim(victim);
 		wake_oom_reaper(victim);
 		task_unlock(victim);
-		put_task_struct(victim);
 		return;
 	}
 	task_unlock(victim);
@@ -1077,7 +1064,6 @@  bool out_of_memory(struct oom_control *oc)
 	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
 	    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
 	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
-		get_task_struct(current);
 		oc->chosen = current;
 		oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
 		return true;
@@ -1099,6 +1085,14 @@  bool out_of_memory(struct oom_control *oc)
 	if (oc->chosen && oc->chosen != (void *)-1UL)
 		oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
 				 "Memory cgroup out of memory");
+	while (!list_empty(&oom_candidate_list)) {
+		struct task_struct *p = list_first_entry(&oom_candidate_list,
+							 struct task_struct,
+							 oom_candidate_list);
+
+		list_del(&p->oom_candidate_list);
+		put_task_struct(p);
+	}
 	return !!oc->chosen;
 }