mm, oom: avoid printk() iteration under RCU
diff mbox series

Message ID 1563360901-8277-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp
State New
Headers show
Series
  • mm, oom: avoid printk() iteration under RCU
Related show

Commit Message

Tetsuo Handa July 17, 2019, 10:55 a.m. UTC
Currently dump_tasks() might call printk() many thousands of times under
RCU, which might take many minutes on slow consoles. Therefore, split
dump_tasks() into three stages: take a snapshot of possible OOM victim
candidates under RCU, dump the snapshot from reschedulable context, and
destroy the snapshot.

In a future patch, the first stage will be moved to select_bad_process()
and the third stage will be moved to after oom_kill_process(), which will
simplify refcount handling.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Roman Gushchin <guro@fb.com>
---
 include/linux/sched.h |  1 +
 mm/oom_kill.c         | 67 +++++++++++++++++++++++++--------------------------
 2 files changed, 34 insertions(+), 34 deletions(-)

Comments

Shakeel Butt July 18, 2019, 12:31 a.m. UTC | #1
On Wed, Jul 17, 2019 at 3:55 AM Tetsuo Handa
<penguin-kernel@i-love.sakura.ne.jp> wrote:
>
> Currently dump_tasks() might call printk() for many thousands times under
> RCU, which might take many minutes for slow consoles. Therefore, split
> dump_tasks() into three stages; take a snapshot of possible OOM victim
> candidates under RCU, dump the snapshot from reschedulable context, and
> destroy the snapshot.
>
> In a future patch, the first stage would be moved to select_bad_process()
> and the third stage would be moved to after oom_kill_process(), and will
> simplify refcount handling.
>
> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> Cc: Shakeel Butt <shakeelb@google.com>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: Roman Gushchin <guro@fb.com>
> ---
>  include/linux/sched.h |  1 +
>  mm/oom_kill.c         | 67 +++++++++++++++++++++++++--------------------------
>  2 files changed, 34 insertions(+), 34 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 8dc1811..cb6696b 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1246,6 +1246,7 @@ struct task_struct {
>  #ifdef CONFIG_MMU
>         struct task_struct              *oom_reaper_list;
>  #endif
> +       struct list_head                oom_victim_list;

Shouldn't there be INIT_LIST_HEAD(&tsk->oom_victim_list) somewhere?

>  #ifdef CONFIG_VMAP_STACK
>         struct vm_struct                *stack_vm_area;
>  #endif
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index eda2e2a..bd22ca0 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -377,36 +377,13 @@ static void select_bad_process(struct oom_control *oc)
>         }
>  }
>
> -static int dump_task(struct task_struct *p, void *arg)
> -{
> -       struct oom_control *oc = arg;
> -       struct task_struct *task;
> -
> -       if (oom_unkillable_task(p))
> -               return 0;
> -
> -       /* p may not have freeable memory in nodemask */
> -       if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
> -               return 0;
>
> -       task = find_lock_task_mm(p);
> -       if (!task) {
> -               /*
> -                * This is a kthread or all of p's threads have already
> -                * detached their mm's.  There's no need to report
> -                * them; they can't be oom killed anyway.
> -                */
> -               return 0;
> +static int add_candidate_task(struct task_struct *p, void *arg)
> +{
> +       if (!oom_unkillable_task(p)) {
> +               get_task_struct(p);
> +               list_add_tail(&p->oom_victim_list, (struct list_head *) arg);
>         }
> -
> -       pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
> -               task->pid, from_kuid(&init_user_ns, task_uid(task)),
> -               task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
> -               mm_pgtables_bytes(task->mm),
> -               get_mm_counter(task->mm, MM_SWAPENTS),
> -               task->signal->oom_score_adj, task->comm);
> -       task_unlock(task);
> -
>         return 0;
>  }
>
> @@ -422,19 +399,41 @@ static int dump_task(struct task_struct *p, void *arg)
>   */
>  static void dump_tasks(struct oom_control *oc)
>  {
> -       pr_info("Tasks state (memory values in pages):\n");
> -       pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
> +       static LIST_HEAD(list);
> +       struct task_struct *p;
> +       struct task_struct *t;
>
>         if (is_memcg_oom(oc))
> -               mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
> +               mem_cgroup_scan_tasks(oc->memcg, add_candidate_task, &list);
>         else {
> -               struct task_struct *p;
> -
>                 rcu_read_lock();
>                 for_each_process(p)
> -                       dump_task(p, oc);
> +                       add_candidate_task(p, &list);
>                 rcu_read_unlock();
>         }
> +       pr_info("Tasks state (memory values in pages):\n");
> +       pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
> +       list_for_each_entry(p, &list, oom_victim_list) {
> +               cond_resched();
> +               /* p may not have freeable memory in nodemask */
> +               if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
> +                       continue;
> +               /* All of p's threads might have already detached their mm's. */
> +               t = find_lock_task_mm(p);
> +               if (!t)
> +                       continue;
> +               pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
> +                       t->pid, from_kuid(&init_user_ns, task_uid(t)),
> +                       t->tgid, t->mm->total_vm, get_mm_rss(t->mm),
> +                       mm_pgtables_bytes(t->mm),
> +                       get_mm_counter(t->mm, MM_SWAPENTS),
> +                       t->signal->oom_score_adj, t->comm);
> +               task_unlock(t);
> +       }
> +       list_for_each_entry_safe(p, t, &list, oom_victim_list) {
> +               list_del(&p->oom_victim_list);
> +               put_task_struct(p);
> +       }
>  }
>
>  static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
> --
> 1.8.3.1
>
Michal Hocko July 18, 2019, 8:30 a.m. UTC | #2
On Wed 17-07-19 19:55:01, Tetsuo Handa wrote:
> Currently dump_tasks() might call printk() for many thousands times under
> RCU, which might take many minutes for slow consoles.

Is it even wise to enable dumping tasks on systems with thousands of
tasks and slow consoles? I mean, you still have to call a slow printk
that many times. So why do we actually care? Because of RCU stall
warnings?
Tetsuo Handa July 18, 2019, 10:22 a.m. UTC | #3
On 2019/07/18 9:31, Shakeel Butt wrote:
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index 8dc1811..cb6696b 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -1246,6 +1246,7 @@ struct task_struct {
>>  #ifdef CONFIG_MMU
>>         struct task_struct              *oom_reaper_list;
>>  #endif
>> +       struct list_head                oom_victim_list;
> 
> Shouldn't there be INIT_LIST_HEAD(&tsk->oom_victim_list) somewhere?

Yes if we need to use list_empty(&tsk->oom_victim_list) test.
This patch does not use such test; tsk->oom_victim_list is initialized
by list_add_tail() inside the OOM killer.

> 
>>  #ifdef CONFIG_VMAP_STACK
>>         struct vm_struct                *stack_vm_area;
>>  #endif
Tetsuo Handa July 18, 2019, 1:50 p.m. UTC | #4
On 2019/07/18 17:30, Michal Hocko wrote:
> On Wed 17-07-19 19:55:01, Tetsuo Handa wrote:
>> Currently dump_tasks() might call printk() for many thousands times under
>> RCU, which might take many minutes for slow consoles.
> 
> Is is even wise to enable dumping tasks on systems with thousands of
> tasks and slow consoles? I mean you still have to call printk that is
> slow that many times. So why do we actually care? Because of RCU stall
> warnings?
> 

That's a stupid question. WE DO CARE.
We are making efforts to avoid calling printk() on each thread group (e.g.

  commit 0c1b2d783cf34324 ("mm/oom_kill: remove the wrong fatal_signal_pending() check in oom_kill_process()")
  commit b2b469939e934587 ("proc, oom: do not report alien mms when setting oom_score_adj"))

) under RCU and this patch is one of them (except that we can't remove
printk() for dump_tasks() case).
Michal Hocko July 18, 2019, 2:02 p.m. UTC | #5
On Thu 18-07-19 22:50:14, Tetsuo Handa wrote:
> On 2019/07/18 17:30, Michal Hocko wrote:
> > On Wed 17-07-19 19:55:01, Tetsuo Handa wrote:
> >> Currently dump_tasks() might call printk() for many thousands times under
> >> RCU, which might take many minutes for slow consoles.
> > 
> > Is is even wise to enable dumping tasks on systems with thousands of
> > tasks and slow consoles? I mean you still have to call printk that is
> > slow that many times. So why do we actually care? Because of RCU stall
> > warnings?
> > 
> 
> That's a stupid question. WE DO CARE.

-ENOARGUMENT

> We are making efforts for avoid calling printk() on each thread group (e.g.
> 
>   commit 0c1b2d783cf34324 ("mm/oom_kill: remove the wrong fatal_signal_pending() check in oom_kill_process()")

removes fatal_signal_pending rather than focusing on printk

>   commit b2b469939e934587 ("proc, oom: do not report alien mms when setting oom_score_adj"))

removes a printk of a dubious value.

> ) under RCU and this patch is one of them (except that we can't remove
> printk() for dump_tasks() case).

No, this one adds complexity for something that is not clearly a huge
win, or the win is not explained properly.
Tetsuo Handa July 20, 2019, 11:29 a.m. UTC | #6
On 2019/07/18 23:02, Michal Hocko wrote:
> On Thu 18-07-19 22:50:14, Tetsuo Handa wrote:
>> On 2019/07/18 17:30, Michal Hocko wrote:
>>> On Wed 17-07-19 19:55:01, Tetsuo Handa wrote:
>>>> Currently dump_tasks() might call printk() for many thousands times under
>>>> RCU, which might take many minutes for slow consoles.
>>>
>>> Is is even wise to enable dumping tasks on systems with thousands of
>>> tasks and slow consoles? I mean you still have to call printk that is
>>> slow that many times. So why do we actually care? Because of RCU stall
>>> warnings?
>>>
>>
>> That's a stupid question. WE DO CARE.
> 
> -ENOARGUMENT
> 
>> We are making efforts for avoid calling printk() on each thread group (e.g.
>>
>>   commit 0c1b2d783cf34324 ("mm/oom_kill: remove the wrong fatal_signal_pending() check in oom_kill_process()")
> 
> removes fatal_signal_pending rather than focusing on printk

No. Its focus is to suppress printk(), because it fixes the fatal_signal_pending()
test introduced by commit 840807a8f40bb25a ("mm/oom_kill.c: suppress unnecessary
"sharing same memory" message").

> 
>>   commit b2b469939e934587 ("proc, oom: do not report alien mms when setting oom_score_adj"))
> 
> removes a printk of a dubious value.

No. Its focus is to remove printk(), because that printk() allows the system to
stall in TASK_UNINTERRUPTIBLE for 44 days (even without slow consoles), in addition
to an RCU stall of 2 minutes.

> 
>> ) under RCU and this patch is one of them (except that we can't remove
>> printk() for dump_tasks() case).
> 
> No, this one adds a complexity for something that is not clearly a huge
> win or the win is not explained properly.
> 

The win is already explained properly by the past commits. Avoiding RCU stalls
(even without slow consoles) is a clear win. The duration of the RCU stall avoided
by this patch is roughly the same as with commit b2b469939e934587.

We haven't succeeded in making printk() asynchronous (and potentially we won't
succeed in making printk() asynchronous, because we need synchronous printk()
when something critical is happening outside of out_of_memory()). Thus,
moving printk() outside of the RCU section is a clear win we can make for now.
Andrew Morton July 23, 2019, 11:14 p.m. UTC | #7
On Wed, 17 Jul 2019 19:55:01 +0900 Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> wrote:

> Currently dump_tasks() might call printk() for many thousands times under
> RCU, which might take many minutes for slow consoles. Therefore, split
> dump_tasks() into three stages; take a snapshot of possible OOM victim
> candidates under RCU, dump the snapshot from reschedulable context, and
> destroy the snapshot.
> 
> In a future patch, the first stage would be moved to select_bad_process()
> and the third stage would be moved to after oom_kill_process(), and will
> simplify refcount handling.

Looks straightforward enough.

>
> ...
>
>  static void dump_tasks(struct oom_control *oc)
>  {
> -	pr_info("Tasks state (memory values in pages):\n");
> -	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
> +	static LIST_HEAD(list);

I don't think this needs to be static?

> +	struct task_struct *p;
> +	struct task_struct *t;
>  
>  	if (is_memcg_oom(oc))
> -		mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
> +		mem_cgroup_scan_tasks(oc->memcg, add_candidate_task, &list);
>  	else {
> -		struct task_struct *p;
> -
>  		rcu_read_lock();
>  		for_each_process(p)
> -			dump_task(p, oc);
> +			add_candidate_task(p, &list);
>  		rcu_read_unlock();
>  	}
> +	pr_info("Tasks state (memory values in pages):\n");
> +	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
> +	list_for_each_entry(p, &list, oom_victim_list) {
> +		cond_resched();
> +		/* p may not have freeable memory in nodemask */
> +		if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
> +			continue;
> +		/* All of p's threads might have already detached their mm's. */
> +		t = find_lock_task_mm(p);
> +		if (!t)
> +			continue;
> +		pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
> +			t->pid, from_kuid(&init_user_ns, task_uid(t)),
> +			t->tgid, t->mm->total_vm, get_mm_rss(t->mm),
> +			mm_pgtables_bytes(t->mm),
> +			get_mm_counter(t->mm, MM_SWAPENTS),
> +			t->signal->oom_score_adj, t->comm);
> +		task_unlock(t);
> +	}
> +	list_for_each_entry_safe(p, t, &list, oom_victim_list) {
> +		list_del(&p->oom_victim_list);
> +		put_task_struct(p);
> +	}
>  }
Tetsuo Handa July 24, 2019, 1:47 a.m. UTC | #8
On 2019/07/24 8:14, Andrew Morton wrote:
> On Wed, 17 Jul 2019 19:55:01 +0900 Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> wrote:
> 
>> Currently dump_tasks() might call printk() for many thousands times under
>> RCU, which might take many minutes for slow consoles. Therefore, split
>> dump_tasks() into three stages; take a snapshot of possible OOM victim
>> candidates under RCU, dump the snapshot from reschedulable context, and
>> destroy the snapshot.
>>
>> In a future patch, the first stage would be moved to select_bad_process()
>> and the third stage would be moved to after oom_kill_process(), and will
>> simplify refcount handling.
> 
> Look straightforward enough.

Thanks.

> 
>>
>> ...
>>
>>  static void dump_tasks(struct oom_control *oc)
>>  {
>> -	pr_info("Tasks state (memory values in pages):\n");
>> -	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
>> +	static LIST_HEAD(list);
> 
> I don't think this needs to be static?

Well, the OOM killer is serialized by the oom_lock mutex.
Thus, I guess we should reduce stack usage where reasonable.
For now you can drop this "static" if you want. But this
variable will, after all, be moved outside of this function
by a future patch...

> 
>> +	struct task_struct *p;
>> +	struct task_struct *t;
>>
Michal Hocko Sept. 21, 2019, 8:30 p.m. UTC | #9
On Fri 20-09-19 17:10:42, Andrew Morton wrote:
> On Sat, 20 Jul 2019 20:29:23 +0900 Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> wrote:
> 
> > > 
> > >> ) under RCU and this patch is one of them (except that we can't remove
> > >> printk() for dump_tasks() case).
> > > 
> > > No, this one adds a complexity for something that is not clearly a huge
> > > win or the win is not explained properly.
> > > 
> > 
> > The win is already explained properly by the past commits. Avoiding RCU stalls
> > (even without slow consoles) is a clear win. The duration of RCU stall avoided
> > by this patch is roughly the same with commit b2b469939e934587.
> > 
> > We haven't succeeded making printk() asynchronous (and potentially we won't
> > succeed making printk() asynchronous because we need synchronous printk()
> > when something critical is undergoing outside of out_of_memory()). Thus,
> > bringing printk() to outside of RCU section is a clear win we can make for now.
> 
> It's actually not a complex patch and moving all that printing outside
> the rcu section makes sense.  So I'll sit on the patch for a few more
> days but am inclined to send it upstream.

Look, I am quite tired of arguing about this and other changes following
a similar pattern. In short, problematic code is shuffled around and
pretends to solve some problem. In this particular case it is an RCU stall,
which in itself is not a fatal condition. Sure, it sucks, and the primary
reason is that printk can take way too long. This is something that is
currently a WIP to be addressed. What is more important, though, is that there
is no sign of any _real world_ workload that would require a quick workaround
to justify a hacky stop-gap solution.

So again, why do we want to add more code for something that is not
clearly a real-life problem and that will add a maintenance burden
in the future?

Patch
diff mbox series

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8dc1811..cb6696b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1246,6 +1246,7 @@  struct task_struct {
 #ifdef CONFIG_MMU
 	struct task_struct		*oom_reaper_list;
 #endif
+	struct list_head		oom_victim_list;
 #ifdef CONFIG_VMAP_STACK
 	struct vm_struct		*stack_vm_area;
 #endif
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index eda2e2a..bd22ca0 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -377,36 +377,13 @@  static void select_bad_process(struct oom_control *oc)
 	}
 }
 
-static int dump_task(struct task_struct *p, void *arg)
-{
-	struct oom_control *oc = arg;
-	struct task_struct *task;
-
-	if (oom_unkillable_task(p))
-		return 0;
-
-	/* p may not have freeable memory in nodemask */
-	if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
-		return 0;
 
-	task = find_lock_task_mm(p);
-	if (!task) {
-		/*
-		 * This is a kthread or all of p's threads have already
-		 * detached their mm's.  There's no need to report
-		 * them; they can't be oom killed anyway.
-		 */
-		return 0;
+static int add_candidate_task(struct task_struct *p, void *arg)
+{
+	if (!oom_unkillable_task(p)) {
+		get_task_struct(p);
+		list_add_tail(&p->oom_victim_list, (struct list_head *) arg);
 	}
-
-	pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
-		task->pid, from_kuid(&init_user_ns, task_uid(task)),
-		task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
-		mm_pgtables_bytes(task->mm),
-		get_mm_counter(task->mm, MM_SWAPENTS),
-		task->signal->oom_score_adj, task->comm);
-	task_unlock(task);
-
 	return 0;
 }
 
@@ -422,19 +399,41 @@  static int dump_task(struct task_struct *p, void *arg)
  */
 static void dump_tasks(struct oom_control *oc)
 {
-	pr_info("Tasks state (memory values in pages):\n");
-	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
+	static LIST_HEAD(list);
+	struct task_struct *p;
+	struct task_struct *t;
 
 	if (is_memcg_oom(oc))
-		mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
+		mem_cgroup_scan_tasks(oc->memcg, add_candidate_task, &list);
 	else {
-		struct task_struct *p;
-
 		rcu_read_lock();
 		for_each_process(p)
-			dump_task(p, oc);
+			add_candidate_task(p, &list);
 		rcu_read_unlock();
 	}
+	pr_info("Tasks state (memory values in pages):\n");
+	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
+	list_for_each_entry(p, &list, oom_victim_list) {
+		cond_resched();
+		/* p may not have freeable memory in nodemask */
+		if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
+			continue;
+		/* All of p's threads might have already detached their mm's. */
+		t = find_lock_task_mm(p);
+		if (!t)
+			continue;
+		pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
+			t->pid, from_kuid(&init_user_ns, task_uid(t)),
+			t->tgid, t->mm->total_vm, get_mm_rss(t->mm),
+			mm_pgtables_bytes(t->mm),
+			get_mm_counter(t->mm, MM_SWAPENTS),
+			t->signal->oom_score_adj, t->comm);
+		task_unlock(t);
+	}
+	list_for_each_entry_safe(p, t, &list, oom_victim_list) {
+		list_del(&p->oom_victim_list);
+		put_task_struct(p);
+	}
 }
 
 static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)