diff mbox series

mm, oom: avoid printk() iteration under RCU

Message ID 1563360901-8277-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp (mailing list archive)
State New, archived
Headers show
Series mm, oom: avoid printk() iteration under RCU | expand

Commit Message

Tetsuo Handa July 17, 2019, 10:55 a.m. UTC
Currently dump_tasks() might call printk() for many thousands times under
RCU, which might take many minutes for slow consoles. Therefore, split
dump_tasks() into three stages; take a snapshot of possible OOM victim
candidates under RCU, dump the snapshot from reschedulable context, and
destroy the snapshot.

In a future patch, the first stage would be moved to select_bad_process()
and the third stage would be moved to after oom_kill_process(), and will
simplify refcount handling.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Roman Gushchin <guro@fb.com>
---
 include/linux/sched.h |  1 +
 mm/oom_kill.c         | 67 +++++++++++++++++++++++++--------------------------
 2 files changed, 34 insertions(+), 34 deletions(-)

Comments

Shakeel Butt July 18, 2019, 12:31 a.m. UTC | #1
On Wed, Jul 17, 2019 at 3:55 AM Tetsuo Handa
<penguin-kernel@i-love.sakura.ne.jp> wrote:
>
> Currently dump_tasks() might call printk() for many thousands times under
> RCU, which might take many minutes for slow consoles. Therefore, split
> dump_tasks() into three stages; take a snapshot of possible OOM victim
> candidates under RCU, dump the snapshot from reschedulable context, and
> destroy the snapshot.
>
> In a future patch, the first stage would be moved to select_bad_process()
> and the third stage would be moved to after oom_kill_process(), and will
> simplify refcount handling.
>
> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> Cc: Shakeel Butt <shakeelb@google.com>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: Roman Gushchin <guro@fb.com>
> ---
>  include/linux/sched.h |  1 +
>  mm/oom_kill.c         | 67 +++++++++++++++++++++++++--------------------------
>  2 files changed, 34 insertions(+), 34 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 8dc1811..cb6696b 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1246,6 +1246,7 @@ struct task_struct {
>  #ifdef CONFIG_MMU
>         struct task_struct              *oom_reaper_list;
>  #endif
> +       struct list_head                oom_victim_list;

Shouldn't there be INIT_LIST_HEAD(&tsk->oom_victim_list) somewhere?

>  #ifdef CONFIG_VMAP_STACK
>         struct vm_struct                *stack_vm_area;
>  #endif
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index eda2e2a..bd22ca0 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -377,36 +377,13 @@ static void select_bad_process(struct oom_control *oc)
>         }
>  }
>
> -static int dump_task(struct task_struct *p, void *arg)
> -{
> -       struct oom_control *oc = arg;
> -       struct task_struct *task;
> -
> -       if (oom_unkillable_task(p))
> -               return 0;
> -
> -       /* p may not have freeable memory in nodemask */
> -       if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
> -               return 0;
>
> -       task = find_lock_task_mm(p);
> -       if (!task) {
> -               /*
> -                * This is a kthread or all of p's threads have already
> -                * detached their mm's.  There's no need to report
> -                * them; they can't be oom killed anyway.
> -                */
> -               return 0;
> +static int add_candidate_task(struct task_struct *p, void *arg)
> +{
> +       if (!oom_unkillable_task(p)) {
> +               get_task_struct(p);
> +               list_add_tail(&p->oom_victim_list, (struct list_head *) arg);
>         }
> -
> -       pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
> -               task->pid, from_kuid(&init_user_ns, task_uid(task)),
> -               task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
> -               mm_pgtables_bytes(task->mm),
> -               get_mm_counter(task->mm, MM_SWAPENTS),
> -               task->signal->oom_score_adj, task->comm);
> -       task_unlock(task);
> -
>         return 0;
>  }
>
> @@ -422,19 +399,41 @@ static int dump_task(struct task_struct *p, void *arg)
>   */
>  static void dump_tasks(struct oom_control *oc)
>  {
> -       pr_info("Tasks state (memory values in pages):\n");
> -       pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
> +       static LIST_HEAD(list);
> +       struct task_struct *p;
> +       struct task_struct *t;
>
>         if (is_memcg_oom(oc))
> -               mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
> +               mem_cgroup_scan_tasks(oc->memcg, add_candidate_task, &list);
>         else {
> -               struct task_struct *p;
> -
>                 rcu_read_lock();
>                 for_each_process(p)
> -                       dump_task(p, oc);
> +                       add_candidate_task(p, &list);
>                 rcu_read_unlock();
>         }
> +       pr_info("Tasks state (memory values in pages):\n");
> +       pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
> +       list_for_each_entry(p, &list, oom_victim_list) {
> +               cond_resched();
> +               /* p may not have freeable memory in nodemask */
> +               if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
> +                       continue;
> +               /* All of p's threads might have already detached their mm's. */
> +               t = find_lock_task_mm(p);
> +               if (!t)
> +                       continue;
> +               pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
> +                       t->pid, from_kuid(&init_user_ns, task_uid(t)),
> +                       t->tgid, t->mm->total_vm, get_mm_rss(t->mm),
> +                       mm_pgtables_bytes(t->mm),
> +                       get_mm_counter(t->mm, MM_SWAPENTS),
> +                       t->signal->oom_score_adj, t->comm);
> +               task_unlock(t);
> +       }
> +       list_for_each_entry_safe(p, t, &list, oom_victim_list) {
> +               list_del(&p->oom_victim_list);
> +               put_task_struct(p);
> +       }
>  }
>
>  static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
> --
> 1.8.3.1
>
Michal Hocko July 18, 2019, 8:30 a.m. UTC | #2
On Wed 17-07-19 19:55:01, Tetsuo Handa wrote:
> Currently dump_tasks() might call printk() for many thousands times under
> RCU, which might take many minutes for slow consoles.

Is it even wise to enable dumping tasks on systems with thousands of
tasks and slow consoles? I mean you still have to call printk that is
slow that many times. So why do we actually care? Because of RCU stall
warnings?
Tetsuo Handa July 18, 2019, 10:22 a.m. UTC | #3
On 2019/07/18 9:31, Shakeel Butt wrote:
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index 8dc1811..cb6696b 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -1246,6 +1246,7 @@ struct task_struct {
>>  #ifdef CONFIG_MMU
>>         struct task_struct              *oom_reaper_list;
>>  #endif
>> +       struct list_head                oom_victim_list;
> 
> Shouldn't there be INIT_LIST_HEAD(&tsk->oom_victim_list) somewhere?

Yes if we need to use list_empty(&tsk->oom_victim_list) test.
This patch does not use such test; tsk->oom_victim_list is initialized
by list_add_tail() inside the OOM killer.

> 
>>  #ifdef CONFIG_VMAP_STACK
>>         struct vm_struct                *stack_vm_area;
>>  #endif
Tetsuo Handa July 18, 2019, 1:50 p.m. UTC | #4
On 2019/07/18 17:30, Michal Hocko wrote:
> On Wed 17-07-19 19:55:01, Tetsuo Handa wrote:
>> Currently dump_tasks() might call printk() for many thousands times under
>> RCU, which might take many minutes for slow consoles.
> 
> Is is even wise to enable dumping tasks on systems with thousands of
> tasks and slow consoles? I mean you still have to call printk that is
> slow that many times. So why do we actually care? Because of RCU stall
> warnings?
> 

That's a stupid question. WE DO CARE.
We are making efforts to avoid calling printk() on each thread group (e.g.

  commit 0c1b2d783cf34324 ("mm/oom_kill: remove the wrong fatal_signal_pending() check in oom_kill_process()")
  commit b2b469939e934587 ("proc, oom: do not report alien mms when setting oom_score_adj"))

) under RCU and this patch is one of them (except that we can't remove
printk() for dump_tasks() case).
Michal Hocko July 18, 2019, 2:02 p.m. UTC | #5
On Thu 18-07-19 22:50:14, Tetsuo Handa wrote:
> On 2019/07/18 17:30, Michal Hocko wrote:
> > On Wed 17-07-19 19:55:01, Tetsuo Handa wrote:
> >> Currently dump_tasks() might call printk() for many thousands times under
> >> RCU, which might take many minutes for slow consoles.
> > 
> > Is is even wise to enable dumping tasks on systems with thousands of
> > tasks and slow consoles? I mean you still have to call printk that is
> > slow that many times. So why do we actually care? Because of RCU stall
> > warnings?
> > 
> 
> That's a stupid question. WE DO CARE.

-ENOARGUMENT

> We are making efforts for avoid calling printk() on each thread group (e.g.
> 
>   commit 0c1b2d783cf34324 ("mm/oom_kill: remove the wrong fatal_signal_pending() check in oom_kill_process()")

removes fatal_signal_pending rather than focusing on printk

>   commit b2b469939e934587 ("proc, oom: do not report alien mms when setting oom_score_adj"))

removes a printk of a dubious value.

> ) under RCU and this patch is one of them (except that we can't remove
> printk() for dump_tasks() case).

No, this one adds a complexity for something that is not clearly a huge
win or the win is not explained properly.
Tetsuo Handa July 20, 2019, 11:29 a.m. UTC | #6
On 2019/07/18 23:02, Michal Hocko wrote:
> On Thu 18-07-19 22:50:14, Tetsuo Handa wrote:
>> On 2019/07/18 17:30, Michal Hocko wrote:
>>> On Wed 17-07-19 19:55:01, Tetsuo Handa wrote:
>>>> Currently dump_tasks() might call printk() for many thousands times under
>>>> RCU, which might take many minutes for slow consoles.
>>>
>>> Is is even wise to enable dumping tasks on systems with thousands of
>>> tasks and slow consoles? I mean you still have to call printk that is
>>> slow that many times. So why do we actually care? Because of RCU stall
>>> warnings?
>>>
>>
>> That's a stupid question. WE DO CARE.
> 
> -ENOARGUMENT
> 
>> We are making efforts for avoid calling printk() on each thread group (e.g.
>>
>>   commit 0c1b2d783cf34324 ("mm/oom_kill: remove the wrong fatal_signal_pending() check in oom_kill_process()")
> 
> removes fatal_signal_pending rather than focusing on printk

No. Its focus is to suppress printk(), for it fixes fatal_signal_pending() test
introduced by commit 840807a8f40bb25a ("mm/oom_kill.c: suppress unnecessary
"sharing same memory" message").

> 
>>   commit b2b469939e934587 ("proc, oom: do not report alien mms when setting oom_score_adj"))
> 
> removes a printk of a dubious value.

No. Its focus is to remove printk(), for that printk() allows the system to
TASK_UNINTERRUPTIBLE stall for 44 days (even without slow consoles) in addition
to RCU stall for 2 minutes.

> 
>> ) under RCU and this patch is one of them (except that we can't remove
>> printk() for dump_tasks() case).
> 
> No, this one adds a complexity for something that is not clearly a huge
> win or the win is not explained properly.
> 

The win is already explained properly by the past commits. Avoiding RCU stalls
(even without slow consoles) is a clear win. The duration of RCU stall avoided
by this patch is roughly the same with commit b2b469939e934587.

We haven't succeeded making printk() asynchronous (and potentially we won't
succeed making printk() asynchronous because we need synchronous printk()
when something critical is undergoing outside of out_of_memory()). Thus,
bringing printk() to outside of RCU section is a clear win we can make for now.
Andrew Morton July 23, 2019, 11:14 p.m. UTC | #7
On Wed, 17 Jul 2019 19:55:01 +0900 Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> wrote:

> Currently dump_tasks() might call printk() for many thousands times under
> RCU, which might take many minutes for slow consoles. Therefore, split
> dump_tasks() into three stages; take a snapshot of possible OOM victim
> candidates under RCU, dump the snapshot from reschedulable context, and
> destroy the snapshot.
> 
> In a future patch, the first stage would be moved to select_bad_process()
> and the third stage would be moved to after oom_kill_process(), and will
> simplify refcount handling.

Looks straightforward enough.

>
> ...
>
>  static void dump_tasks(struct oom_control *oc)
>  {
> -	pr_info("Tasks state (memory values in pages):\n");
> -	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
> +	static LIST_HEAD(list);

I don't think this needs to be static?

> +	struct task_struct *p;
> +	struct task_struct *t;
>  
>  	if (is_memcg_oom(oc))
> -		mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
> +		mem_cgroup_scan_tasks(oc->memcg, add_candidate_task, &list);
>  	else {
> -		struct task_struct *p;
> -
>  		rcu_read_lock();
>  		for_each_process(p)
> -			dump_task(p, oc);
> +			add_candidate_task(p, &list);
>  		rcu_read_unlock();
>  	}
> +	pr_info("Tasks state (memory values in pages):\n");
> +	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
> +	list_for_each_entry(p, &list, oom_victim_list) {
> +		cond_resched();
> +		/* p may not have freeable memory in nodemask */
> +		if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
> +			continue;
> +		/* All of p's threads might have already detached their mm's. */
> +		t = find_lock_task_mm(p);
> +		if (!t)
> +			continue;
> +		pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
> +			t->pid, from_kuid(&init_user_ns, task_uid(t)),
> +			t->tgid, t->mm->total_vm, get_mm_rss(t->mm),
> +			mm_pgtables_bytes(t->mm),
> +			get_mm_counter(t->mm, MM_SWAPENTS),
> +			t->signal->oom_score_adj, t->comm);
> +		task_unlock(t);
> +	}
> +	list_for_each_entry_safe(p, t, &list, oom_victim_list) {
> +		list_del(&p->oom_victim_list);
> +		put_task_struct(p);
> +	}
>  }
Tetsuo Handa July 24, 2019, 1:47 a.m. UTC | #8
On 2019/07/24 8:14, Andrew Morton wrote:
> On Wed, 17 Jul 2019 19:55:01 +0900 Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> wrote:
> 
>> Currently dump_tasks() might call printk() for many thousands times under
>> RCU, which might take many minutes for slow consoles. Therefore, split
>> dump_tasks() into three stages; take a snapshot of possible OOM victim
>> candidates under RCU, dump the snapshot from reschedulable context, and
>> destroy the snapshot.
>>
>> In a future patch, the first stage would be moved to select_bad_process()
>> and the third stage would be moved to after oom_kill_process(), and will
>> simplify refcount handling.
> 
> Look straightforward enough.

Thanks.

> 
>>
>> ...
>>
>>  static void dump_tasks(struct oom_control *oc)
>>  {
>> -	pr_info("Tasks state (memory values in pages):\n");
>> -	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
>> +	static LIST_HEAD(list);
> 
> I don't think this needs to be static?

Well, the OOM killer is serialized by oom_lock mutex.
Thus, I guess we should reduce stack usage where reasonable.
For now you can drop this "static" if you want. But this
variable will be after all moved to outside of this function
by a future patch...

> 
>> +	struct task_struct *p;
>> +	struct task_struct *t;
>>
Michal Hocko Sept. 21, 2019, 8:30 p.m. UTC | #9
On Fri 20-09-19 17:10:42, Andrew Morton wrote:
> On Sat, 20 Jul 2019 20:29:23 +0900 Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> wrote:
> 
> > > 
> > >> ) under RCU and this patch is one of them (except that we can't remove
> > >> printk() for dump_tasks() case).
> > > 
> > > No, this one adds a complexity for something that is not clearly a huge
> > > win or the win is not explained properly.
> > > 
> > 
> > The win is already explained properly by the past commits. Avoiding RCU stalls
> > (even without slow consoles) is a clear win. The duration of RCU stall avoided
> > by this patch is roughly the same with commit b2b469939e934587.
> > 
> > We haven't succeeded making printk() asynchronous (and potentially we won't
> > succeed making printk() asynchronous because we need synchronous printk()
> > when something critical is undergoing outside of out_of_memory()). Thus,
> > bringing printk() to outside of RCU section is a clear win we can make for now.
> 
> It's actually not a complex patch and moving all that printing outside
> the rcu section makes sense.  So I'll sit on the patch for a few more
> days but am inclined to send it upstream.

Look, I am quite tired of arguing about this and other changes following
the similar pattern. In short a problematic code is shuffled around and
pretend to solve some problem. In this particular case it is a RCU stall
which in itself is not a fatal condition. Sure it sucks and the primary
reason is that printk can take way too long. This is something that is
currently a WIP to be addressed. What is more important though there is no
sign of any _real world_ workload that would require a quick workaround
to justify a hacky stop gap solution.

So again, why do we want to add more code for something which is not
clear to be a real life problem and that will add a maintenance burden
for future?
Michal Hocko Sept. 22, 2019, 6:20 a.m. UTC | #10
On Sun 22-09-19 08:47:31, Tetsuo Handa wrote:
> On 2019/09/22 5:30, Michal Hocko wrote:
> > On Fri 20-09-19 17:10:42, Andrew Morton wrote:
> >> On Sat, 20 Jul 2019 20:29:23 +0900 Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> wrote:
> >>
> >>>>
> >>>>> ) under RCU and this patch is one of them (except that we can't remove
> >>>>> printk() for dump_tasks() case).
> >>>>
> >>>> No, this one adds a complexity for something that is not clearly a huge
> >>>> win or the win is not explained properly.
> >>>>
> >>>
> >>> The win is already explained properly by the past commits. Avoiding RCU stalls
> >>> (even without slow consoles) is a clear win. The duration of RCU stall avoided
> >>> by this patch is roughly the same with commit b2b469939e934587.
> >>>
> >>> We haven't succeeded making printk() asynchronous (and potentially we won't
> >>> succeed making printk() asynchronous because we need synchronous printk()
> >>> when something critical is undergoing outside of out_of_memory()). Thus,
> >>> bringing printk() to outside of RCU section is a clear win we can make for now.
> >>
> >> It's actually not a complex patch and moving all that printing outside
> >> the rcu section makes sense.  So I'll sit on the patch for a few more
> >> days but am inclined to send it upstream.
> > 
> > Look, I am quite tired of arguing about this and other changes following
> > the similar pattern. In short a problematic code is shuffled around and
> > pretend to solve some problem. In this particular case it is a RCU stall
> > which in itself is not a fatal condition. Sure it sucks and the primary
> > reason is that printk can take way too long. This is something that is
> > currently a WIP to be address. What is more important though there is no
> > sign of any _real world_ workload that would require a quick workaround
> > to justify a hacky stop gap solution.
> > 
> > So again, why do we want to add more code for something which is not
> > clear to be a real life problem and that will add a maintenance burden
> > for future?
> > 
> 
> Enqueueing zillion printk() lines from dump_tasks() will overflow printk
> buffer (i.e. leads to lost messages) if OOM killer messages were printed
> asynchronously. I don't think that making printk() asynchronous will solve
> this problem. I repeat again; there is no better solution than "printk()
> users are careful not to exhaust the printk buffer". This patch is the
> first step towards avoiding thoughtless printk().

Irrelevant because this patch doesn't reduce the amount of output.

> Delay from dump_tasks() not only affects a thread holding oom_lock but also
> other threads which are directly doing concurrent allocation requests or
> indirectly waiting for the thread holding oom_lock. Your "it is a RCU stall
> which in itself is not a fatal condition" is underestimating the _real world_
> problems (e.g. "delay can trigger watchdog timeout and cause the system to
> reboot even if the administrator does not want the system to reboot").

Please back your claims by real world examples.
Michal Hocko Sept. 23, 2019, 8:23 a.m. UTC | #11
On Sun 22-09-19 20:30:51, Tetsuo Handa wrote:
> On 2019/09/22 15:20, Michal Hocko wrote:
> > On Sun 22-09-19 08:47:31, Tetsuo Handa wrote:
> >> On 2019/09/22 5:30, Michal Hocko wrote:
> >>> On Fri 20-09-19 17:10:42, Andrew Morton wrote:
> >>>> On Sat, 20 Jul 2019 20:29:23 +0900 Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> wrote:
> >>>>
> >>>>>>
> >>>>>>> ) under RCU and this patch is one of them (except that we can't remove
> >>>>>>> printk() for dump_tasks() case).
> >>>>>>
> >>>>>> No, this one adds a complexity for something that is not clearly a huge
> >>>>>> win or the win is not explained properly.
> >>>>>>
> >>>>>
> >>>>> The win is already explained properly by the past commits. Avoiding RCU stalls
> >>>>> (even without slow consoles) is a clear win. The duration of RCU stall avoided
> >>>>> by this patch is roughly the same with commit b2b469939e934587.
> >>>>>
> >>>>> We haven't succeeded making printk() asynchronous (and potentially we won't
> >>>>> succeed making printk() asynchronous because we need synchronous printk()
> >>>>> when something critical is undergoing outside of out_of_memory()). Thus,
> >>>>> bringing printk() to outside of RCU section is a clear win we can make for now.
> >>>>
> >>>> It's actually not a complex patch and moving all that printing outside
> >>>> the rcu section makes sense.  So I'll sit on the patch for a few more
> >>>> days but am inclined to send it upstream.
> >>>
> >>> Look, I am quite tired of arguing about this and other changes following
> >>> the similar pattern. In short a problematic code is shuffled around and
> >>> pretend to solve some problem. In this particular case it is a RCU stall
> >>> which in itself is not a fatal condition. Sure it sucks and the primary
> >>> reason is that printk can take way too long. This is something that is
> >>> currently a WIP to be address. What is more important though there is no
> >>> sign of any _real world_ workload that would require a quick workaround
> >>> to justify a hacky stop gap solution.
> >>>
> >>> So again, why do we want to add more code for something which is not
> >>> clear to be a real life problem and that will add a maintenance burden
> >>> for future?
> >>>
> >>
> >> Enqueueing zillion printk() lines from dump_tasks() will overflow printk
> >> buffer (i.e. leads to lost messages) if OOM killer messages were printed
> >> asynchronously. I don't think that making printk() asynchronous will solve
> >> this problem. I repeat again; there is no better solution than "printk()
> >> users are careful not to exhaust the printk buffer". This patch is the
> >> first step towards avoiding thoughtless printk().
> > 
> > Irrelevant because this patch doesn't reduce the amount of output.
> 
> This patch is just a temporary change before applying
> https://lkml.kernel.org/r/7de2310d-afbd-e616-e83a-d75103b986c6@i-love.sakura.ne.jp and
> https://lkml.kernel.org/r/57be50b2-a97a-e559-e4bd-10d923895f83@i-love.sakura.ne.jp .
>
> Show your solution by patch instead of ignoring or nacking.

I simply suggest the most trivial patch which doesn't change any single
line of code.

This and the two discussion referenced by you simply confirm that a)
you didn't bother to think your change through for other potential
corner cases and b) add even more code in order to behave semi-sane.
 
> >> Delay from dump_tasks() not only affects a thread holding oom_lock but also
> >> other threads which are directly doing concurrent allocation requests or
> >> indirectly waiting for the thread holding oom_lock. Your "it is a RCU stall
> >> which in itself is not a fatal condition" is underestimating the _real world_
> >> problems (e.g. "delay can trigger watchdog timeout and cause the system to
> >> reboot even if the administrator does not want the system to reboot").
> > 
> > Please back your claims by real world examples.
> > 
> 
> People have to use /proc/sys/vm/oom_dump_tasks == 0 (and give up obtaining some
> clue) because they worry stalls caused by /proc/sys/vm/oom_dump_tasks != 0 while
> they have to use /proc/sys/vm/panic_on_oom == 0 because they don't want the down
> time caused by rebooting. And such situation cannot be solved unless we solve stalls
> caused by /proc/sys/vm/oom_dump_tasks != 0. I'm working at a support center and
> I have to be able to figure out the system's state, but I have neither environment
> to run real world workloads nor control of customer's environments to enforce
> /proc/sys/vm/oom_dump_tasks != 0.
> 
> In short, your "real world" requirement is a catch-22 problem.

I am pretty sure this would be less of a catch-22 problem if you had
more actual arguments at hands rather than constant hand waving. I have
told you many times and I will repeat one more time, and hopefully won't
have to again, even if there are issues in the code we always have to
weigh cost vs. benefits. If no real workloads are hitting these problems
while the fix in question is non-trivial, adds a maintenance burden or
even worse undermine the functionality (and dump_tasks printed at an
arbitrary time after the actual oom while you keep references to
task_structs really could be perceived that way) then a patch is simply
not worth it.

There are exceptions to that of course. If a more complex solution would
lead to a more robust code or functionality that other parts of the
kernel could benefit then this would be certainly an argument to weigh
in as well. E.g. improving tasks iteration to release rcu lock to yield
etc, improving printk etc.

I completely see how stress testing corner cases is useful and how it
might help the code in general but solely focusing on this testing is a
free one way ticket to unmaintainable mess.

This is my last email in this thread.
diff mbox series

Patch

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8dc1811..cb6696b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1246,6 +1246,7 @@  struct task_struct {
 #ifdef CONFIG_MMU
 	struct task_struct		*oom_reaper_list;
 #endif
+	struct list_head		oom_victim_list;
 #ifdef CONFIG_VMAP_STACK
 	struct vm_struct		*stack_vm_area;
 #endif
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index eda2e2a..bd22ca0 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -377,36 +377,13 @@  static void select_bad_process(struct oom_control *oc)
 	}
 }
 
-static int dump_task(struct task_struct *p, void *arg)
-{
-	struct oom_control *oc = arg;
-	struct task_struct *task;
-
-	if (oom_unkillable_task(p))
-		return 0;
-
-	/* p may not have freeable memory in nodemask */
-	if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
-		return 0;
 
-	task = find_lock_task_mm(p);
-	if (!task) {
-		/*
-		 * This is a kthread or all of p's threads have already
-		 * detached their mm's.  There's no need to report
-		 * them; they can't be oom killed anyway.
-		 */
-		return 0;
+static int add_candidate_task(struct task_struct *p, void *arg)
+{
+	if (!oom_unkillable_task(p)) {
+		get_task_struct(p);
+		list_add_tail(&p->oom_victim_list, (struct list_head *) arg);
 	}
-
-	pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
-		task->pid, from_kuid(&init_user_ns, task_uid(task)),
-		task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
-		mm_pgtables_bytes(task->mm),
-		get_mm_counter(task->mm, MM_SWAPENTS),
-		task->signal->oom_score_adj, task->comm);
-	task_unlock(task);
-
 	return 0;
 }
 
@@ -422,19 +399,41 @@  static int dump_task(struct task_struct *p, void *arg)
  */
 static void dump_tasks(struct oom_control *oc)
 {
-	pr_info("Tasks state (memory values in pages):\n");
-	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
+	static LIST_HEAD(list);
+	struct task_struct *p;
+	struct task_struct *t;
 
 	if (is_memcg_oom(oc))
-		mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
+		mem_cgroup_scan_tasks(oc->memcg, add_candidate_task, &list);
 	else {
-		struct task_struct *p;
-
 		rcu_read_lock();
 		for_each_process(p)
-			dump_task(p, oc);
+			add_candidate_task(p, &list);
 		rcu_read_unlock();
 	}
+	pr_info("Tasks state (memory values in pages):\n");
+	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
+	list_for_each_entry(p, &list, oom_victim_list) {
+		cond_resched();
+		/* p may not have freeable memory in nodemask */
+		if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
+			continue;
+		/* All of p's threads might have already detached their mm's. */
+		t = find_lock_task_mm(p);
+		if (!t)
+			continue;
+		pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
+			t->pid, from_kuid(&init_user_ns, task_uid(t)),
+			t->tgid, t->mm->total_vm, get_mm_rss(t->mm),
+			mm_pgtables_bytes(t->mm),
+			get_mm_counter(t->mm, MM_SWAPENTS),
+			t->signal->oom_score_adj, t->comm);
+		task_unlock(t);
+	}
+	list_for_each_entry_safe(p, t, &list, oom_victim_list) {
+		list_del(&p->oom_victim_list);
+		put_task_struct(p);
+	}
 }
 
 static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)