Message ID | 1563940476-6162-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp (mailing list archive)
---|---
State | New, archived |
Series | mm, oom: simplify task's refcount handling
On Wed 24-07-19 12:54:36, Tetsuo Handa wrote:
> Currently out_of_memory() is full of get_task_struct()/put_task_struct()
> calls. Since "mm, oom: avoid printk() iteration under RCU" introduced
> a list for holding a snapshot of all OOM victim candidates, let's share
> that list for select_bad_process() and oom_kill_process() in order to
> simplify task's refcount handling.
>
> As a result of this patch, get_task_struct()/put_task_struct() calls
> in out_of_memory() are reduced to only 2 times respectively.

This is probably a matter of taste, but the diffstat suggests to me that
the simplification is not all that great. On the other hand, this makes
the oom handling even more tricky and harder for potential further
development - e.g. if we ever need to break the global lock down in the
future, this would be another obstacle on the way. While potential
development might be too theoretical, the benefit of the patch is not
really clear to me. The task_struct reference counting is not a really
unusual operation, and there is nothing really scary that we do with it
here. We already have to go the extra mile wrt. task_lock, so careful
reference counting doesn't really jump out.

That being said, I do not think this patch gives any improvement.

[...]
On 2019/07/24 15:41, Michal Hocko wrote:
> On Wed 24-07-19 12:54:36, Tetsuo Handa wrote:
>> Currently out_of_memory() is full of get_task_struct()/put_task_struct()
>> calls. Since "mm, oom: avoid printk() iteration under RCU" introduced
>> a list for holding a snapshot of all OOM victim candidates, let's share
>> that list for select_bad_process() and oom_kill_process() in order to
>> simplify task's refcount handling.
>>
>> As a result of this patch, get_task_struct()/put_task_struct() calls
>> in out_of_memory() are reduced to only 2 times respectively.
>
> This is probably a matter of taste, but the diffstat suggests to me that
> the simplification is not all that great. On the other hand, this makes
> the oom handling even more tricky and harder for potential further
> development - e.g. if we ever need to break the global lock down in the
> future, this would be another obstacle on the way.

If we want to remove oom_lock serialization, we can implement it by doing
INIT_LIST_HEAD(&p->oom_candidate) upon creating a thread and checking
list_empty(&p->oom_candidate) with p->task_lock (or something) held when
adding to a local on-stack "oom_candidate_list" list stored in "oc" (a
sketch follows below). But we do not want to jumble concurrent OOM killer
messages. Since it is dump_header() which takes the majority of the time,
synchronous printk() will be the real obstacle on the way. I've tried
removing oom_lock serialization, and got commit cbae05d32ff68233
("printk: Pass caller information to log_store()."). The OOM killer is
calling printk() in a manner that will jumble concurrent OOM killer
messages...

> While potential development might be too theoretical, the benefit of
> the patch is not really clear to me. The task_struct reference counting
> is not a really unusual operation, and there is nothing really scary
> that we do with it here. We already have to go the extra mile wrt.
> task_lock, so careful reference counting doesn't really jump out.
>
> That being said, I do not think this patch gives any improvement.

This patch avoids RCU during select_bad_process(). This patch allows the
possibility of doing reschedulable things there; e.g. directly reaping
only a portion of an OOM victim's memory rather than wasting CPU
resources by spinning until MMF_OOM_SKIP is set by the OOM reaper.
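A minimal sketch of the oom_lock-splitting idea described above
(illustrative only: the on-stack list argument, the task_lock()-based
claiming and the list_del_init() on unlink are assumptions drawn from
this mail, not code from the posted patch):

/*
 * Hypothetical sketch: p->oom_candidate is assumed to be
 * INIT_LIST_HEAD()ed at thread creation, so list_empty() means
 * "not yet claimed by any concurrent OOM invocation".
 */
static int add_candidate_task(struct task_struct *p, void *arg)
{
	struct list_head *list = arg;	/* on-stack list stored in oc */

	task_lock(p);
	/* Claim the task only if no other OOM invocation holds it. */
	if (list_empty(&p->oom_candidate) && !oom_unkillable_task(p)) {
		get_task_struct(p);
		list_add_tail(&p->oom_candidate, list);
	}
	task_unlock(p);
	return 0;
}

static void unlink_oom_candidates(struct list_head *list)
{
	struct task_struct *p;
	struct task_struct *t;

	list_for_each_entry_safe(p, t, list, oom_candidate) {
		task_lock(p);
		list_del_init(&p->oom_candidate);	/* claimable again */
		task_unlock(p);
		put_task_struct(p);
	}
}

Each invocation would then walk only its own snapshot, so the global
list (and the oom_lock serialization around it) would no longer be
needed for the snapshot itself; the printk() interleaving problem
mentioned above would remain.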
On Wed 24-07-19 16:37:35, Tetsuo Handa wrote:
> On 2019/07/24 15:41, Michal Hocko wrote:
[...]
> > That being said, I do not think this patch gives any improvement.
>
> This patch avoids RCU during select_bad_process().

It just shifts where the RCU is taken. Do you have any numbers to show
that this is an improvement? Basically the only potentially expensive
thing down the oom_evaluate_task path that I can see is the task_lock,
but I am not aware of a single report that this would be a contributor
to RCU stalls. I can be proven wrong, but

> This patch allows the
> possibility of doing reschedulable things there; e.g. directly reaping
> only a portion of an OOM victim's memory rather than wasting CPU
> resources by spinning until MMF_OOM_SKIP is set by the OOM reaper.

We have been through direct oom reaping before and I haven't changed my
position there. It is just too tricky to be worth it.
On 2019/07/24 17:07, Michal Hocko wrote:
> On Wed 24-07-19 16:37:35, Tetsuo Handa wrote:
>> On 2019/07/24 15:41, Michal Hocko wrote:
> [...]
>>> That being said, I do not think this patch gives any improvement.
>>
>> This patch avoids RCU during select_bad_process().
>
> It just shifts where the RCU is taken. Do you have any numbers to show
> that this is an improvement? Basically the only potentially expensive
> thing down the oom_evaluate_task path that I can see is the task_lock,
> but I am not aware of a single report that this would be a contributor
> to RCU stalls. I can be proven wrong, but

I don't have numbers (nor any intent to show numbers). What I said is
that we can do reschedulable things from select_bad_process() if future
development finds it useful to do so, because oom_evaluate_task() is now
called without RCU. For now, just a cond_resched() would be added to the
select_bad_process() iteration.

>> This patch allows the
>> possibility of doing reschedulable things there; e.g. directly reaping
>> only a portion of an OOM victim's memory rather than wasting CPU
>> resources by spinning until MMF_OOM_SKIP is set by the OOM reaper.
>
> We have been through direct oom reaping before and I haven't changed my
> position there. It is just too tricky to be worth it.

Not limited to direct OOM reaping - anything that future development
would find useful. Anyway, traversing the task list only once (as this
patch does; see the condensed flow below) allows showing a consistent
snapshot of OOM victim candidates. In other words, this patch makes sure
that the OOM victim candidates shown by dump_tasks() are exactly the
ones that select_bad_process() has evaluated, since you said that the
main purpose of the listing is to double-check it in order to understand
the OOM victim selection. This patch removes the race window in which
OOM victim candidates could be added or removed between the two
traversals.
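A condensed sketch of the resulting out_of_memory() flow (simplified
from the patch below; the sysrq, memcg and error-path details are
omitted, and "Out of memory" stands in for the real message selection):

/*
 * Simplified flow only. Both select_bad_process() and dump_tasks()
 * (called on the kill path) walk the one snapshot taken by
 * link_oom_candidates(), so the candidate set cannot change between
 * evaluation and reporting.
 */
bool out_of_memory(struct oom_control *oc)
{
	link_oom_candidates(oc);	/* single traversal, references taken */
	select_bad_process(oc);		/* evaluates exactly this snapshot */
	if (oc->chosen && oc->chosen != (void *)-1UL)
		oom_kill_process(oc, "Out of memory");	/* dump_tasks() walks the same list */
	unlink_oom_candidates();	/* references dropped in one place */
	return !!oc->chosen;
}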
Currently out_of_memory() is full of get_task_struct()/put_task_struct()
calls. Since "mm, oom: avoid printk() iteration under RCU" introduced
a list for holding a snapshot of all OOM victim candidates, let's share
that list for select_bad_process() and oom_kill_process() in order to
simplify task's refcount handling.

As a result of this patch, get_task_struct()/put_task_struct() calls
in out_of_memory() are reduced to only 2 times respectively.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: David Rientjes <rientjes@google.com>
---
 include/linux/sched.h |   2 +-
 mm/oom_kill.c         | 122 ++++++++++++++++++++++++--------------------------
 2 files changed, 60 insertions(+), 64 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 48c1a4c..4062999 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1247,7 +1247,7 @@ struct task_struct {
 #ifdef CONFIG_MMU
 	struct task_struct *oom_reaper_list;
 #endif
-	struct list_head oom_victim_list;
+	struct list_head oom_candidate;
 #ifdef CONFIG_VMAP_STACK
 	struct vm_struct *stack_vm_area;
 #endif
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 110f948..311e0e9 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -63,6 +63,7 @@
  * and mark_oom_victim
  */
 DEFINE_MUTEX(oom_lock);
+static LIST_HEAD(oom_candidate_list);
 
 static inline bool is_memcg_oom(struct oom_control *oc)
 {
@@ -167,6 +168,41 @@ static bool oom_unkillable_task(struct task_struct *p)
 	return false;
 }
 
+static int add_candidate_task(struct task_struct *p, void *unused)
+{
+	if (!oom_unkillable_task(p)) {
+		get_task_struct(p);
+		list_add_tail(&p->oom_candidate, &oom_candidate_list);
+	}
+	return 0;
+}
+
+static void link_oom_candidates(struct oom_control *oc)
+{
+	struct task_struct *p;
+
+	if (is_memcg_oom(oc))
+		mem_cgroup_scan_tasks(oc->memcg, add_candidate_task, NULL);
+	else {
+		rcu_read_lock();
+		for_each_process(p)
+			add_candidate_task(p, NULL);
+		rcu_read_unlock();
+	}
+
+}
+
+static void unlink_oom_candidates(void)
+{
+	struct task_struct *p;
+	struct task_struct *t;
+
+	list_for_each_entry_safe(p, t, &oom_candidate_list, oom_candidate) {
+		list_del(&p->oom_candidate);
+		put_task_struct(p);
+	}
+}
+
 /*
  * Print out unreclaimble slabs info when unreclaimable slabs amount is greater
  * than all user memory (LRU pages)
@@ -344,16 +380,11 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 		goto next;
 
 select:
-	if (oc->chosen)
-		put_task_struct(oc->chosen);
-	get_task_struct(task);
 	oc->chosen = task;
 	oc->chosen_points = points;
 next:
 	return 0;
 abort:
-	if (oc->chosen)
-		put_task_struct(oc->chosen);
 	oc->chosen = (void *)-1UL;
 	return 1;
 }
@@ -364,27 +395,13 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
  */
 static void select_bad_process(struct oom_control *oc)
 {
-	if (is_memcg_oom(oc))
-		mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
-	else {
-		struct task_struct *p;
-
-		rcu_read_lock();
-		for_each_process(p)
-			if (oom_evaluate_task(p, oc))
-				break;
-		rcu_read_unlock();
-	}
-}
-
+	struct task_struct *p;
 
-static int add_candidate_task(struct task_struct *p, void *arg)
-{
-	if (!oom_unkillable_task(p)) {
-		get_task_struct(p);
-		list_add_tail(&p->oom_victim_list, (struct list_head *) arg);
+	list_for_each_entry(p, &oom_candidate_list, oom_candidate) {
+		cond_resched();
+		if (oom_evaluate_task(p, oc))
+			break;
 	}
-	return 0;
 }
 
 /**
@@ -399,21 +416,12 @@ static int add_candidate_task(struct task_struct *p, void *arg)
  */
 static void dump_tasks(struct oom_control *oc)
 {
-	static LIST_HEAD(list);
 	struct task_struct *p;
 	struct task_struct *t;
 
-	if (is_memcg_oom(oc))
-		mem_cgroup_scan_tasks(oc->memcg, add_candidate_task, &list);
-	else {
-		rcu_read_lock();
-		for_each_process(p)
-			add_candidate_task(p, &list);
-		rcu_read_unlock();
-	}
 	pr_info("Tasks state (memory values in pages):\n");
 	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
-	list_for_each_entry(p, &list, oom_victim_list) {
+	list_for_each_entry(p, &oom_candidate_list, oom_candidate) {
 		cond_resched();
 		/* p may not have freeable memory in nodemask */
 		if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
@@ -430,10 +438,6 @@ static void dump_tasks(struct oom_control *oc)
 			t->signal->oom_score_adj, t->comm);
 		task_unlock(t);
 	}
-	list_for_each_entry_safe(p, t, &list, oom_victim_list) {
-		list_del(&p->oom_victim_list);
-		put_task_struct(p);
-	}
 }
 
 static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
@@ -859,17 +863,11 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
 	bool can_oom_reap = true;
 
 	p = find_lock_task_mm(victim);
-	if (!p) {
-		put_task_struct(victim);
+	if (!p)
 		return;
-	} else if (victim != p) {
-		get_task_struct(p);
-		put_task_struct(victim);
-		victim = p;
-	}
 
-	/* Get a reference to safely compare mm after task_unlock(victim) */
-	mm = victim->mm;
+	/* Get a reference to safely compare mm after task_unlock(p) */
+	mm = p->mm;
 	mmgrab(mm);
 
 	/* Raise event before sending signal: task reaper must see this */
@@ -881,16 +879,15 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
 	 * in order to prevent the OOM victim from depleting the memory
 	 * reserves from the user space under its control.
 	 */
-	do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
-	mark_oom_victim(victim);
+	do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
+	mark_oom_victim(p);
 	pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u\n",
-	       message, task_pid_nr(victim), victim->comm,
-	       K(victim->mm->total_vm),
-	       K(get_mm_counter(victim->mm, MM_ANONPAGES)),
-	       K(get_mm_counter(victim->mm, MM_FILEPAGES)),
-	       K(get_mm_counter(victim->mm, MM_SHMEMPAGES)),
-	       from_kuid(&init_user_ns, task_uid(victim)));
-	task_unlock(victim);
+	       message, task_pid_nr(p), p->comm, K(mm->total_vm),
+	       K(get_mm_counter(mm, MM_ANONPAGES)),
+	       K(get_mm_counter(mm, MM_FILEPAGES)),
+	       K(get_mm_counter(mm, MM_SHMEMPAGES)),
+	       from_kuid(&init_user_ns, task_uid(p)));
+	task_unlock(p);
 
 	/*
 	 * Kill all user processes sharing victim->mm in other thread groups, if
@@ -929,7 +926,6 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
 		wake_oom_reaper(victim);
 
 	mmdrop(mm);
-	put_task_struct(victim);
 }
 #undef K
 
@@ -940,10 +936,8 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
 static int oom_kill_memcg_member(struct task_struct *task, void *message)
 {
 	if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
-	    !is_global_init(task)) {
-		get_task_struct(task);
+	    !is_global_init(task))
 		__oom_kill_process(task, message);
-	}
 	return 0;
 }
 
@@ -964,7 +958,6 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 		mark_oom_victim(victim);
 		wake_oom_reaper(victim);
 		task_unlock(victim);
-		put_task_struct(victim);
 		return;
 	}
 	task_unlock(victim);
@@ -1073,6 +1066,8 @@ bool out_of_memory(struct oom_control *oc)
 	if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS))
 		return true;
 
+	link_oom_candidates(oc);
+
 	/*
 	 * Check if there were limitations on the allocation (only relevant for
 	 * NUMA and memcg) that may require different handling.
@@ -1086,10 +1081,9 @@ bool out_of_memory(struct oom_control *oc)
 	    current->mm && !oom_unkillable_task(current) &&
 	    oom_cpuset_eligible(current, oc) &&
 	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
-		get_task_struct(current);
 		oc->chosen = current;
 		oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
-		return true;
+		goto done;
 	}
 
 	select_bad_process(oc);
@@ -1108,6 +1102,8 @@ bool out_of_memory(struct oom_control *oc)
 	if (oc->chosen && oc->chosen != (void *)-1UL)
 		oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
 				 "Memory cgroup out of memory");
+done:
+	unlink_oom_candidates();
 	return !!oc->chosen;
 }