diff mbox series

[v2,2/2] mm, oom: fix oom_unkillable_task for memcg OOMs

Message ID 20190617231207.160865-2-shakeelb@google.com (mailing list archive)
State New, archived
Headers show
Series [v2,1/2] mm, oom: refactor dump_tasks for memcg OOMs | expand

Commit Message

Shakeel Butt June 17, 2019, 11:12 p.m. UTC
Currently oom_unkillable_task() checks mems_allowed even for memcg OOMs
which does not make sense as memcg OOMs can not be triggered due to
numa constraints. Fixing that.

This commit also removed the bogus usage of oom_unkillable_task() from
oom_badness(). Currently reading /proc/[pid]/oom_score will do a bogus
cpuset_mems_allowed_intersects() check. Removing that.

Signed-off-by: Shakeel Butt <shakeelb@google.com>
---
Changelog since v1:
- Divide the patch into two patches.

 fs/proc/base.c      |  3 +--
 include/linux/oom.h |  1 -
 mm/oom_kill.c       | 28 +++++++++++++++-------------
 3 files changed, 16 insertions(+), 16 deletions(-)

Comments

Michal Hocko June 18, 2019, 12:34 p.m. UTC | #1
On Mon 17-06-19 16:12:07, Shakeel Butt wrote:
> Currently oom_unkillable_task() checks mems_allowed even for memcg OOMs
> which does not make sense as memcg OOMs can not be triggered due to
> numa constraints. Fixing that.

"Fixing that" is a poor description of the fix. Also it is quite useful
to note that it is not only bogus to check mems_allowed. It is also
harmful as per the syzbot test IIRC. Pasting the report here would
be helpful as well.

> This commit also removed the bogus usage of oom_unkillable_task() from
> oom_badness(). Currently reading /proc/[pid]/oom_score will do a bogus
> cpuset_mems_allowed_intersects() check. Removing that.

Again, there shouldn't be any real reason to squash the two things into
a single patch. This is a subtle bug/behavior on its own because the
result of oom_badness depends on the calling process context. This
should be called out in the changelog explicitly.

> Signed-off-by: Shakeel Butt <shakeelb@google.com>

Other than that it looks good to me. I will ack after the split out and
the changelog improvements.

Thanks!

> ---
> Changelog since v1:
> - Divide the patch into two patches.
> 
>  fs/proc/base.c      |  3 +--
>  include/linux/oom.h |  1 -
>  mm/oom_kill.c       | 28 +++++++++++++++-------------
>  3 files changed, 16 insertions(+), 16 deletions(-)
> 
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index b8d5d100ed4a..57b7a0d75ef5 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -532,8 +532,7 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
>  	unsigned long totalpages = totalram_pages() + total_swap_pages;
>  	unsigned long points = 0;
>  
> -	points = oom_badness(task, NULL, NULL, totalpages) *
> -					1000 / totalpages;
> +	points = oom_badness(task, totalpages) * 1000 / totalpages;
>  	seq_printf(m, "%lu\n", points);
>  
>  	return 0;
> diff --git a/include/linux/oom.h b/include/linux/oom.h
> index d07992009265..c696c265f019 100644
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -108,7 +108,6 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm)
>  bool __oom_reap_task_mm(struct mm_struct *mm);
>  
>  extern unsigned long oom_badness(struct task_struct *p,
> -		struct mem_cgroup *memcg, const nodemask_t *nodemask,
>  		unsigned long totalpages);
>  
>  extern bool out_of_memory(struct oom_control *oc);
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index bd80997e0969..d779d9da1069 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -152,20 +152,23 @@ static inline bool is_memcg_oom(struct oom_control *oc)
>  }
>  
>  /* return true if the task is not adequate as candidate victim task. */
> -static bool oom_unkillable_task(struct task_struct *p,
> -		struct mem_cgroup *memcg, const nodemask_t *nodemask)
> +static bool oom_unkillable_task(struct task_struct *p, struct oom_control *oc)
>  {
>  	if (is_global_init(p))
>  		return true;
>  	if (p->flags & PF_KTHREAD)
>  		return true;
>  
> -	/* When mem_cgroup_out_of_memory() and p is not member of the group */
> -	if (memcg && !task_in_mem_cgroup(p, memcg))
> -		return true;
> +	/*
> +	 * For memcg OOM, we reach here through mem_cgroup_scan_tasks(), no
> +	 * need to check p's memcg membership and the checks after this
> +	 * are irrelevant for memcg OOMs.
> +	 */
> +	if (is_memcg_oom(oc))
> +		return false;
>  
>  	/* p may not have freeable memory in nodemask */
> -	if (!has_intersects_mems_allowed(p, nodemask))
> +	if (!has_intersects_mems_allowed(p, oc->nodemask))
>  		return true;
>  
>  	return false;
> @@ -201,13 +204,12 @@ static bool is_dump_unreclaim_slabs(void)
>   * predictable as possible.  The goal is to return the highest value for the
>   * task consuming the most memory to avoid subsequent oom failures.
>   */
> -unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
> -			  const nodemask_t *nodemask, unsigned long totalpages)
> +unsigned long oom_badness(struct task_struct *p, unsigned long totalpages)
>  {
>  	long points;
>  	long adj;
>  
> -	if (oom_unkillable_task(p, memcg, nodemask))
> +	if (is_global_init(p) || p->flags & PF_KTHREAD)
>  		return 0;
>  
>  	p = find_lock_task_mm(p);
> @@ -318,7 +320,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
>  	struct oom_control *oc = arg;
>  	unsigned long points;
>  
> -	if (oom_unkillable_task(task, NULL, oc->nodemask))
> +	if (oom_unkillable_task(task, oc))
>  		goto next;
>  
>  	/*
> @@ -342,7 +344,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
>  		goto select;
>  	}
>  
> -	points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
> +	points = oom_badness(task, oc->totalpages);
>  	if (!points || points < oc->chosen_points)
>  		goto next;
>  
> @@ -390,7 +392,7 @@ static int dump_task(struct task_struct *p, void *arg)
>  	struct oom_control *oc = arg;
>  	struct task_struct *task;
>  
> -	if (oom_unkillable_task(p, NULL, oc->nodemask))
> +	if (oom_unkillable_task(p, oc))
>  		return 0;
>  
>  	task = find_lock_task_mm(p);
> @@ -1090,7 +1092,7 @@ bool out_of_memory(struct oom_control *oc)
>  	check_panic_on_oom(oc, constraint);
>  
>  	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
> -	    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
> +	    current->mm && !oom_unkillable_task(current, oc) &&
>  	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
>  		get_task_struct(current);
>  		oc->chosen = current;
> -- 
> 2.22.0.410.gd8fdbe21b5-goog
>
diff mbox series

Patch

diff --git a/fs/proc/base.c b/fs/proc/base.c
index b8d5d100ed4a..57b7a0d75ef5 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -532,8 +532,7 @@  static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
 	unsigned long totalpages = totalram_pages() + total_swap_pages;
 	unsigned long points = 0;
 
-	points = oom_badness(task, NULL, NULL, totalpages) *
-					1000 / totalpages;
+	points = oom_badness(task, totalpages) * 1000 / totalpages;
 	seq_printf(m, "%lu\n", points);
 
 	return 0;
diff --git a/include/linux/oom.h b/include/linux/oom.h
index d07992009265..c696c265f019 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -108,7 +108,6 @@  static inline vm_fault_t check_stable_address_space(struct mm_struct *mm)
 bool __oom_reap_task_mm(struct mm_struct *mm);
 
 extern unsigned long oom_badness(struct task_struct *p,
-		struct mem_cgroup *memcg, const nodemask_t *nodemask,
 		unsigned long totalpages);
 
 extern bool out_of_memory(struct oom_control *oc);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index bd80997e0969..d779d9da1069 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -152,20 +152,23 @@  static inline bool is_memcg_oom(struct oom_control *oc)
 }
 
 /* return true if the task is not adequate as candidate victim task. */
-static bool oom_unkillable_task(struct task_struct *p,
-		struct mem_cgroup *memcg, const nodemask_t *nodemask)
+static bool oom_unkillable_task(struct task_struct *p, struct oom_control *oc)
 {
 	if (is_global_init(p))
 		return true;
 	if (p->flags & PF_KTHREAD)
 		return true;
 
-	/* When mem_cgroup_out_of_memory() and p is not member of the group */
-	if (memcg && !task_in_mem_cgroup(p, memcg))
-		return true;
+	/*
+	 * For memcg OOM, we reach here through mem_cgroup_scan_tasks(), no
+	 * need to check p's memcg membership and the checks after this
+	 * are irrelevant for memcg OOMs.
+	 */
+	if (is_memcg_oom(oc))
+		return false;
 
 	/* p may not have freeable memory in nodemask */
-	if (!has_intersects_mems_allowed(p, nodemask))
+	if (!has_intersects_mems_allowed(p, oc->nodemask))
 		return true;
 
 	return false;
@@ -201,13 +204,12 @@  static bool is_dump_unreclaim_slabs(void)
  * predictable as possible.  The goal is to return the highest value for the
  * task consuming the most memory to avoid subsequent oom failures.
  */
-unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
-			  const nodemask_t *nodemask, unsigned long totalpages)
+unsigned long oom_badness(struct task_struct *p, unsigned long totalpages)
 {
 	long points;
 	long adj;
 
-	if (oom_unkillable_task(p, memcg, nodemask))
+	if (is_global_init(p) || p->flags & PF_KTHREAD)
 		return 0;
 
 	p = find_lock_task_mm(p);
@@ -318,7 +320,7 @@  static int oom_evaluate_task(struct task_struct *task, void *arg)
 	struct oom_control *oc = arg;
 	unsigned long points;
 
-	if (oom_unkillable_task(task, NULL, oc->nodemask))
+	if (oom_unkillable_task(task, oc))
 		goto next;
 
 	/*
@@ -342,7 +344,7 @@  static int oom_evaluate_task(struct task_struct *task, void *arg)
 		goto select;
 	}
 
-	points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
+	points = oom_badness(task, oc->totalpages);
 	if (!points || points < oc->chosen_points)
 		goto next;
 
@@ -390,7 +392,7 @@  static int dump_task(struct task_struct *p, void *arg)
 	struct oom_control *oc = arg;
 	struct task_struct *task;
 
-	if (oom_unkillable_task(p, NULL, oc->nodemask))
+	if (oom_unkillable_task(p, oc))
 		return 0;
 
 	task = find_lock_task_mm(p);
@@ -1090,7 +1092,7 @@  bool out_of_memory(struct oom_control *oc)
 	check_panic_on_oom(oc, constraint);
 
 	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
-	    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
+	    current->mm && !oom_unkillable_task(current, oc) &&
 	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
 		get_task_struct(current);
 		oc->chosen = current;