[4/4] mm, oom: Fix unnecessary killing of additional processes.
diff mbox series

Message ID 1533389386-3501-4-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp
State New
Headers show
Series
  • [1/4] mm, oom: Remove wake_oom_reaper().
Related show

Commit Message

Tetsuo Handa Aug. 4, 2018, 1:29 p.m. UTC
David Rientjes is complaining about current behavior that the OOM killer
selects next OOM victim as soon as MMF_OOM_SKIP is set even if
__oom_reap_task_mm() returned without any progress.

To address this problem, this patch adds a timeout with whether the OOM
score of an OOM victim's memory is decreasing over time as a feedback,
after MMF_OOM_SKIP is set by the OOM reaper or exit_mmap().

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Roman Gushchin <guro@fb.com>
---
 include/linux/sched.h |  3 ++
 mm/oom_kill.c         | 81 ++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 63 insertions(+), 21 deletions(-)

Comments

Michal Hocko Aug. 6, 2018, 1:45 p.m. UTC | #1
On Sat 04-08-18 22:29:46, Tetsuo Handa wrote:
> David Rientjes is complaining about current behavior that the OOM killer
> selects next OOM victim as soon as MMF_OOM_SKIP is set even if
> __oom_reap_task_mm() returned without any progress.
> 
> To address this problem, this patch adds a timeout with whether the OOM
> score of an OOM victim's memory is decreasing over time as a feedback,
> after MMF_OOM_SKIP is set by the OOM reaper or exit_mmap().

I still hate any feedback mechanism based on time. We have seen that
these paths are completely non-deterministic time wise that building
any heuristic on top of it just sounds wrong.

Yes we have problems that the oom reaper doesn't handle all types of
memory yet. We should cover most of reasonably large memory types by
now. There is still mlock to take care of and that would be much
preferable to work on ragardless the retry mechanism becuase this work
will simply not handle that case either.

So I do not really see this would be an improvement. I still stand by my
argument that any retry mechanism should be based on the direct feedback
from the oom reaper rather than some magic "this took that long without
any progress".

> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> Cc: Michal Hocko <mhocko@kernel.org>
> Cc: David Rientjes <rientjes@google.com>
> Cc: Roman Gushchin <guro@fb.com>
> ---
>  include/linux/sched.h |  3 ++
>  mm/oom_kill.c         | 81 ++++++++++++++++++++++++++++++++++++++-------------
>  2 files changed, 63 insertions(+), 21 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 589fe78..70c7dfd 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1174,6 +1174,9 @@ struct task_struct {
>  #endif
>  	int				pagefault_disabled;
>  	struct list_head		oom_victim_list;
> +	unsigned long			last_oom_compared;
> +	unsigned long			last_oom_score;
> +	unsigned char			oom_reap_stall_count;
>  #ifdef CONFIG_VMAP_STACK
>  	struct vm_struct		*stack_vm_area;
>  #endif
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index 783f04d..7cad886 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -49,6 +49,12 @@
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/oom.h>
>  
> +static inline unsigned long oom_victim_mm_score(struct mm_struct *mm)
> +{
> +	return get_mm_rss(mm) + get_mm_counter(mm, MM_SWAPENTS) +
> +		mm_pgtables_bytes(mm) / PAGE_SIZE;
> +}
> +
>  int sysctl_panic_on_oom;
>  int sysctl_oom_kill_allocating_task;
>  int sysctl_oom_dump_tasks = 1;
> @@ -230,8 +236,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
>  	 * The baseline for the badness score is the proportion of RAM that each
>  	 * task's rss, pagetable and swap space use.
>  	 */
> -	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
> -		mm_pgtables_bytes(p->mm) / PAGE_SIZE;
> +	points = oom_victim_mm_score(p->mm);
>  	task_unlock(p);
>  
>  	/* Normalize to oom_score_adj units */
> @@ -571,15 +576,6 @@ static void oom_reap_task(struct task_struct *tsk)
>  	while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
>  		schedule_timeout_idle(HZ/10);
>  
> -	if (attempts <= MAX_OOM_REAP_RETRIES ||
> -	    test_bit(MMF_OOM_SKIP, &mm->flags))
> -		goto done;
> -
> -	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
> -		task_pid_nr(tsk), tsk->comm);
> -	debug_show_all_locks();
> -
> -done:
>  	/*
>  	 * Hide this mm from OOM killer because it has been either reaped or
>  	 * somebody can't call up_write(mmap_sem).
> @@ -631,6 +627,9 @@ static void mark_oom_victim(struct task_struct *tsk)
>  	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
>  		mmgrab(tsk->signal->oom_mm);
>  		set_bit(MMF_OOM_VICTIM, &mm->flags);
> +		tsk->last_oom_compared = jiffies;
> +		tsk->last_oom_score = oom_victim_mm_score(mm);
> +		tsk->oom_reap_stall_count = 0;
>  		get_task_struct(tsk);
>  		list_add(&tsk->oom_victim_list, &oom_victim_list);
>  	}
> @@ -867,7 +866,6 @@ static void __oom_kill_process(struct task_struct *victim)
>  	mmdrop(mm);
>  	put_task_struct(victim);
>  }
> -#undef K
>  
>  /*
>   * Kill provided task unless it's secured by setting
> @@ -999,33 +997,74 @@ int unregister_oom_notifier(struct notifier_block *nb)
>  }
>  EXPORT_SYMBOL_GPL(unregister_oom_notifier);
>  
> +static bool victim_mm_stalling(struct task_struct *p, struct mm_struct *mm)
> +{
> +	unsigned long score;
> +
> +	if (time_before(jiffies, p->last_oom_compared + HZ / 10))
> +		return false;
> +	score = oom_victim_mm_score(mm);
> +	if (score < p->last_oom_score)
> +		p->oom_reap_stall_count = 0;
> +	else
> +		p->oom_reap_stall_count++;
> +	p->last_oom_score = oom_victim_mm_score(mm);
> +	p->last_oom_compared = jiffies;
> +	if (p->oom_reap_stall_count < 30)
> +		return false;
> +	pr_info("Gave up waiting for process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
> +		task_pid_nr(p), p->comm, K(mm->total_vm),
> +		K(get_mm_counter(mm, MM_ANONPAGES)),
> +		K(get_mm_counter(mm, MM_FILEPAGES)),
> +		K(get_mm_counter(mm, MM_SHMEMPAGES)));
> +	return true;
> +}
> +
>  static bool oom_has_pending_victims(struct oom_control *oc)
>  {
> -	struct task_struct *p;
> +	struct task_struct *p, *tmp;
> +	bool ret = false;
> +	bool gaveup = false;
>  
>  	if (is_sysrq_oom(oc))
>  		return false;
>  	/*
> -	 * Since oom_reap_task()/exit_mmap() will set MMF_OOM_SKIP, let's
> -	 * wait for pending victims until MMF_OOM_SKIP is set or __mmput()
> -	 * completes.
> +	 * Wait for pending victims until __mmput() completes or stalled
> +	 * too long.
>  	 */
> -	list_for_each_entry(p, &oom_victim_list, oom_victim_list) {
> +	list_for_each_entry_safe(p, tmp, &oom_victim_list, oom_victim_list) {
> +		struct mm_struct *mm = p->signal->oom_mm;
> +
>  		if (oom_unkillable_task(p, oc->memcg, oc->nodemask))
>  			continue;
> -		if (!test_bit(MMF_OOM_SKIP, &p->signal->oom_mm->flags)) {
> +		ret = true;
>  #ifdef CONFIG_MMU
> +		/*
> +		 * Since the OOM reaper exists, we can safely wait until
> +		 * MMF_OOM_SKIP is set.
> +		 */
> +		if (!test_bit(MMF_OOM_SKIP, &mm->flags)) {
>  			if (!oom_reap_target) {
>  				get_task_struct(p);
>  				oom_reap_target = p;
>  				trace_wake_reaper(p->pid);
>  				wake_up(&oom_reaper_wait);
>  			}
> -#endif
> -			return true;
> +			continue;
>  		}
> +#endif
> +		/* We can wait as long as OOM score is decreasing over time. */
> +		if (!victim_mm_stalling(p, mm))
> +			continue;
> +		gaveup = true;
> +		list_del(&p->oom_victim_list);
> +		/* Drop a reference taken by mark_oom_victim(). */
> +		put_task_struct(p);
>  	}
> -	return false;
> +	if (gaveup)
> +		debug_show_all_locks();
> +
> +	return ret;
>  }
>  
>  /**
> -- 
> 1.8.3.1
David Rientjes Aug. 6, 2018, 8:19 p.m. UTC | #2
On Mon, 6 Aug 2018, Michal Hocko wrote:

> On Sat 04-08-18 22:29:46, Tetsuo Handa wrote:
> > David Rientjes is complaining about current behavior that the OOM killer
> > selects next OOM victim as soon as MMF_OOM_SKIP is set even if
> > __oom_reap_task_mm() returned without any progress.
> > 
> > To address this problem, this patch adds a timeout with whether the OOM
> > score of an OOM victim's memory is decreasing over time as a feedback,
> > after MMF_OOM_SKIP is set by the OOM reaper or exit_mmap().
> 
> I still hate any feedback mechanism based on time. We have seen that
> these paths are completely non-deterministic time wise that building
> any heuristic on top of it just sounds wrong.
> 
> Yes we have problems that the oom reaper doesn't handle all types of
> memory yet. We should cover most of reasonably large memory types by
> now. There is still mlock to take care of and that would be much
> preferable to work on ragardless the retry mechanism becuase this work
> will simply not handle that case either.
> 
> So I do not really see this would be an improvement. I still stand by my
> argument that any retry mechanism should be based on the direct feedback
> from the oom reaper rather than some magic "this took that long without
> any progress".
> 

At the risk of continually repeating the same statement, the oom reaper 
cannot provide the direct feedback for all possible memory freeing.  
Waking up periodically and finding mm->mmap_sem contended is one problem, 
but the other problem that I've already shown is the unnecessary oom 
killing of additional processes while a thread has already reached 
exit_mmap().  The oom reaper cannot free page tables which is problematic 
for malloc implementations such as tcmalloc that do not release virtual 
memory.  For binaries with heaps that are very large, sometimes over 
100GB, this is a substantial amount of memory and we have seen unnecessary 
oom killing before and during free_pgtables() of the victim.  This is long 
after the oom reaper would operate on any mm.
Michal Hocko Aug. 6, 2018, 8:51 p.m. UTC | #3
On Mon 06-08-18 13:19:18, David Rientjes wrote:
> On Mon, 6 Aug 2018, Michal Hocko wrote:
> 
> > On Sat 04-08-18 22:29:46, Tetsuo Handa wrote:
> > > David Rientjes is complaining about current behavior that the OOM killer
> > > selects next OOM victim as soon as MMF_OOM_SKIP is set even if
> > > __oom_reap_task_mm() returned without any progress.
> > > 
> > > To address this problem, this patch adds a timeout with whether the OOM
> > > score of an OOM victim's memory is decreasing over time as a feedback,
> > > after MMF_OOM_SKIP is set by the OOM reaper or exit_mmap().
> > 
> > I still hate any feedback mechanism based on time. We have seen that
> > these paths are completely non-deterministic time wise that building
> > any heuristic on top of it just sounds wrong.
> > 
> > Yes we have problems that the oom reaper doesn't handle all types of
> > memory yet. We should cover most of reasonably large memory types by
> > now. There is still mlock to take care of and that would be much
> > preferable to work on ragardless the retry mechanism becuase this work
> > will simply not handle that case either.
> > 
> > So I do not really see this would be an improvement. I still stand by my
> > argument that any retry mechanism should be based on the direct feedback
> > from the oom reaper rather than some magic "this took that long without
> > any progress".
> > 
> 
> At the risk of continually repeating the same statement, the oom reaper 
> cannot provide the direct feedback for all possible memory freeing.  
> Waking up periodically and finding mm->mmap_sem contended is one problem, 
> but the other problem that I've already shown is the unnecessary oom 
> killing of additional processes while a thread has already reached 
> exit_mmap().  The oom reaper cannot free page tables which is problematic 
> for malloc implementations such as tcmalloc that do not release virtual 
> memory. 

But once we know that the exit path is past the point of blocking we can
have MMF_OOM_SKIP handover from the oom_reaper to the exit path. So the
oom_reaper doesn't hide the current victim too early and we can safely
wait for the exit path to reclaim the rest. So there is a feedback
channel. I would even do not mind to poll for that state few times -
similar to polling for the mmap_sem. But it would still be some feedback
rather than a certain amount of time has passed since the last check.

> For binaries with heaps that are very large, sometimes over 
> 100GB, this is a substantial amount of memory and we have seen unnecessary 
> oom killing before and during free_pgtables() of the victim.  This is long 
> after the oom reaper would operate on any mm.

Well, a specific example would be really helpful. I have to admit I
haven't seen many oom victim without any memory mapped to the address
space. I can construct pathological corner cases of course but well, is
this a reasonable usecase to base the implementation on? A malicious user
can usually find other ways how to hurt the system and that's why it
should be contained.
David Rientjes Aug. 9, 2018, 8:16 p.m. UTC | #4
On Mon, 6 Aug 2018, Michal Hocko wrote:

> > At the risk of continually repeating the same statement, the oom reaper 
> > cannot provide the direct feedback for all possible memory freeing.  
> > Waking up periodically and finding mm->mmap_sem contended is one problem, 
> > but the other problem that I've already shown is the unnecessary oom 
> > killing of additional processes while a thread has already reached 
> > exit_mmap().  The oom reaper cannot free page tables which is problematic 
> > for malloc implementations such as tcmalloc that do not release virtual 
> > memory. 
> 
> But once we know that the exit path is past the point of blocking we can
> have MMF_OOM_SKIP handover from the oom_reaper to the exit path. So the
> oom_reaper doesn't hide the current victim too early and we can safely
> wait for the exit path to reclaim the rest. So there is a feedback
> channel. I would even do not mind to poll for that state few times -
> similar to polling for the mmap_sem. But it would still be some feedback
> rather than a certain amount of time has passed since the last check.
> 

Yes, of course, it would be easy to rely on exit_mmap() to set 
MMF_OOM_SKIP itself and have the oom reaper drop the task from its list 
when we are assured of forward progress.  What polling are you proposing 
other than a timeout based mechanism to do this?

We could set a MMF_EXIT_MMAP in exit_mmap() to specify that it will 
complete free_pgtables() for that mm.  The problem is the same: when does 
the oom reaper decide to set MMF_OOM_SKIP because MMF_EXIT_MMAP has not 
been set in a timely manner?

If this is an argument that the oom reaper should loop checking for 
MMF_EXIT_MMAP and doing schedule_timeout(1) a set number of times rather 
than just setting the jiffies in the mm itself, that's just implementing 
the same thing and doing so in a way where the oom reaper stalls operating 
on a single mm rather than round-robin iterating over mm's in my patch.
Michal Hocko Aug. 10, 2018, 9:07 a.m. UTC | #5
On Thu 09-08-18 13:16:25, David Rientjes wrote:
> On Mon, 6 Aug 2018, Michal Hocko wrote:
> 
> > > At the risk of continually repeating the same statement, the oom reaper 
> > > cannot provide the direct feedback for all possible memory freeing.  
> > > Waking up periodically and finding mm->mmap_sem contended is one problem, 
> > > but the other problem that I've already shown is the unnecessary oom 
> > > killing of additional processes while a thread has already reached 
> > > exit_mmap().  The oom reaper cannot free page tables which is problematic 
> > > for malloc implementations such as tcmalloc that do not release virtual 
> > > memory. 
> > 
> > But once we know that the exit path is past the point of blocking we can
> > have MMF_OOM_SKIP handover from the oom_reaper to the exit path. So the
> > oom_reaper doesn't hide the current victim too early and we can safely
> > wait for the exit path to reclaim the rest. So there is a feedback
> > channel. I would even do not mind to poll for that state few times -
> > similar to polling for the mmap_sem. But it would still be some feedback
> > rather than a certain amount of time has passed since the last check.
> > 
> 
> Yes, of course, it would be easy to rely on exit_mmap() to set 
> MMF_OOM_SKIP itself and have the oom reaper drop the task from its list 
> when we are assured of forward progress.  What polling are you proposing 
> other than a timeout based mechanism to do this?

I was thinking about doing something like the following
- oom_reaper checks the amount of victim's memory after it is done with
  reaping (e.g. by calling oom_badness before and after). If it wasn't able to
  reclaim much then return false and keep retrying with the existing
  mechanism
- once a flag (e.g. MMF_OOM_MMAP) is set it bails out and won't set the
  MMF_OOM_SKIP flag.

> We could set a MMF_EXIT_MMAP in exit_mmap() to specify that it will 
> complete free_pgtables() for that mm.  The problem is the same: when does 
> the oom reaper decide to set MMF_OOM_SKIP because MMF_EXIT_MMAP has not 
> been set in a timely manner?

reuse the current retry policy which is the number of attempts rather
than any timeout.

> If this is an argument that the oom reaper should loop checking for 
> MMF_EXIT_MMAP and doing schedule_timeout(1) a set number of times rather 
> than just setting the jiffies in the mm itself, that's just implementing 
> the same thing and doing so in a way where the oom reaper stalls operating 
> on a single mm rather than round-robin iterating over mm's in my patch.

I've said earlier that I do not mind doing round robin in the oom repaer
but this is certainly more complex than what we do now and I haven't
seen any actual example where it would matter. OOM reaper is a safely
measure. Nothing should fall apart if it is slow. The primary work
should be happening from the exit path anyway.
Tetsuo Handa Aug. 10, 2018, 10:54 a.m. UTC | #6
On 2018/08/10 18:07, Michal Hocko wrote:
>> Yes, of course, it would be easy to rely on exit_mmap() to set 
>> MMF_OOM_SKIP itself and have the oom reaper drop the task from its list 
>> when we are assured of forward progress.  What polling are you proposing 
>> other than a timeout based mechanism to do this?
> 
> I was thinking about doing something like the following
> - oom_reaper checks the amount of victim's memory after it is done with
>   reaping (e.g. by calling oom_badness before and after). 

OK. We can apply

+static inline unsigned long oom_victim_mm_score(struct mm_struct *mm)
+{
+	return get_mm_rss(mm) + get_mm_counter(mm, MM_SWAPENTS) +
+		mm_pgtables_bytes(mm) / PAGE_SIZE;
+}

and

-	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
-		mm_pgtables_bytes(p->mm) / PAGE_SIZE;
+	points = oom_victim_mm_score(p->mm);

part, can't we?

>                                                           If it wasn't able to
>   reclaim much then return false and keep retrying with the existing
>   mechanism

How do you decide whether oom_reaper() was not able to reclaim much?

> - once a flag (e.g. MMF_OOM_MMAP) is set it bails out and won't set the
>   MMF_OOM_SKIP flag.

Unless oom_victim_mm_score() becomes close to 0, setting MMF_OOM_SKIP is
considered premature. oom_reaper() will have to keep retrying....

> 
>> We could set a MMF_EXIT_MMAP in exit_mmap() to specify that it will 
>> complete free_pgtables() for that mm.  The problem is the same: when does 
>> the oom reaper decide to set MMF_OOM_SKIP because MMF_EXIT_MMAP has not 
>> been set in a timely manner?
> 
> reuse the current retry policy which is the number of attempts rather
> than any timeout.

And this is really I can't understand. The number of attempts multiplied
by retry interval _is_ nothing but timeout.

We are already using timeout based decision, with some attempt to reclaim
memory if conditions are met.

> 
>> If this is an argument that the oom reaper should loop checking for 
>> MMF_EXIT_MMAP and doing schedule_timeout(1) a set number of times rather 
>> than just setting the jiffies in the mm itself, that's just implementing 
>> the same thing and doing so in a way where the oom reaper stalls operating 
>> on a single mm rather than round-robin iterating over mm's in my patch.
> 
> I've said earlier that I do not mind doing round robin in the oom repaer
> but this is certainly more complex than what we do now and I haven't
> seen any actual example where it would matter. OOM reaper is a safely
> measure. Nothing should fall apart if it is slow. 

The OOM reaper can fail if allocating threads have high priority. You seem to
assume that realtime threads won't trigger OOM path. But since !PF_WQ_WORKER
threads do only cond_resched() due to your "the cargo cult programming" refusal,
and like Andrew Morton commented

  cond_resched() is a no-op in the presence of realtime policy threads
  and using to attempt to yield to a different thread it in this fashion
  is broken.

at "mm: disable preemption before swapcache_free" thread, we can't guarantee
that allocating threads shall give the OOM reaper enough CPU resource for
making forward progress. And my direct OOM reaping proposal was also refused
by you. I really dislike counting OOM reaper as a safety measure.

>                                                   The primary work
> should be happening from the exit path anyway.
>
Michal Hocko Aug. 10, 2018, 11:16 a.m. UTC | #7
On Fri 10-08-18 19:54:39, Tetsuo Handa wrote:
> On 2018/08/10 18:07, Michal Hocko wrote:
> >> Yes, of course, it would be easy to rely on exit_mmap() to set 
> >> MMF_OOM_SKIP itself and have the oom reaper drop the task from its list 
> >> when we are assured of forward progress.  What polling are you proposing 
> >> other than a timeout based mechanism to do this?
> > 
> > I was thinking about doing something like the following
> > - oom_reaper checks the amount of victim's memory after it is done with
> >   reaping (e.g. by calling oom_badness before and after). 
> 
> OK. We can apply
> 
> +static inline unsigned long oom_victim_mm_score(struct mm_struct *mm)
> +{
> +	return get_mm_rss(mm) + get_mm_counter(mm, MM_SWAPENTS) +
> +		mm_pgtables_bytes(mm) / PAGE_SIZE;
> +}
> 
> and
> 
> -	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
> -		mm_pgtables_bytes(p->mm) / PAGE_SIZE;
> +	points = oom_victim_mm_score(p->mm);
> 
> part, can't we?
> 
> >                                                           If it wasn't able to
> >   reclaim much then return false and keep retrying with the existing
> >   mechanism
> 
> How do you decide whether oom_reaper() was not able to reclaim much?

Just a rule of thumb. If it freed at least few kBs then we should be good
to MMF_OOM_SKIP.
 
> > - once a flag (e.g. MMF_OOM_MMAP) is set it bails out and won't set the
> >   MMF_OOM_SKIP flag.
> 
> Unless oom_victim_mm_score() becomes close to 0, setting MMF_OOM_SKIP is
> considered premature. oom_reaper() will have to keep retrying....

there absolutely have to be a cap for retrying. Otherwise you have
lockup scenarios back when the memory is mostly consumed by page tables.

> >> We could set a MMF_EXIT_MMAP in exit_mmap() to specify that it will 
> >> complete free_pgtables() for that mm.  The problem is the same: when does 
> >> the oom reaper decide to set MMF_OOM_SKIP because MMF_EXIT_MMAP has not 
> >> been set in a timely manner?
> > 
> > reuse the current retry policy which is the number of attempts rather
> > than any timeout.
> 
> And this is really I can't understand. The number of attempts multiplied
> by retry interval _is_ nothing but timeout.

Yes it is a timeout but it is not the time that matters. It is that we
have tried sufficient times. Looks at it this way. You can retry 5 times
in 10s or just once. Depending on what is going on in the system. I
would really prefer the behavior to be deterministic.

> We are already using timeout based decision, with some attempt to reclaim
> memory if conditions are met.

Timeout based decision is when you, well, make a decision after a
certain time passes. And we do not do that.
 
> >> If this is an argument that the oom reaper should loop checking for 
> >> MMF_EXIT_MMAP and doing schedule_timeout(1) a set number of times rather 
> >> than just setting the jiffies in the mm itself, that's just implementing 
> >> the same thing and doing so in a way where the oom reaper stalls operating 
> >> on a single mm rather than round-robin iterating over mm's in my patch.
> > 
> > I've said earlier that I do not mind doing round robin in the oom repaer
> > but this is certainly more complex than what we do now and I haven't
> > seen any actual example where it would matter. OOM reaper is a safely
> > measure. Nothing should fall apart if it is slow. 
> 
> The OOM reaper can fail if allocating threads have high priority. You seem to
> assume that realtime threads won't trigger OOM path. But since !PF_WQ_WORKER
> threads do only cond_resched() due to your "the cargo cult programming" refusal,
> and like Andrew Morton commented
> 
>   cond_resched() is a no-op in the presence of realtime policy threads
>   and using to attempt to yield to a different thread it in this fashion
>   is broken.
> 
> at "mm: disable preemption before swapcache_free" thread, we can't guarantee
> that allocating threads shall give the OOM reaper enough CPU resource for
> making forward progress. And my direct OOM reaping proposal was also refused
> by you. I really dislike counting OOM reaper as a safety measure.

Well, yeah, you can screw up your system with real time priority tasks
all you want. I really fail to see why you are bringing that up now
though. Yet another offtopic?
Tetsuo Handa Aug. 11, 2018, 3:12 a.m. UTC | #8
On 2018/08/10 20:16, Michal Hocko wrote:
>> How do you decide whether oom_reaper() was not able to reclaim much?
> 
> Just a rule of thumb. If it freed at least few kBs then we should be good
> to MMF_OOM_SKIP.

I don't think so. We are talking about situations where MMF_OOM_SKIP is set
before memory enough to prevent the OOM killer from selecting next OOM victim
was reclaimed.

>> Unless oom_victim_mm_score() becomes close to 0, setting MMF_OOM_SKIP is
>> considered premature. oom_reaper() will have to keep retrying....
> 
> there absolutely have to be a cap for retrying. Otherwise you have
> lockup scenarios back when the memory is mostly consumed by page tables.

Right, we absolutely need a cap for retrying.

>>>> We could set a MMF_EXIT_MMAP in exit_mmap() to specify that it will 
>>>> complete free_pgtables() for that mm.  The problem is the same: when does 
>>>> the oom reaper decide to set MMF_OOM_SKIP because MMF_EXIT_MMAP has not 
>>>> been set in a timely manner?
>>>
>>> reuse the current retry policy which is the number of attempts rather
>>> than any timeout.
>>
>> And this is really I can't understand. The number of attempts multiplied
>> by retry interval _is_ nothing but timeout.
> 
> Yes it is a timeout but it is not the time that matters. It is that we
> have tried sufficient times. Looks at it this way. You can retry 5 times
> in 10s or just once. Depending on what is going on in the system. I
> would really prefer the behavior to be deterministic.

What is the difference between

// Reclaim attempt by the OOM reaper
	for_each_OOM_victim_mm_without_MMF_OOM_SKIP {
		for (attempts = 0; attempts < MAX_OOM_REAP_RETRIES &&
		     !test_bit(MMF_EXIT_MMAP, &mm->flags); attempts++) {
			oom_reap_task_mm(tsk, mm):
			schedule_timeout_idle(HZ/10);
		}
		// It is time to make final decision
		if (test_bit(MMF_EXIT_MMAP, &mm->flags))
			continue;
		pr_info("Gave up waiting for process %d (%s) ...\n", ...);
		set_bit(MMF_OOM_SKIP, &mm->flags); // Allow selecting next OOM victim.
	}

(I assume this is what you call "reuse the current retry policy") and

// Initialization at mark_oom_victim()
	mm->last_reap_attempted = jiffies;
	mm->reap_attempted = 0;

// Reclaim attempt by allocating thread
	// Try allocation while waiting before oom_reap_task_mm()
	page = get_page_from_freelist(...);
	if (page)
		return page;
	for_each_OOM_victim_mm_without_MMF_OOM_SKIP {
		// Check if it is time to try oom_reap_task_mm()
		if (!time_after(jiffies, mm->last_reap_attempted + HZ / 10))
			continue;
		oom_reap_task_mm(tsk, mm);
		mm->last_reap_attempted = jiffies;
		if (mm->reap_attempted++ <= MAX_OOM_REAP_RETRIES)
			continue;
		// It is time to make final decision
		if (test_bit(MMF_EXIT_MMAP, &mm->flags))
			continue;
		pr_info("Gave up waiting for process %d (%s) ...\n", ...);
		set_bit(MMF_OOM_SKIP, &mm->flags); // Allow selecting next OOM victim.
	}

(this is what I call "direct OOM reaping") ?

Apart from the former is "sequential processing" and "the OOM reaper pays the cost
for reclaiming" while the latter is "parallel (or round-robin) processing" and "the
allocating thread pays the cost for reclaiming", both are timeout based back off
with number of retry attempt with a cap.

>> We are already using timeout based decision, with some attempt to reclaim
>> memory if conditions are met.
> 
> Timeout based decision is when you, well, make a decision after a
> certain time passes. And we do not do that.

But we are talking about what we can do after oom_reap_task_mm() can no longer
make progress. Both the former and the latter will wait until a time controlled
by the number of attempts and retry interval elapses.

>>>> If this is an argument that the oom reaper should loop checking for 
>>>> MMF_EXIT_MMAP and doing schedule_timeout(1) a set number of times rather 
>>>> than just setting the jiffies in the mm itself, that's just implementing 
>>>> the same thing and doing so in a way where the oom reaper stalls operating 
>>>> on a single mm rather than round-robin iterating over mm's in my patch.
>>>
>>> I've said earlier that I do not mind doing round robin in the oom repaer
>>> but this is certainly more complex than what we do now and I haven't
>>> seen any actual example where it would matter. OOM reaper is a safely
>>> measure. Nothing should fall apart if it is slow. 
>>
>> The OOM reaper can fail if allocating threads have high priority. You seem to
>> assume that realtime threads won't trigger OOM path. But since !PF_WQ_WORKER
>> threads do only cond_resched() due to your "the cargo cult programming" refusal,
>> and like Andrew Morton commented
>>
>>   cond_resched() is a no-op in the presence of realtime policy threads
>>   and using to attempt to yield to a different thread it in this fashion
>>   is broken.
>>
>> at "mm: disable preemption before swapcache_free" thread, we can't guarantee
>> that allocating threads shall give the OOM reaper enough CPU resource for
>> making forward progress. And my direct OOM reaping proposal was also refused
>> by you. I really dislike counting OOM reaper as a safety measure.
> 
> Well, yeah, you can screw up your system with real time priority tasks
> all you want. I really fail to see why you are bringing that up now
> though. Yet another offtopic?
> 

Not offtopic at all. My direct OOM reaping proposal is exactly handling such
situation. And I already suggested how we could avoid forcing some allocating
thread to pay the full cost for reclaiming all reclaimable memory.
Michal Hocko Aug. 14, 2018, 11:33 a.m. UTC | #9
On Sat 11-08-18 12:12:52, Tetsuo Handa wrote:
> On 2018/08/10 20:16, Michal Hocko wrote:
> >> How do you decide whether oom_reaper() was not able to reclaim much?
> > 
> > Just a rule of thumb. If it freed at least few kBs then we should be good
> > to MMF_OOM_SKIP.
> 
> I don't think so. We are talking about situations where MMF_OOM_SKIP is set
> before memory enough to prevent the OOM killer from selecting next OOM victim
> was reclaimed.

There is nothing like enough memory to prevent a new victim selection.
Just think of streaming source of allocation without any end. There is
simply no way to tell that we have freed enough. We have to guess and
tune based on reasonable workloads.

[...]
> Apart from the former is "sequential processing" and "the OOM reaper pays the cost
> for reclaiming" while the latter is "parallel (or round-robin) processing" and "the
> allocating thread pays the cost for reclaiming", both are timeout based back off
> with number of retry attempt with a cap.

And it is exactly the who pays the price concern I've already tried to
explain that bothers me.

I really do not see how making the code more complex by ensuring that
allocators share a fair part of the direct oom repaing will make the
situation any easier. Really there are basically two issues we really
should be after. Improve the oom reaper to tear down wider range of
memory (namely mlock) and to improve the cooperation with the exit path
to handle free_pgtables more gracefully because it is true that some
processes might really consume a lot of memory in page tables without
mapping  a lot of anonymous memory. Neither of the two is addressed by
your proposal. So if you want to help then try to think about the two
issues.

> >> We are already using timeout based decision, with some attempt to reclaim
> >> memory if conditions are met.
> > 
> > Timeout based decision is when you, well, make a decision after a
> > certain time passes. And we do not do that.
> 
> But we are talking about what we can do after oom_reap_task_mm() can no longer
> make progress. Both the former and the latter will wait until a time controlled
> by the number of attempts and retry interval elapses.

Do not confuse a sleep with the number of attempts. The latter is a unit
of work done the former is a unit of time.
Tetsuo Handa Aug. 19, 2018, 2:23 p.m. UTC | #10
On 2018/08/14 20:33, Michal Hocko wrote:
> On Sat 11-08-18 12:12:52, Tetsuo Handa wrote:
>> On 2018/08/10 20:16, Michal Hocko wrote:
>>>> How do you decide whether oom_reaper() was not able to reclaim much?
>>>
>>> Just a rule of thumb. If it freed at least few kBs then we should be good
>>> to MMF_OOM_SKIP.
>>
>> I don't think so. We are talking about situations where MMF_OOM_SKIP is set
>> before memory enough to prevent the OOM killer from selecting next OOM victim
>> was reclaimed.
> 
> There is nothing like enough memory to prevent a new victim selection.
> Just think of streaming source of allocation without any end. There is
> simply no way to tell that we have freed enough. We have to guess and
> tune based on reasonable workloads.

I'm not talking about "allocation without any end" case.
We already inserted fatal_signal_pending(current) checks (except vmalloc()
where tsk_is_oom_victim(current) would be used instead).

What we are talking about is a situation where we could avoid selecting next
OOM victim if we waited for some more time after MMF_OOM_SKIP was set.

>> Apart from the former is "sequential processing" and "the OOM reaper pays the cost
>> for reclaiming" while the latter is "parallel (or round-robin) processing" and "the
>> allocating thread pays the cost for reclaiming", both are timeout based back off
>> with number of retry attempt with a cap.
> 
> And it is exactly the who pays the price concern I've already tried to
> explain that bothers me.

Are you aware that we can fall into situation where nobody can pay the price for
reclaiming memory?

> 
> I really do not see how making the code more complex by ensuring that
> allocators share a fair part of the direct oom repaing will make the
> situation any easier.

You are completely ignoring/misunderstanding the background of
commit 9bfe5ded054b8e28 ("mm, oom: remove sleep from under oom_lock").

That patch was applied in order to mitigate a lockup problem caused by the fact
that allocators can deprive the OOM reaper of all CPU resources for making progress
due to very very broken assumption at

        /*
         * Acquire the oom lock.  If that fails, somebody else is
         * making progress for us.
         */
        if (!mutex_trylock(&oom_lock)) {
                *did_some_progress = 1;
                schedule_timeout_uninterruptible(1);
                return NULL;
        }

on the allocator side.

Direct OOM reaping is a method for ensuring that allocators spend _some_ CPU
resources for making progress. I already showed how to prevent allocators from
trying to reclaim all (e.g. multiple TB) memory at once because you worried it.

>                       Really there are basically two issues we really
> should be after. Improve the oom reaper to tear down wider range of
> memory (namely mlock) and to improve the cooperation with the exit path
> to handle free_pgtables more gracefully because it is true that some
> processes might really consume a lot of memory in page tables without
> mapping  a lot of anonymous memory. Neither of the two is addressed by
> your proposal. So if you want to help then try to think about the two
> issues.

Your "improvement" is to tear down wider range of memory whereas
my "improvement" is to ensure that CPU resource is spent for reclaiming memory and
David's "improvement" is to mitigate unnecessary killing of additional processes.
Therefore, your "Neither of the two is addressed by your proposal." is pointless.
David Rientjes Aug. 19, 2018, 11:45 p.m. UTC | #11
> > > > At the risk of continually repeating the same statement, the oom reaper 
> > > > cannot provide the direct feedback for all possible memory freeing.  
> > > > Waking up periodically and finding mm->mmap_sem contended is one problem, 
> > > > but the other problem that I've already shown is the unnecessary oom 
> > > > killing of additional processes while a thread has already reached 
> > > > exit_mmap().  The oom reaper cannot free page tables which is problematic 
> > > > for malloc implementations such as tcmalloc that do not release virtual 
> > > > memory. 
> > > 
> > > But once we know that the exit path is past the point of blocking we can
> > > have MMF_OOM_SKIP handover from the oom_reaper to the exit path. So the
> > > oom_reaper doesn't hide the current victim too early and we can safely
> > > wait for the exit path to reclaim the rest. So there is a feedback
> > > channel. I would even do not mind to poll for that state few times -
> > > similar to polling for the mmap_sem. But it would still be some feedback
> > > rather than a certain amount of time has passed since the last check.
> > > 
> > 
> > Yes, of course, it would be easy to rely on exit_mmap() to set 
> > MMF_OOM_SKIP itself and have the oom reaper drop the task from its list 
> > when we are assured of forward progress.  What polling are you proposing 
> > other than a timeout based mechanism to do this?
> 
> I was thinking about doing something like the following
> - oom_reaper checks the amount of victim's memory after it is done with
>   reaping (e.g. by calling oom_badness before and after). If it wasn't able to
>   reclaim much then return false and keep retrying with the existing
>   mechanism

I'm not sure how you define the threshold to consider what is substantial 
memory freeing.

> - once a flag (e.g. MMF_OOM_MMAP) is set it bails out and won't set the
>   MMF_OOM_SKIP flag.
> 
> > We could set a MMF_EXIT_MMAP in exit_mmap() to specify that it will 
> > complete free_pgtables() for that mm.  The problem is the same: when does 
> > the oom reaper decide to set MMF_OOM_SKIP because MMF_EXIT_MMAP has not 
> > been set in a timely manner?
> 
> reuse the current retry policy which is the number of attempts rather
> than any timeout.
> 
> > If this is an argument that the oom reaper should loop checking for 
> > MMF_EXIT_MMAP and doing schedule_timeout(1) a set number of times rather 
> > than just setting the jiffies in the mm itself, that's just implementing 
> > the same thing and doing so in a way where the oom reaper stalls operating 
> > on a single mm rather than round-robin iterating over mm's in my patch.
> 
> I've said earlier that I do not mind doing round robin in the oom repaer
> but this is certainly more complex than what we do now and I haven't
> seen any actual example where it would matter. OOM reaper is a safely
> measure. Nothing should fall apart if it is slow. The primary work
> should be happening from the exit path anyway.

The oom reaper will always be unable to free some memory, such as page 
tables.  If it can't grab mm->mmap_sem in a reasonable amount of time, it 
also can give up early.  The munlock() case is another example.  We 
experience unnecessary oom killing during free_pgtables() where the 
single-threaded exit_mmap() is freeing an enormous amount of page tables 
(usually a malloc implementation such as tcmalloc that does not free 
virtual memory) and other processes are faulting faster than we can free.  
It's a combination of a multiprocessor system and a lot of virtual memory 
from the original victim.  This is the same case as being unable to 
munlock quickly enough in exit_mmap() to free the memory.

We must wait until free_pgtables() completes in exit_mmap() before killing 
additional processes in the large majority (99.96% of cases from my data) 
of instances where oom livelock does not occur.  In the remainder of 
situations, livelock has been prevented by what the oom reaper has been 
able to free.  We can, of course, not do free_pgtables() from the oom 
reaper.  So my approach was to allow for a reasonable amount of time for 
the victim to free a lot of memory before declaring that additional 
processes must be oom killed.  It would be functionally similar to having 
the oom reaper retry many, many more times than 10 and having a linked 
list of mm_structs to reap.  I don't care one way or another if it's a 
timeout based solution or many, many retries that have schedule_timeout() 
that yields the same time period in the end.
Michal Hocko Aug. 20, 2018, 5:54 a.m. UTC | #12
On Sun 19-08-18 23:23:41, Tetsuo Handa wrote:
> On 2018/08/14 20:33, Michal Hocko wrote:
> > On Sat 11-08-18 12:12:52, Tetsuo Handa wrote:
> >> On 2018/08/10 20:16, Michal Hocko wrote:
> >>>> How do you decide whether oom_reaper() was not able to reclaim much?
> >>>
> >>> Just a rule of thumb. If it freed at least few kBs then we should be good
> >>> to MMF_OOM_SKIP.
> >>
> >> I don't think so. We are talking about situations where MMF_OOM_SKIP is set
> >> before memory enough to prevent the OOM killer from selecting next OOM victim
> >> was reclaimed.
> > 
> > There is nothing like enough memory to prevent a new victim selection.
> > Just think of streaming source of allocation without any end. There is
> > simply no way to tell that we have freed enough. We have to guess and
> > tune based on reasonable workloads.
> 
> I'm not talking about "allocation without any end" case.
> We already inserted fatal_signal_pending(current) checks (except vmalloc()
> where tsk_is_oom_victim(current) would be used instead).
> 
> What we are talking about is a situation where we could avoid selecting next
> OOM victim if we waited for some more time after MMF_OOM_SKIP was set.

And that some more time is undefined without a crystal ball. And we have
desperately shortage of those.
 
> >> Apart from the former is "sequential processing" and "the OOM reaper pays the cost
> >> for reclaiming" while the latter is "parallel (or round-robin) processing" and "the
> >> allocating thread pays the cost for reclaiming", both are timeout based back off
> >> with number of retry attempt with a cap.
> > 
> > And it is exactly the who pays the price concern I've already tried to
> > explain that bothers me.
> 
> Are you aware that we can fall into situation where nobody can pay the price for
> reclaiming memory?

I fail to see how this is related to direct vs. kthread oom reaping
though. Unless the kthread is starved by other means then it can always
jump in and handle the situation.

> > I really do not see how making the code more complex by ensuring that
> > allocators share a fair part of the direct oom repaing will make the
> > situation any easier.
> 
> You are completely ignoring/misunderstanding the background of
> commit 9bfe5ded054b8e28 ("mm, oom: remove sleep from under oom_lock").
> 
> That patch was applied in order to mitigate a lockup problem caused by the fact
> that allocators can deprive the OOM reaper of all CPU resources for making progress
> due to very very broken assumption at
> 
>         /*
>          * Acquire the oom lock.  If that fails, somebody else is
>          * making progress for us.
>          */
>         if (!mutex_trylock(&oom_lock)) {
>                 *did_some_progress = 1;
>                 schedule_timeout_uninterruptible(1);
>                 return NULL;
>         }
> 
> on the allocator side.
> 
> Direct OOM reaping is a method for ensuring that allocators spend _some_ CPU
> resources for making progress. I already showed how to prevent allocators from
> trying to reclaim all (e.g. multiple TB) memory at once because you worried it.
> 
> >                       Really there are basically two issues we really
> > should be after. Improve the oom reaper to tear down wider range of
> > memory (namely mlock) and to improve the cooperation with the exit path
> > to handle free_pgtables more gracefully because it is true that some
> > processes might really consume a lot of memory in page tables without
> > mapping  a lot of anonymous memory. Neither of the two is addressed by
> > your proposal. So if you want to help then try to think about the two
> > issues.
> 
> Your "improvement" is to tear down wider range of memory whereas
> my "improvement" is to ensure that CPU resource is spent for reclaiming memory and
> David's "improvement" is to mitigate unnecessary killing of additional processes.
> Therefore, your "Neither of the two is addressed by your proposal." is pointless.

OK, then we really have to agree to disagree.
Michal Hocko Aug. 20, 2018, 6:07 a.m. UTC | #13
On Sun 19-08-18 16:45:36, David Rientjes wrote:
> 
> > > > > At the risk of continually repeating the same statement, the oom reaper 
> > > > > cannot provide the direct feedback for all possible memory freeing.  
> > > > > Waking up periodically and finding mm->mmap_sem contended is one problem, 
> > > > > but the other problem that I've already shown is the unnecessary oom 
> > > > > killing of additional processes while a thread has already reached 
> > > > > exit_mmap().  The oom reaper cannot free page tables which is problematic 
> > > > > for malloc implementations such as tcmalloc that do not release virtual 
> > > > > memory. 
> > > > 
> > > > But once we know that the exit path is past the point of blocking we can
> > > > have MMF_OOM_SKIP handover from the oom_reaper to the exit path. So the
> > > > oom_reaper doesn't hide the current victim too early and we can safely
> > > > wait for the exit path to reclaim the rest. So there is a feedback
> > > > channel. I would even do not mind to poll for that state few times -
> > > > similar to polling for the mmap_sem. But it would still be some feedback
> > > > rather than a certain amount of time has passed since the last check.
> > > > 
> > > 
> > > Yes, of course, it would be easy to rely on exit_mmap() to set 
> > > MMF_OOM_SKIP itself and have the oom reaper drop the task from its list 
> > > when we are assured of forward progress.  What polling are you proposing 
> > > other than a timeout based mechanism to do this?
> > 
> > I was thinking about doing something like the following
> > - oom_reaper checks the amount of victim's memory after it is done with
> >   reaping (e.g. by calling oom_badness before and after). If it wasn't able to
> >   reclaim much then return false and keep retrying with the existing
> >   mechanism
> 
> I'm not sure how you define the threshold to consider what is substantial 
> memory freeing.

If a rule of thumb (few Megs freed or X% of oom_badness reduced) doesn't
really turn out to be working well then we can try to be more clever
e.g. detect for too many ptes to free and wait for those.

> > - once a flag (e.g. MMF_OOM_MMAP) is set it bails out and won't set the
> >   MMF_OOM_SKIP flag.
> > 
> > > We could set a MMF_EXIT_MMAP in exit_mmap() to specify that it will 
> > > complete free_pgtables() for that mm.  The problem is the same: when does 
> > > the oom reaper decide to set MMF_OOM_SKIP because MMF_EXIT_MMAP has not 
> > > been set in a timely manner?
> > 
> > reuse the current retry policy which is the number of attempts rather
> > than any timeout.
> > 
> > > If this is an argument that the oom reaper should loop checking for 
> > > MMF_EXIT_MMAP and doing schedule_timeout(1) a set number of times rather 
> > > than just setting the jiffies in the mm itself, that's just implementing 
> > > the same thing and doing so in a way where the oom reaper stalls operating 
> > > on a single mm rather than round-robin iterating over mm's in my patch.
> > 
> > I've said earlier that I do not mind doing round robin in the oom repaer
> > but this is certainly more complex than what we do now and I haven't
> > seen any actual example where it would matter. OOM reaper is a safely
> > measure. Nothing should fall apart if it is slow. The primary work
> > should be happening from the exit path anyway.
> 
> The oom reaper will always be unable to free some memory, such as page 
> tables.  If it can't grab mm->mmap_sem in a reasonable amount of time, it 
> also can give up early.  The munlock() case is another example.  We 
> experience unnecessary oom killing during free_pgtables() where the 
> single-threaded exit_mmap() is freeing an enormous amount of page tables 
> (usually a malloc implementation such as tcmalloc that does not free 
> virtual memory) and other processes are faulting faster than we can free.  
> It's a combination of a multiprocessor system and a lot of virtual memory 
> from the original victim.  This is the same case as being unable to 
> munlock quickly enough in exit_mmap() to free the memory.
> 
> We must wait until free_pgtables() completes in exit_mmap() before killing 
> additional processes in the large majority (99.96% of cases from my data) 
> of instances where oom livelock does not occur.  In the remainder of 
> situations, livelock has been prevented by what the oom reaper has been 
> able to free.  We can, of course, not do free_pgtables() from the oom 
> reaper.  So my approach was to allow for a reasonable amount of time for 
> the victim to free a lot of memory before declaring that additional 
> processes must be oom killed.  It would be functionally similar to having 
> the oom reaper retry many, many more times than 10 and having a linked 
> list of mm_structs to reap.  I don't care one way or another if it's a 
> timeout based solution or many, many retries that have schedule_timeout() 
> that yields the same time period in the end.

I would really keep the current retry logic with an extension to allow
to keep retrying or hand over to exit_mmap when we know it is past the
last moment of blocking.
David Rientjes Aug. 20, 2018, 9:31 p.m. UTC | #14
On Mon, 20 Aug 2018, Michal Hocko wrote:

> > The oom reaper will always be unable to free some memory, such as page 
> > tables.  If it can't grab mm->mmap_sem in a reasonable amount of time, it 
> > also can give up early.  The munlock() case is another example.  We 
> > experience unnecessary oom killing during free_pgtables() where the 
> > single-threaded exit_mmap() is freeing an enormous amount of page tables 
> > (usually a malloc implementation such as tcmalloc that does not free 
> > virtual memory) and other processes are faulting faster than we can free.  
> > It's a combination of a multiprocessor system and a lot of virtual memory 
> > from the original victim.  This is the same case as being unable to 
> > munlock quickly enough in exit_mmap() to free the memory.
> > 
> > We must wait until free_pgtables() completes in exit_mmap() before killing 
> > additional processes in the large majority (99.96% of cases from my data) 
> > of instances where oom livelock does not occur.  In the remainder of 
> > situations, livelock has been prevented by what the oom reaper has been 
> > able to free.  We can, of course, not do free_pgtables() from the oom 
> > reaper.  So my approach was to allow for a reasonable amount of time for 
> > the victim to free a lot of memory before declaring that additional 
> > processes must be oom killed.  It would be functionally similar to having 
> > the oom reaper retry many, many more times than 10 and having a linked 
> > list of mm_structs to reap.  I don't care one way or another if it's a 
> > timeout based solution or many, many retries that have schedule_timeout() 
> > that yields the same time period in the end.
> 
> I would really keep the current retry logic with an extension to allow
> to keep retrying or hand over to exit_mmap when we know it is past the
> last moment of blocking.
> 

Ok, so it appears you're suggesting a per-mm counter of oom reaper retries 
and once it reaches a certain threshold, either give up and set 
MMF_OOM_SKIP or declare that exit_mmap() is responsible for it.  That's 
fine, but obviously I'll be suggesting that the threshold is rather large.  
So if I adjust my patch to be a retry counter rather than timestamp, do 
you have any other reservations?
Tetsuo Handa Aug. 20, 2018, 10:03 p.m. UTC | #15
On 2018/08/20 14:54, Michal Hocko wrote:
>>>> Apart from the former is "sequential processing" and "the OOM reaper pays the cost
>>>> for reclaiming" while the latter is "parallel (or round-robin) processing" and "the
>>>> allocating thread pays the cost for reclaiming", both are timeout based back off
>>>> with number of retry attempt with a cap.
>>>
>>> And it is exactly the who pays the price concern I've already tried to
>>> explain that bothers me.
>>
>> Are you aware that we can fall into situation where nobody can pay the price for
>> reclaiming memory?
> 
> I fail to see how this is related to direct vs. kthread oom reaping
> though. Unless the kthread is starved by other means then it can always
> jump in and handle the situation.

I'm saying that concurrent allocators can starve the OOM reaper kernel thread.
I don't care if the OOM reaper kernel thread is starved by something other than
concurrent allocators, as long as that something is doing useful things.

Allocators wait for progress using (almost) busy loop is prone to lockup; they are
not doing useful things. But direct OOM reaping allows allocators avoid lockup and
do useful things.
Michal Hocko Aug. 21, 2018, 6:09 a.m. UTC | #16
On Mon 20-08-18 14:31:04, David Rientjes wrote:
> On Mon, 20 Aug 2018, Michal Hocko wrote:
> 
> > > The oom reaper will always be unable to free some memory, such as page 
> > > tables.  If it can't grab mm->mmap_sem in a reasonable amount of time, it 
> > > also can give up early.  The munlock() case is another example.  We 
> > > experience unnecessary oom killing during free_pgtables() where the 
> > > single-threaded exit_mmap() is freeing an enormous amount of page tables 
> > > (usually a malloc implementation such as tcmalloc that does not free 
> > > virtual memory) and other processes are faulting faster than we can free.  
> > > It's a combination of a multiprocessor system and a lot of virtual memory 
> > > from the original victim.  This is the same case as being unable to 
> > > munlock quickly enough in exit_mmap() to free the memory.
> > > 
> > > We must wait until free_pgtables() completes in exit_mmap() before killing 
> > > additional processes in the large majority (99.96% of cases from my data) 
> > > of instances where oom livelock does not occur.  In the remainder of 
> > > situations, livelock has been prevented by what the oom reaper has been 
> > > able to free.  We can, of course, not do free_pgtables() from the oom 
> > > reaper.  So my approach was to allow for a reasonable amount of time for 
> > > the victim to free a lot of memory before declaring that additional 
> > > processes must be oom killed.  It would be functionally similar to having 
> > > the oom reaper retry many, many more times than 10 and having a linked 
> > > list of mm_structs to reap.  I don't care one way or another if it's a 
> > > timeout based solution or many, many retries that have schedule_timeout() 
> > > that yields the same time period in the end.
> > 
> > I would really keep the current retry logic with an extension to allow
> > to keep retrying or hand over to exit_mmap when we know it is past the
> > last moment of blocking.
> > 
> 
> Ok, so it appears you're suggesting a per-mm counter of oom reaper retries 
> and once it reaches a certain threshold, either give up and set 
> MMF_OOM_SKIP or declare that exit_mmap() is responsible for it.  That's 
> fine, but obviously I'll be suggesting that the threshold is rather large.  
> So if I adjust my patch to be a retry counter rather than timestamp, do 
> you have any other reservations?

It absolutely has to be an internal thing without any user API to be
set. Also I still haven't heard any specific argument why would oom
reaper need to do per-task attempt and loop over all victims on the
list. Maybe you have some examples though.

I believe that we really need to think about the hand over between the
two paths first and only build a more elaborate retry logic on top of
it.
Michal Hocko Aug. 21, 2018, 6:16 a.m. UTC | #17
On Tue 21-08-18 07:03:10, Tetsuo Handa wrote:
> On 2018/08/20 14:54, Michal Hocko wrote:
> >>>> Apart from the former is "sequential processing" and "the OOM reaper pays the cost
> >>>> for reclaiming" while the latter is "parallel (or round-robin) processing" and "the
> >>>> allocating thread pays the cost for reclaiming", both are timeout based back off
> >>>> with number of retry attempt with a cap.
> >>>
> >>> And it is exactly the who pays the price concern I've already tried to
> >>> explain that bothers me.
> >>
> >> Are you aware that we can fall into situation where nobody can pay the price for
> >> reclaiming memory?
> > 
> > I fail to see how this is related to direct vs. kthread oom reaping
> > though. Unless the kthread is starved by other means then it can always
> > jump in and handle the situation.
> 
> I'm saying that concurrent allocators can starve the OOM reaper kernel thread.
> I don't care if the OOM reaper kernel thread is starved by something other than
> concurrent allocators, as long as that something is doing useful things.
> 
> Allocators wait for progress using (almost) busy loop is prone to lockup; they are
> not doing useful things. But direct OOM reaping allows allocators avoid lockup and
> do useful things.

As long as those allocators are making _some_ progress and they are not
preempted themselves. Those might be low priority as well. To make it
more fun those high priority might easily preempt those which try to
make the direct reaping. And if you really want to achieve at least some
fairness there you will quickly grown into a complex scheme. Really our
direct reclaim is already quite fragile when it comes to fairness and
now you want to extend it to be even more fragile. Really, I think you
are not really appreciating what kind of complex beast you are going to
create.

If we have priority inversion problems during oom then we can always
return back to high priority oom reaper. This would be so much simpler.
Tetsuo Handa Aug. 21, 2018, 1:39 p.m. UTC | #18
On 2018/08/21 15:16, Michal Hocko wrote:
> On Tue 21-08-18 07:03:10, Tetsuo Handa wrote:
>> On 2018/08/20 14:54, Michal Hocko wrote:
>>>>>> Apart from the former is "sequential processing" and "the OOM reaper pays the cost
>>>>>> for reclaiming" while the latter is "parallel (or round-robin) processing" and "the
>>>>>> allocating thread pays the cost for reclaiming", both are timeout based back off
>>>>>> with number of retry attempt with a cap.
>>>>>
>>>>> And it is exactly the who pays the price concern I've already tried to
>>>>> explain that bothers me.
>>>>
>>>> Are you aware that we can fall into situation where nobody can pay the price for
>>>> reclaiming memory?
>>>
>>> I fail to see how this is related to direct vs. kthread oom reaping
>>> though. Unless the kthread is starved by other means then it can always
>>> jump in and handle the situation.
>>
>> I'm saying that concurrent allocators can starve the OOM reaper kernel thread.
>> I don't care if the OOM reaper kernel thread is starved by something other than
>> concurrent allocators, as long as that something is doing useful things.
>>
>> Allocators wait for progress using (almost) busy loop is prone to lockup; they are
>> not doing useful things. But direct OOM reaping allows allocators avoid lockup and
>> do useful things.
> 
> As long as those allocators are making _some_ progress and they are not
> preempted themselves.

Even on linux-next-20180820 where neither the OOM reaper nor exit_mmap() waits for
oom_lock, a cluster of concurrently allocating realtime threads can make the OOM
victim get MMF_OOM_SKIP in 1800 milliseconds

[  122.291910] Out of memory: Kill process 1097 (a.out) score 868 or sacrifice child
[  122.296431] Killed process 1117 (a.out) total-vm:5244kB, anon-rss:84kB, file-rss:0kB, shmem-rss:0kB
[  125.186061] oom_reaper: reaped process 1117 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[  125.191487] crond invoked oom-killer: gfp_mask=0x6200ca(GFP_HIGHUSER_MOVABLE), nodemask=(null), order=0, oom_score_adj=0

[  131.405754] Out of memory: Kill process 1097 (a.out) score 868 or sacrifice child
[  131.409970] Killed process 1121 (a.out) total-vm:5244kB, anon-rss:84kB, file-rss:0kB, shmem-rss:0kB
[  132.185982] oom_reaper: reaped process 1121 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[  132.234704] a.out invoked oom-killer: gfp_mask=0x6200ca(GFP_HIGHUSER_MOVABLE), nodemask=(null), order=0, oom_score_adj=0

[  141.396004] Out of memory: Kill process 1097 (a.out) score 868 or sacrifice child
[  141.400194] Killed process 1128 (a.out) total-vm:5244kB, anon-rss:84kB, file-rss:0kB, shmem-rss:0kB
[  143.184631] oom_reaper: reaped process 1128 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[  143.193077] in:imjournal invoked oom-killer: gfp_mask=0x6200ca(GFP_HIGHUSER_MOVABLE), nodemask=(null), order=0, oom_score_adj=0

[  143.721617] Out of memory: Kill process 1097 (a.out) score 868 or sacrifice child
[  143.725965] Killed process 1858 (a.out) total-vm:5244kB, anon-rss:1040kB, file-rss:28kB, shmem-rss:0kB
[  145.218808] oom_reaper: reaped process 1858 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[  145.223753] systemd-journal invoked oom-killer: gfp_mask=0x6200ca(GFP_HIGHUSER_MOVABLE), nodemask=(null), order=0, oom_score_adj=0

whereas applying

@@ -440,17 +440,11 @@
 +		ret = true;
 +#ifdef CONFIG_MMU
 +		/*
-+		 * Since the OOM reaper exists, we can safely wait until
-+		 * MMF_OOM_SKIP is set.
++		 * We can safely try to reclaim until MMF_OOM_SKIP is set.
 +		 */
 +		if (!test_bit(MMF_OOM_SKIP, &mm->flags)) {
-+			if (!oom_reap_target) {
-+				get_task_struct(p);
-+				oom_reap_target = p;
-+				trace_wake_reaper(p->pid);
-+				wake_up(&oom_reaper_wait);
-+			}
-+			continue;
++			if (oom_reap_task_mm(p, mm))
++				set_bit(MMF_OOM_SKIP, &mm->flags);
 +		}
 +#endif
 +		/* We can wait as long as OOM score is decreasing over time. */

on top of this series can make the OOM victim get MMF_OOM_SKIP in 10 milliseconds.

[   43.407032] Out of memory: Kill process 1071 (a.out) score 865 or sacrifice child
[   43.411134] Killed process 1816 (a.out) total-vm:5244kB, anon-rss:1040kB, file-rss:0kB, shmem-rss:0kB
[   43.416427] oom_reaper: reaped process 1816 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[   44.689731] a.out invoked oom-killer: gfp_mask=0x6200ca(GFP_HIGHUSER_MOVABLE), nodemask=(null), order=0, oom_score_adj=0

[  159.572877] Out of memory: Kill process 2157 (a.out) score 891 or sacrifice child
[  159.576924] Killed process 2158 (first-victim) total-vm:5244kB, anon-rss:88kB, file-rss:0kB, shmem-rss:0kB
[  159.580933] oom_reaper: reaped process 2158 (first-victim), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[  160.602346] systemd-journal invoked oom-killer: gfp_mask=0x6200ca(GFP_HIGHUSER_MOVABLE), nodemask=(null), order=0, oom_score_adj=0

[  160.774149] Out of memory: Kill process 2157 (a.out) score 891 or sacrifice child
[  160.778139] Killed process 2159 (a.out) total-vm:5244kB, anon-rss:88kB, file-rss:0kB, shmem-rss:0kB
[  160.781779] oom_reaper: reaped process 2159 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[  162.745425] a.out invoked oom-killer: gfp_mask=0x6200ca(GFP_HIGHUSER_MOVABLE), nodemask=(null), order=0, oom_score_adj=0

[  162.916173] Out of memory: Kill process 2157 (a.out) score 891 or sacrifice child
[  162.920239] Killed process 2160 (a.out) total-vm:5244kB, anon-rss:88kB, file-rss:0kB, shmem-rss:0kB
[  162.924396] oom_reaper: reaped process 2160 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[  166.034599] Gave up waiting for process 2160 (a.out) total-vm:5244kB, anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[  166.038446] INFO: lockdep is turned off.
[  166.041780] a.out invoked oom-killer: gfp_mask=0x6200ca(GFP_HIGHUSER_MOVABLE), nodemask=(null), order=0, oom_score_adj=0

This number shows how much CPU resources we will waste for memory reclaim activities
without any progress. (I'm OK with doing both async reclaim by the OOM reaper kernel
thread and sync reclaim by allocating threads.)

>                       Those might be low priority as well.

It will not cause problems as long as we don't reclaim memory with oom_lock held.

>                                                            To make it
> more fun those high priority might easily preempt those which try to
> make the direct reaping.

Ditto.

>                          And if you really want to achieve at least some
> fairness there you will quickly grown into a complex scheme. Really our
> direct reclaim is already quite fragile when it comes to fairness

You can propose removing direct reclaim.
Then, we will get reliable __GFP_KILLABLE as a bonus.

>                                                                   and
> now you want to extend it to be even more fragile.

Direct OOM reaping is different from direct reclaim that direct OOM reaping
is never blocked on memory allocation. Thus, it will not make more fragile.

>                                                    Really, I think you
> are not really appreciating what kind of complex beast you are going to
> create.

Saying "dragon" or "beast" does not defeat me. Rather, such words drive me
to more and more direct OOM reaping (because you don't like waiting for
oom_lock at __alloc_pages_may_oom() which is a simple way for making sure
that CPU resource is spent for memory reclaim activities).

> 
> If we have priority inversion problems during oom then we can always
> return back to high priority oom reaper. This would be so much simpler.

We could utilize higher scheduling priority for memory reclaim activities
by the OOM reaper kernel thread until MMF_OOM_SKIP is set. But what we need
to think about is how we can wait for memory reclaim activities after
MMF_OOM_SKIP is set. A thread doing exit_mmap() might be idle scheduling
priority. Even if allocating threads found that exit_mmap() already reached
to the point of "no more being blocked on memory allocation", allocating
threads might keep exit_mmap() unable to make progress (for many minutes,
effectively forever) due to idle scheduling priority.

You want to preserve "the fairness destroyer" just because you fear creating
"a new monster". But the point of "no more being blocked on memory allocation"
cannot exist without making sure that CPU resources are spent for memory reclaim
activities. Without seriously considering how we can make sure that allocating
threads give enough CPU resources to memory reclaim activities (both "which the
OOM reaper can do" and "which will be done after the OOM reaper gave up"), your
"hand over" plan will fail. Allocating threads pay the cost for memory reclaim
activities is much simpler way.
David Rientjes Aug. 21, 2018, 5:20 p.m. UTC | #19
On Tue, 21 Aug 2018, Michal Hocko wrote:

> > Ok, so it appears you're suggesting a per-mm counter of oom reaper retries 
> > and once it reaches a certain threshold, either give up and set 
> > MMF_OOM_SKIP or declare that exit_mmap() is responsible for it.  That's 
> > fine, but obviously I'll be suggesting that the threshold is rather large.  
> > So if I adjust my patch to be a retry counter rather than timestamp, do 
> > you have any other reservations?
> 
> It absolutely has to be an internal thing without any user API to be
> set. Also I still haven't heard any specific argument why would oom
> reaper need to do per-task attempt and loop over all victims on the
> list. Maybe you have some examples though.
> 

It would be per-mm in this case, the task itself is no longer important 
other than printing to the kernel log.  I think we could simply print that 
the oom reaper has reaped mm->owner.

The oom reaper would need to loop over the per-mm list because the retry 
counter is going to have a high threshold so that processes have the 
ability to free their memory before the oom reaper declares it can no 
longer make forward progress.  We cannot stall trying to reap a single mm 
with a high retry threshold from a memcg hierarchy when another memcg 
hierarchy is also oom.  The ability for one victim to make forward 
progress can depend on a lock held by another oom memcg hierarchy where 
reaping would allow it to be dropped.
Michal Hocko Aug. 22, 2018, 8:03 a.m. UTC | #20
On Tue 21-08-18 10:20:00, David Rientjes wrote:
> On Tue, 21 Aug 2018, Michal Hocko wrote:
> 
> > > Ok, so it appears you're suggesting a per-mm counter of oom reaper retries 
> > > and once it reaches a certain threshold, either give up and set 
> > > MMF_OOM_SKIP or declare that exit_mmap() is responsible for it.  That's 
> > > fine, but obviously I'll be suggesting that the threshold is rather large.  
> > > So if I adjust my patch to be a retry counter rather than timestamp, do 
> > > you have any other reservations?
> > 
> > It absolutely has to be an internal thing without any user API to be
> > set. Also I still haven't heard any specific argument why would oom
> > reaper need to do per-task attempt and loop over all victims on the
> > list. Maybe you have some examples though.
> > 
> 
> It would be per-mm in this case, the task itself is no longer important 
> other than printing to the kernel log.  I think we could simply print that 
> the oom reaper has reaped mm->owner.
> 
> The oom reaper would need to loop over the per-mm list because the retry 
> counter is going to have a high threshold so that processes have the 
> ability to free their memory before the oom reaper declares it can no 
> longer make forward progress.

What do you actually mean by a high threshold?

> We cannot stall trying to reap a single mm 
> with a high retry threshold from a memcg hierarchy when another memcg 
> hierarchy is also oom.  The ability for one victim to make forward 
> progress can depend on a lock held by another oom memcg hierarchy where 
> reaping would allow it to be dropped.

Could you be more specific please?
David Rientjes Aug. 22, 2018, 8:54 p.m. UTC | #21
On Wed, 22 Aug 2018, Michal Hocko wrote:

> > > > Ok, so it appears you're suggesting a per-mm counter of oom reaper retries 
> > > > and once it reaches a certain threshold, either give up and set 
> > > > MMF_OOM_SKIP or declare that exit_mmap() is responsible for it.  That's 
> > > > fine, but obviously I'll be suggesting that the threshold is rather large.  
> > > > So if I adjust my patch to be a retry counter rather than timestamp, do 
> > > > you have any other reservations?
> > > 
> > > It absolutely has to be an internal thing without any user API to be
> > > set. Also I still haven't heard any specific argument why would oom
> > > reaper need to do per-task attempt and loop over all victims on the
> > > list. Maybe you have some examples though.
> > > 
> > 
> > It would be per-mm in this case, the task itself is no longer important 
> > other than printing to the kernel log.  I think we could simply print that 
> > the oom reaper has reaped mm->owner.
> > 
> > The oom reaper would need to loop over the per-mm list because the retry 
> > counter is going to have a high threshold so that processes have the 
> > ability to free their memory before the oom reaper declares it can no 
> > longer make forward progress.
> 
> What do you actually mean by a high threshold?
> 

As suggested in the timeout based approach of my patchset, 10s seems to 
work well for current server memory capacities, so if combined with a 
schedule_timeout(HZ/10) after iterating through mm_struct's to reap, the 
threshold would be best defined so it can allow at least 10s.

> > We cannot stall trying to reap a single mm 
> > with a high retry threshold from a memcg hierarchy when another memcg 
> > hierarchy is also oom.  The ability for one victim to make forward 
> > progress can depend on a lock held by another oom memcg hierarchy where 
> > reaping would allow it to be dropped.
> 
> Could you be more specific please?
> 

It's problematic to stall for 10s trying to oom reap (or free through 
exit_mmap()) a single mm while not trying to free memory from others: if 
you are reaping memory from a memcg subtree's victim and it takes a long 
time, either for a single try with a lot of memory or many tries with 
little or no memory, it increases the likelihood of livelocks in other 
memcg hierarchies because of the oom reaper is not attempting to reap its 
memory.  The victim may depend on a lock that a memory charger is holding 
but the oom reaper is not able to help yet.
Tetsuo Handa Sept. 1, 2018, 11:48 a.m. UTC | #22
On 2018/08/07 5:51, Michal Hocko wrote:
>> At the risk of continually repeating the same statement, the oom reaper 
>> cannot provide the direct feedback for all possible memory freeing.  
>> Waking up periodically and finding mm->mmap_sem contended is one problem, 
>> but the other problem that I've already shown is the unnecessary oom 
>> killing of additional processes while a thread has already reached 
>> exit_mmap().  The oom reaper cannot free page tables which is problematic 
>> for malloc implementations such as tcmalloc that do not release virtual 
>> memory. 
> 
> But once we know that the exit path is past the point of blocking we can
> have MMF_OOM_SKIP handover from the oom_reaper to the exit path. So the
> oom_reaper doesn't hide the current victim too early and we can safely
> wait for the exit path to reclaim the rest. So there is a feedback
> channel. I would even do not mind to poll for that state few times -
> similar to polling for the mmap_sem. But it would still be some feedback
> rather than a certain amount of time has passed since the last check.

Michal, will you show us how we can handover as an actual patch? I'm not
happy with postponing current situation with just your wish to handover.
Michal Hocko Sept. 6, 2018, 11:35 a.m. UTC | #23
On Sat 01-09-18 20:48:57, Tetsuo Handa wrote:
> On 2018/08/07 5:51, Michal Hocko wrote:
> >> At the risk of continually repeating the same statement, the oom reaper 
> >> cannot provide the direct feedback for all possible memory freeing.  
> >> Waking up periodically and finding mm->mmap_sem contended is one problem, 
> >> but the other problem that I've already shown is the unnecessary oom 
> >> killing of additional processes while a thread has already reached 
> >> exit_mmap().  The oom reaper cannot free page tables which is problematic 
> >> for malloc implementations such as tcmalloc that do not release virtual 
> >> memory. 
> > 
> > But once we know that the exit path is past the point of blocking we can
> > have MMF_OOM_SKIP handover from the oom_reaper to the exit path. So the
> > oom_reaper doesn't hide the current victim too early and we can safely
> > wait for the exit path to reclaim the rest. So there is a feedback
> > channel. I would even do not mind to poll for that state few times -
> > similar to polling for the mmap_sem. But it would still be some feedback
> > rather than a certain amount of time has passed since the last check.
> 
> Michal, will you show us how we can handover as an actual patch? I'm not
> happy with postponing current situation with just your wish to handover.

I am sorry but I am bussy with other higher priority issues. I believe I
have outlined the scheme that might work (see above). All it takes is to
look into that closer a play with it.

I haven't seen bug reports except for David's very vaguely argued
report. I have asked about details several times but haven't received
any. So I didn't really give it a top priority and consider it as a
corner case which is nice to solve rather than absolutely have to do it
right away because many users would put spell on us because their
workloads are eaten by that evil OOM killer.

If I've misinterpreted the priority then I can certainly reconsider and
reprioritize or somebody else might have a look into it.
Tetsuo Handa Sept. 6, 2018, 11:50 a.m. UTC | #24
On 2018/09/06 20:35, Michal Hocko wrote:
> On Sat 01-09-18 20:48:57, Tetsuo Handa wrote:
>> On 2018/08/07 5:51, Michal Hocko wrote:
>>>> At the risk of continually repeating the same statement, the oom reaper 
>>>> cannot provide the direct feedback for all possible memory freeing.  
>>>> Waking up periodically and finding mm->mmap_sem contended is one problem, 
>>>> but the other problem that I've already shown is the unnecessary oom 
>>>> killing of additional processes while a thread has already reached 
>>>> exit_mmap().  The oom reaper cannot free page tables which is problematic 
>>>> for malloc implementations such as tcmalloc that do not release virtual 
>>>> memory. 
>>>
>>> But once we know that the exit path is past the point of blocking we can
>>> have MMF_OOM_SKIP handover from the oom_reaper to the exit path. So the
>>> oom_reaper doesn't hide the current victim too early and we can safely
>>> wait for the exit path to reclaim the rest. So there is a feedback
>>> channel. I would even do not mind to poll for that state few times -
>>> similar to polling for the mmap_sem. But it would still be some feedback
>>> rather than a certain amount of time has passed since the last check.
>>
>> Michal, will you show us how we can handover as an actual patch? I'm not
>> happy with postponing current situation with just your wish to handover.
> 
> I am sorry but I am bussy with other higher priority issues. I believe I
> have outlined the scheme that might work (see above). All it takes is to
> look into that closer a play with it.

If you are too busy, please show "the point of no-blocking" using source code
instead. If such "the point of no-blocking" really exists, it can be executed
by allocating threads. I think that such "the point of no-blocking" is so late
stage of freeing that it makes no difference with timeout based back off.
Michal Hocko Sept. 6, 2018, 12:05 p.m. UTC | #25
On Thu 06-09-18 20:50:53, Tetsuo Handa wrote:
> On 2018/09/06 20:35, Michal Hocko wrote:
> > On Sat 01-09-18 20:48:57, Tetsuo Handa wrote:
> >> On 2018/08/07 5:51, Michal Hocko wrote:
> >>>> At the risk of continually repeating the same statement, the oom reaper 
> >>>> cannot provide the direct feedback for all possible memory freeing.  
> >>>> Waking up periodically and finding mm->mmap_sem contended is one problem, 
> >>>> but the other problem that I've already shown is the unnecessary oom 
> >>>> killing of additional processes while a thread has already reached 
> >>>> exit_mmap().  The oom reaper cannot free page tables which is problematic 
> >>>> for malloc implementations such as tcmalloc that do not release virtual 
> >>>> memory. 
> >>>
> >>> But once we know that the exit path is past the point of blocking we can
> >>> have MMF_OOM_SKIP handover from the oom_reaper to the exit path. So the
> >>> oom_reaper doesn't hide the current victim too early and we can safely
> >>> wait for the exit path to reclaim the rest. So there is a feedback
> >>> channel. I would even do not mind to poll for that state few times -
> >>> similar to polling for the mmap_sem. But it would still be some feedback
> >>> rather than a certain amount of time has passed since the last check.
> >>
> >> Michal, will you show us how we can handover as an actual patch? I'm not
> >> happy with postponing current situation with just your wish to handover.
> > 
> > I am sorry but I am bussy with other higher priority issues. I believe I
> > have outlined the scheme that might work (see above). All it takes is to
> > look into that closer a play with it.
> 
> If you are too busy, please show "the point of no-blocking" using source code
> instead. If such "the point of no-blocking" really exists, it can be executed
> by allocating threads.

I would have to study this much deeper but I _suspect_ that we are not
taking any blocking locks right after we return from unmap_vmas. In
other words the place we used to have synchronization with the
oom_reaper in the past.

> I think that such "the point of no-blocking" is so late stage of
> freeing that it makes no difference with timeout based back off.

It is! It is feedback driven rather than a random time passed approach.
And more importantly this syncing with exit_mmap matters only when there
is going to be way much more memory in page tables than in mappings
which is a _rare_ case.
Tetsuo Handa Sept. 6, 2018, 1:40 p.m. UTC | #26
On 2018/09/06 21:05, Michal Hocko wrote:
>> If you are too busy, please show "the point of no-blocking" using source code
>> instead. If such "the point of no-blocking" really exists, it can be executed
>> by allocating threads.
> 
> I would have to study this much deeper but I _suspect_ that we are not
> taking any blocking locks right after we return from unmap_vmas. In
> other words the place we used to have synchronization with the
> oom_reaper in the past.

See commit 97b1255cb27c551d ("mm,oom_reaper: check for MMF_OOM_SKIP before
complaining"). Since this dependency is inode-based (i.e. irrelevant with
OOM victims), waiting for this lock can livelock.

So, where is safe "the point of no-blocking" ?
Michal Hocko Sept. 6, 2018, 1:56 p.m. UTC | #27
On Thu 06-09-18 22:40:24, Tetsuo Handa wrote:
> On 2018/09/06 21:05, Michal Hocko wrote:
> >> If you are too busy, please show "the point of no-blocking" using source code
> >> instead. If such "the point of no-blocking" really exists, it can be executed
> >> by allocating threads.
> > 
> > I would have to study this much deeper but I _suspect_ that we are not
> > taking any blocking locks right after we return from unmap_vmas. In
> > other words the place we used to have synchronization with the
> > oom_reaper in the past.
> 
> See commit 97b1255cb27c551d ("mm,oom_reaper: check for MMF_OOM_SKIP before
> complaining"). Since this dependency is inode-based (i.e. irrelevant with
> OOM victims), waiting for this lock can livelock.
> 
> So, where is safe "the point of no-blocking" ?

Ohh, right unlink_file_vma and its i_mmap_rwsem lock. As I've said I
have to think about that some more. Maybe we can split those into two parts.
Tetsuo Handa Sept. 6, 2018, 2:06 p.m. UTC | #28
On 2018/09/06 22:56, Michal Hocko wrote:
> On Thu 06-09-18 22:40:24, Tetsuo Handa wrote:
>> On 2018/09/06 21:05, Michal Hocko wrote:
>>>> If you are too busy, please show "the point of no-blocking" using source code
>>>> instead. If such "the point of no-blocking" really exists, it can be executed
>>>> by allocating threads.
>>>
>>> I would have to study this much deeper but I _suspect_ that we are not
>>> taking any blocking locks right after we return from unmap_vmas. In
>>> other words the place we used to have synchronization with the
>>> oom_reaper in the past.
>>
>> See commit 97b1255cb27c551d ("mm,oom_reaper: check for MMF_OOM_SKIP before
>> complaining"). Since this dependency is inode-based (i.e. irrelevant with
>> OOM victims), waiting for this lock can livelock.
>>
>> So, where is safe "the point of no-blocking" ?
> 
> Ohh, right unlink_file_vma and its i_mmap_rwsem lock. As I've said I
> have to think about that some more. Maybe we can split those into two parts.
> 

Meanwhile, I'd really like to use timeout based back off. Like I wrote at
http://lkml.kernel.org/r/201809060703.w8673Kbs076435@www262.sakura.ne.jp ,
we need to wait for some period after all.

We can replace timeout based back off after we got safe "the point of no-blocking" .
Michal Hocko Sept. 6, 2018, 2:16 p.m. UTC | #29
On Thu 06-09-18 23:06:40, Tetsuo Handa wrote:
> On 2018/09/06 22:56, Michal Hocko wrote:
> > On Thu 06-09-18 22:40:24, Tetsuo Handa wrote:
> >> On 2018/09/06 21:05, Michal Hocko wrote:
> >>>> If you are too busy, please show "the point of no-blocking" using source code
> >>>> instead. If such "the point of no-blocking" really exists, it can be executed
> >>>> by allocating threads.
> >>>
> >>> I would have to study this much deeper but I _suspect_ that we are not
> >>> taking any blocking locks right after we return from unmap_vmas. In
> >>> other words the place we used to have synchronization with the
> >>> oom_reaper in the past.
> >>
> >> See commit 97b1255cb27c551d ("mm,oom_reaper: check for MMF_OOM_SKIP before
> >> complaining"). Since this dependency is inode-based (i.e. irrelevant with
> >> OOM victims), waiting for this lock can livelock.
> >>
> >> So, where is safe "the point of no-blocking" ?
> > 
> > Ohh, right unlink_file_vma and its i_mmap_rwsem lock. As I've said I
> > have to think about that some more. Maybe we can split those into two parts.
> > 
> 
> Meanwhile, I'd really like to use timeout based back off. Like I wrote at
> http://lkml.kernel.org/r/201809060703.w8673Kbs076435@www262.sakura.ne.jp ,
> we need to wait for some period after all.
> 
> We can replace timeout based back off after we got safe "the point of no-blocking" .

Why don't you invest your time in the long term solution rather than
playing with something that doesn't solve anything just papers over the
issue?
Tetsuo Handa Sept. 6, 2018, 9:13 p.m. UTC | #30
On 2018/09/06 23:16, Michal Hocko wrote:
> On Thu 06-09-18 23:06:40, Tetsuo Handa wrote:
>> On 2018/09/06 22:56, Michal Hocko wrote:
>>> On Thu 06-09-18 22:40:24, Tetsuo Handa wrote:
>>>> On 2018/09/06 21:05, Michal Hocko wrote:
>>>>>> If you are too busy, please show "the point of no-blocking" using source code
>>>>>> instead. If such "the point of no-blocking" really exists, it can be executed
>>>>>> by allocating threads.
>>>>>
>>>>> I would have to study this much deeper but I _suspect_ that we are not
>>>>> taking any blocking locks right after we return from unmap_vmas. In
>>>>> other words the place we used to have synchronization with the
>>>>> oom_reaper in the past.
>>>>
>>>> See commit 97b1255cb27c551d ("mm,oom_reaper: check for MMF_OOM_SKIP before
>>>> complaining"). Since this dependency is inode-based (i.e. irrelevant with
>>>> OOM victims), waiting for this lock can livelock.
>>>>
>>>> So, where is safe "the point of no-blocking" ?
>>>
>>> Ohh, right unlink_file_vma and its i_mmap_rwsem lock. As I've said I
>>> have to think about that some more. Maybe we can split those into two parts.
>>>
>>
>> Meanwhile, I'd really like to use timeout based back off. Like I wrote at
>> http://lkml.kernel.org/r/201809060703.w8673Kbs076435@www262.sakura.ne.jp ,
>> we need to wait for some period after all.
>>
>> We can replace timeout based back off after we got safe "the point of no-blocking" .
> 
> Why don't you invest your time in the long term solution rather than
> playing with something that doesn't solve anything just papers over the
> issue?
> 

I am not a MM people. I am a secure programmer from security subsystem.
You are almost always introducing bugs (like you call dragons) rather
than starting from safe changes. The OOM killer _is_ always racy. Even
your what you think the long term solution _shall be_ racy. I can't
waste my time in what you think the long term solution. Please don't
refuse/ignore my (or David's) patches without your counter patches.
Michal Hocko Sept. 7, 2018, 11:10 a.m. UTC | #31
On Fri 07-09-18 06:13:13, Tetsuo Handa wrote:
> On 2018/09/06 23:16, Michal Hocko wrote:
> > On Thu 06-09-18 23:06:40, Tetsuo Handa wrote:
> >> On 2018/09/06 22:56, Michal Hocko wrote:
> >>> On Thu 06-09-18 22:40:24, Tetsuo Handa wrote:
> >>>> On 2018/09/06 21:05, Michal Hocko wrote:
> >>>>>> If you are too busy, please show "the point of no-blocking" using source code
> >>>>>> instead. If such "the point of no-blocking" really exists, it can be executed
> >>>>>> by allocating threads.
> >>>>>
> >>>>> I would have to study this much deeper but I _suspect_ that we are not
> >>>>> taking any blocking locks right after we return from unmap_vmas. In
> >>>>> other words the place we used to have synchronization with the
> >>>>> oom_reaper in the past.
> >>>>
> >>>> See commit 97b1255cb27c551d ("mm,oom_reaper: check for MMF_OOM_SKIP before
> >>>> complaining"). Since this dependency is inode-based (i.e. irrelevant with
> >>>> OOM victims), waiting for this lock can livelock.
> >>>>
> >>>> So, where is safe "the point of no-blocking" ?
> >>>
> >>> Ohh, right unlink_file_vma and its i_mmap_rwsem lock. As I've said I
> >>> have to think about that some more. Maybe we can split those into two parts.
> >>>
> >>
> >> Meanwhile, I'd really like to use timeout based back off. Like I wrote at
> >> http://lkml.kernel.org/r/201809060703.w8673Kbs076435@www262.sakura.ne.jp ,
> >> we need to wait for some period after all.
> >>
> >> We can replace timeout based back off after we got safe "the point of no-blocking" .
> > 
> > Why don't you invest your time in the long term solution rather than
> > playing with something that doesn't solve anything just papers over the
> > issue?
> > 
> 
> I am not a MM people. I am a secure programmer from security subsystem.

OK, so let me be completely honest with you. You have pretty strong
statements about the MM code while you are not considering yourself an
MM person. You are suggesting hacks which do not solve real underlying
problems and I will keep shooting those down.

> You are almost always introducing bugs (like you call dragons) rather
> than starting from safe changes. The OOM killer _is_ always racy. Even
> your what you think the long term solution _shall be_ racy. 

The reason why this area is so easy to to get wrong is basically a lack
of comprehensible design.  We have historical hacks here and there. I
really do not want to follow that direction and as long as my word has
some weigh (which is not my decision of course) I will keep fighting
for simplifications and an overall design refinements. If we are to add
heuristic they should be backed by well understood workloads we do care
about.

You might have your toy workload that hits different corner cases and
testing those is fine. But I absolutely disagree to base any non trivial
changes for that kind of workload unless they are a general improvement.

If you disagree then we have to agree to disagree and it doesn't make
much sense to continue in discussion.

> I can't waste my time in what you think the long term solution. Please
> don't refuse/ignore my (or David's) patches without your counter
> patches.

If you do not care about long term sanity of the code and if you do not
care about a larger picture then I am not interested in any patches from
you. MM code is far from trivial and no playground. This attitude of
yours is just dangerous.
Tetsuo Handa Sept. 7, 2018, 11:36 a.m. UTC | #32
On 2018/09/07 20:10, Michal Hocko wrote:
>> I can't waste my time in what you think the long term solution. Please
>> don't refuse/ignore my (or David's) patches without your counter
>> patches.
> 
> If you do not care about long term sanity of the code and if you do not
> care about a larger picture then I am not interested in any patches from
> you. MM code is far from trivial and no playground. This attitude of
> yours is just dangerous.
> 

Then, please explain how we guarantee that enough CPU resource is spent
between "exit_mmap() set MMF_OOM_SKIP" and "the OOM killer finds MMF_OOM_SKIP
was already set" so that last second allocation with high watermark can't fail
when 50% of available memory was already reclaimed.
Michal Hocko Sept. 7, 2018, 11:51 a.m. UTC | #33
On Fri 07-09-18 20:36:31, Tetsuo Handa wrote:
> On 2018/09/07 20:10, Michal Hocko wrote:
> >> I can't waste my time in what you think the long term solution. Please
> >> don't refuse/ignore my (or David's) patches without your counter
> >> patches.
> > 
> > If you do not care about long term sanity of the code and if you do not
> > care about a larger picture then I am not interested in any patches from
> > you. MM code is far from trivial and no playground. This attitude of
> > yours is just dangerous.
> > 
> 
> Then, please explain how we guarantee that enough CPU resource is spent
> between "exit_mmap() set MMF_OOM_SKIP" and "the OOM killer finds MMF_OOM_SKIP
> was already set" so that last second allocation with high watermark can't fail
> when 50% of available memory was already reclaimed.

There is no guarantee. Full stop! This is an inherently racy land. We
can strive to work reasonably well but this will never be perfect. And
no, no timeout is going to solve it either. We have to live with the
fact that sometimes we hit the race and kill an additional task. As long
as there are no reasonable workloads which hit this race then we are
good enough.

The only guarantee we can talk about is the forward progress guarantee.
If we know that exit_mmap is past the blocking point then we can hand
over MMF_OOM_SKIP setting to the exit path rather than oom_reaper. Last
moment (minute, milisecond, nanosecond for that matter) allocation is in
no way related or solveable without a strong locking and we have learned
this is not a good idea in the past.

This is nothing new though. This discussion is not moving forward. It
just burns time so this is my last email in this thread.
Tetsuo Handa Sept. 7, 2018, 1:30 p.m. UTC | #34
On 2018/09/07 20:51, Michal Hocko wrote:
> On Fri 07-09-18 20:36:31, Tetsuo Handa wrote:
>> On 2018/09/07 20:10, Michal Hocko wrote:
>>>> I can't waste my time in what you think the long term solution. Please
>>>> don't refuse/ignore my (or David's) patches without your counter
>>>> patches.
>>>
>>> If you do not care about long term sanity of the code and if you do not
>>> care about a larger picture then I am not interested in any patches from
>>> you. MM code is far from trivial and no playground. This attitude of
>>> yours is just dangerous.
>>>
>>
>> Then, please explain how we guarantee that enough CPU resource is spent
>> between "exit_mmap() set MMF_OOM_SKIP" and "the OOM killer finds MMF_OOM_SKIP
>> was already set" so that last second allocation with high watermark can't fail
>> when 50% of available memory was already reclaimed.
> 
> There is no guarantee. Full stop! This is an inherently racy land. We
> can strive to work reasonably well but this will never be perfect.

That is enough explanation that we have no choice but mitigate it using
heuristics. No feedback based approach is possible. My or David's patch
has been justified. Thank you!

Patch
diff mbox series

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 589fe78..70c7dfd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1174,6 +1174,9 @@  struct task_struct {
 #endif
 	int				pagefault_disabled;
 	struct list_head		oom_victim_list;
+	unsigned long			last_oom_compared;
+	unsigned long			last_oom_score;
+	unsigned char			oom_reap_stall_count;
 #ifdef CONFIG_VMAP_STACK
 	struct vm_struct		*stack_vm_area;
 #endif
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 783f04d..7cad886 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -49,6 +49,12 @@ 
 #define CREATE_TRACE_POINTS
 #include <trace/events/oom.h>
 
+static inline unsigned long oom_victim_mm_score(struct mm_struct *mm)
+{
+	return get_mm_rss(mm) + get_mm_counter(mm, MM_SWAPENTS) +
+		mm_pgtables_bytes(mm) / PAGE_SIZE;
+}
+
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks = 1;
@@ -230,8 +236,7 @@  unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	 * The baseline for the badness score is the proportion of RAM that each
 	 * task's rss, pagetable and swap space use.
 	 */
-	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
-		mm_pgtables_bytes(p->mm) / PAGE_SIZE;
+	points = oom_victim_mm_score(p->mm);
 	task_unlock(p);
 
 	/* Normalize to oom_score_adj units */
@@ -571,15 +576,6 @@  static void oom_reap_task(struct task_struct *tsk)
 	while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
 		schedule_timeout_idle(HZ/10);
 
-	if (attempts <= MAX_OOM_REAP_RETRIES ||
-	    test_bit(MMF_OOM_SKIP, &mm->flags))
-		goto done;
-
-	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
-		task_pid_nr(tsk), tsk->comm);
-	debug_show_all_locks();
-
-done:
 	/*
 	 * Hide this mm from OOM killer because it has been either reaped or
 	 * somebody can't call up_write(mmap_sem).
@@ -631,6 +627,9 @@  static void mark_oom_victim(struct task_struct *tsk)
 	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
 		mmgrab(tsk->signal->oom_mm);
 		set_bit(MMF_OOM_VICTIM, &mm->flags);
+		tsk->last_oom_compared = jiffies;
+		tsk->last_oom_score = oom_victim_mm_score(mm);
+		tsk->oom_reap_stall_count = 0;
 		get_task_struct(tsk);
 		list_add(&tsk->oom_victim_list, &oom_victim_list);
 	}
@@ -867,7 +866,6 @@  static void __oom_kill_process(struct task_struct *victim)
 	mmdrop(mm);
 	put_task_struct(victim);
 }
-#undef K
 
 /*
  * Kill provided task unless it's secured by setting
@@ -999,33 +997,74 @@  int unregister_oom_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(unregister_oom_notifier);
 
+static bool victim_mm_stalling(struct task_struct *p, struct mm_struct *mm)
+{
+	unsigned long score;
+
+	if (time_before(jiffies, p->last_oom_compared + HZ / 10))
+		return false;
+	score = oom_victim_mm_score(mm);
+	if (score < p->last_oom_score)
+		p->oom_reap_stall_count = 0;
+	else
+		p->oom_reap_stall_count++;
+	p->last_oom_score = oom_victim_mm_score(mm);
+	p->last_oom_compared = jiffies;
+	if (p->oom_reap_stall_count < 30)
+		return false;
+	pr_info("Gave up waiting for process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
+		task_pid_nr(p), p->comm, K(mm->total_vm),
+		K(get_mm_counter(mm, MM_ANONPAGES)),
+		K(get_mm_counter(mm, MM_FILEPAGES)),
+		K(get_mm_counter(mm, MM_SHMEMPAGES)));
+	return true;
+}
+
 static bool oom_has_pending_victims(struct oom_control *oc)
 {
-	struct task_struct *p;
+	struct task_struct *p, *tmp;
+	bool ret = false;
+	bool gaveup = false;
 
 	if (is_sysrq_oom(oc))
 		return false;
 	/*
-	 * Since oom_reap_task()/exit_mmap() will set MMF_OOM_SKIP, let's
-	 * wait for pending victims until MMF_OOM_SKIP is set or __mmput()
-	 * completes.
+	 * Wait for pending victims until __mmput() completes or stalled
+	 * too long.
 	 */
-	list_for_each_entry(p, &oom_victim_list, oom_victim_list) {
+	list_for_each_entry_safe(p, tmp, &oom_victim_list, oom_victim_list) {
+		struct mm_struct *mm = p->signal->oom_mm;
+
 		if (oom_unkillable_task(p, oc->memcg, oc->nodemask))
 			continue;
-		if (!test_bit(MMF_OOM_SKIP, &p->signal->oom_mm->flags)) {
+		ret = true;
 #ifdef CONFIG_MMU
+		/*
+		 * Since the OOM reaper exists, we can safely wait until
+		 * MMF_OOM_SKIP is set.
+		 */
+		if (!test_bit(MMF_OOM_SKIP, &mm->flags)) {
 			if (!oom_reap_target) {
 				get_task_struct(p);
 				oom_reap_target = p;
 				trace_wake_reaper(p->pid);
 				wake_up(&oom_reaper_wait);
 			}
-#endif
-			return true;
+			continue;
 		}
+#endif
+		/* We can wait as long as OOM score is decreasing over time. */
+		if (!victim_mm_stalling(p, mm))
+			continue;
+		gaveup = true;
+		list_del(&p->oom_victim_list);
+		/* Drop a reference taken by mark_oom_victim(). */
+		put_task_struct(p);
 	}
-	return false;
+	if (gaveup)
+		debug_show_all_locks();
+
+	return ret;
 }
 
 /**