
mm,oom: Teach lockdep about oom_lock.

Message ID 1552040522-9085-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp (mailing list archive)
State New, archived
Series: mm,oom: Teach lockdep about oom_lock.

Commit Message

Tetsuo Handa March 8, 2019, 10:22 a.m. UTC
Since we are not allowed to depend on blocking memory allocations when
oom_lock is already held, teach lockdep to consider that blocking memory
allocations might wait for oom_lock at as early a location as possible, and
teach lockdep to consider that oom_lock is held by mutex_lock() rather than
by mutex_trylock().

Also, since the OOM killer is disabled until the OOM reaper or exit_mmap()
sets MMF_OOM_SKIP, teach lockdep to consider that oom_lock is held when
__oom_reap_task_mm() is called.

This patch should not cause lockdep splats unless there is somebody doing
dangerous things (e.g. from OOM notifiers, from the OOM reaper).

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
 mm/oom_kill.c   |  9 ++++++++-
 mm/page_alloc.c | 13 +++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)
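
For illustration, one concrete example of the kind of "dangerous thing" mentioned
above would be an OOM notifier that performs a blocking allocation. The sketch
below is hypothetical: register_oom_notifier() and the notifier callback
convention are existing kernel interfaces, but bad_oom_notify() and the module
around it are made up for the example.

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/oom.h>
#include <linux/mm.h>
#include <linux/gfp.h>

/*
 * Hypothetical "dangerous thing": an OOM notifier doing a blocking
 * (__GFP_DIRECT_RECLAIM) allocation.  out_of_memory() invokes OOM notifiers
 * with oom_lock held, so the nested allocation may end up waiting for the
 * OOM killer, which in turn is waiting for this notifier to return.  With
 * the annotations from this patch, lockdep can flag the nested "acquire" of
 * oom_lock instead of the system silently hanging.
 */
static int bad_oom_notify(struct notifier_block *nb, unsigned long unused,
			  void *freed)
{
	struct page *page = alloc_page(GFP_KERNEL);	/* blocking allocation under oom_lock */

	if (page)
		__free_page(page);
	return NOTIFY_OK;
}

static struct notifier_block bad_oom_nb = {
	.notifier_call = bad_oom_notify,
};

static int __init bad_oom_init(void)
{
	return register_oom_notifier(&bad_oom_nb);
}
module_init(bad_oom_init);
MODULE_LICENSE("GPL");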

Comments

Michal Hocko March 8, 2019, 11:03 a.m. UTC | #1
On Fri 08-03-19 19:22:02, Tetsuo Handa wrote:
> Since we are not allowed to depend on blocking memory allocations when
> oom_lock is already held, teach lockdep to consider that blocking memory
> allocations might wait for oom_lock at as early location as possible, and
> teach lockdep to consider that oom_lock is held by mutex_lock() than by
> mutex_trylock().

I do not understand this. It is quite likely that we will have multiple
allocations hitting this path while somebody else might hold the oom
lock.

What kind of problem does this actually want to prevent? Could you be
more specific please?

> Also, since the OOM killer is disabled until the OOM reaper or exit_mmap()
> sets MMF_OOM_SKIP, teach lockdep to consider that oom_lock is held when
> __oom_reap_task_mm() is called.
> 
> This patch should not cause lockdep splats unless there is somebody doing
> dangerous things (e.g. from OOM notifiers, from the OOM reaper).
> 
> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> ---
>  mm/oom_kill.c   |  9 ++++++++-
>  mm/page_alloc.c | 13 +++++++++++++
>  2 files changed, 21 insertions(+), 1 deletion(-)
> 
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index 3a24848..759aa4e 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -513,6 +513,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
>  	 */
>  	set_bit(MMF_UNSTABLE, &mm->flags);
>  
> +	mutex_acquire(&oom_lock.dep_map, 0, 0, _THIS_IP_);
>  	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
>  		if (!can_madv_dontneed_vma(vma))
>  			continue;
> @@ -544,6 +545,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
>  			tlb_finish_mmu(&tlb, range.start, range.end);
>  		}
>  	}
> +	mutex_release(&oom_lock.dep_map, 1, _THIS_IP_);
>  
>  	return ret;
>  }
> @@ -1120,8 +1122,13 @@ void pagefault_out_of_memory(void)
>  	if (mem_cgroup_oom_synchronize(true))
>  		return;
>  
> -	if (!mutex_trylock(&oom_lock))
> +	if (!mutex_trylock(&oom_lock)) {
> +		mutex_acquire(&oom_lock.dep_map, 0, 0, _THIS_IP_);
> +		mutex_release(&oom_lock.dep_map, 1, _THIS_IP_);
>  		return;
> +	}
> +	mutex_release(&oom_lock.dep_map, 1, _THIS_IP_);
> +	mutex_acquire(&oom_lock.dep_map, 0, 0, _THIS_IP_);
>  	out_of_memory(&oc);
>  	mutex_unlock(&oom_lock);
>  }
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 6d0fa5b..25533214 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3793,6 +3793,8 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
>  		schedule_timeout_uninterruptible(1);
>  		return NULL;
>  	}
> +	mutex_release(&oom_lock.dep_map, 1, _THIS_IP_);
> +	mutex_acquire(&oom_lock.dep_map, 0, 0, _THIS_IP_);
>  
>  	/*
>  	 * Go through the zonelist yet one more time, keep very high watermark
> @@ -4651,6 +4653,17 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
>  	fs_reclaim_acquire(gfp_mask);
>  	fs_reclaim_release(gfp_mask);
>  
> +	/*
> +	 * Allocation requests which can call __alloc_pages_may_oom() might
> +	 * fail to bail out due to waiting for oom_lock.
> +	 */
> +	if ((gfp_mask & __GFP_DIRECT_RECLAIM) && !(gfp_mask & __GFP_NORETRY) &&
> +	    (!(gfp_mask & __GFP_RETRY_MAYFAIL) ||
> +	     order <= PAGE_ALLOC_COSTLY_ORDER)) {
> +		mutex_acquire(&oom_lock.dep_map, 0, 0, _THIS_IP_);
> +		mutex_release(&oom_lock.dep_map, 1, _THIS_IP_);
> +	}
> +
>  	might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
>  
>  	if (should_fail_alloc_page(gfp_mask, order))
> -- 
> 1.8.3.1
Tetsuo Handa March 8, 2019, 11:29 a.m. UTC | #2
On 2019/03/08 20:03, Michal Hocko wrote:
> On Fri 08-03-19 19:22:02, Tetsuo Handa wrote:
>> Since we are not allowed to depend on blocking memory allocations when
>> oom_lock is already held, teach lockdep to consider that blocking memory
>> allocations might wait for oom_lock at as early location as possible, and
>> teach lockdep to consider that oom_lock is held by mutex_lock() than by
>> mutex_trylock().
> 
> I do not understand this. It is quite likely that we will have multiple
> allocations hitting this path while somebody else might hold the oom
> lock.

The thread that succeeded in holding oom_lock must not involve blocking memory
allocations. It is explained in the comment before get_page_from_freelist().
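
For readers following along, the relevant shape of __alloc_pages_may_oom() is
roughly the following. This is a paraphrase based on the hunks quoted in this
thread, not the verbatim source, and the elided parts are summarized in
comments.

static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
		      const struct alloc_context *ac,
		      unsigned long *did_some_progress)
{
	struct page *page;

	if (!mutex_trylock(&oom_lock)) {
		/*
		 * Somebody else is handling the OOM situation: pretend we made
		 * progress, nap for a jiffy and let the caller retry.  The
		 * retry only stops once the oom_lock holder frees memory and
		 * drops the lock, which is why the holder itself must never
		 * wait for a blocking allocation.
		 */
		*did_some_progress = 1;
		schedule_timeout_uninterruptible(1);
		return NULL;
	}

	/*
	 * One last no-reclaim attempt at a very high watermark; as the comment
	 * in the tree says, it must not depend on a __GFP_DIRECT_RECLAIM &&
	 * !__GFP_NORETRY allocation, which would never fail because oom_lock
	 * is already held.
	 */
	page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
				      ~__GFP_DIRECT_RECLAIM, order,
				      ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
	/* ... otherwise invoke out_of_memory(), unlock and return ... */
	return page;
}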

> 
> What kind of problem does this actually want to prevent? Could you be
> more specific please?

e.g.

--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3688,6 +3688,7 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
         * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
         * allocation which will never fail due to oom_lock already held.
         */
+       kfree(kmalloc(PAGE_SIZE, GFP_NOIO));
        page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
                                      ~__GFP_DIRECT_RECLAIM, order,
                                      ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);


Since https://lore.kernel.org/lkml/20190308013134.GB4063@jagdpanzerIV/T/#u made me
worry that we might introduce such a dependency by mistake in the near future, I
propose this change as a proactive protection.
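
Spelled out, the dependency such an allocation would create looks roughly like
this (a sketch of the scenario, not output from an actual reproduction):

/*
 * Thread A (holds oom_lock)                Threads B, C, ... (need the OOM killer)
 *
 * __alloc_pages_may_oom()
 *   mutex_trylock(&oom_lock)  -> succeeds
 *   kmalloc(PAGE_SIZE, GFP_NOIO)           __alloc_pages_may_oom()
 *     memory is exhausted, so:               mutex_trylock(&oom_lock) -> fails
 *     __alloc_pages_may_oom()                sleep a jiffy, retry forever,
 *       mutex_trylock(&oom_lock) -> fails    waiting for A to release oom_lock
 *       sleep a jiffy, retry forever
 *
 * A can neither invoke the OOM killer (its own trylock fails) nor release
 * oom_lock (it never returns from the nested allocation), so nobody makes
 * progress.  The dummy mutex_acquire()/mutex_release() pair added by this
 * patch lets lockdep see A's potential "wait" on oom_lock and warn about it
 * before such a dependency is ever hit at runtime.
 */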
Michal Hocko March 8, 2019, 11:54 a.m. UTC | #3
[Cc Petr for the lockdep part - the patch is
http://lkml.kernel.org/r/1552040522-9085-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp]

On Fri 08-03-19 20:29:46, Tetsuo Handa wrote:
> On 2019/03/08 20:03, Michal Hocko wrote:
> > On Fri 08-03-19 19:22:02, Tetsuo Handa wrote:
> >> Since we are not allowed to depend on blocking memory allocations when
> >> oom_lock is already held, teach lockdep to consider that blocking memory
> >> allocations might wait for oom_lock at as early location as possible, and
> >> teach lockdep to consider that oom_lock is held by mutex_lock() than by
> >> mutex_trylock().
> > 
> > I do not understand this. It is quite likely that we will have multiple
> > allocations hitting this path while somebody else might hold the oom
> > lock.
> 
> The thread who succeeded to hold oom_lock must not involve blocking memory
> allocations. It is explained in the comment before get_page_from_freelist().

Yes this is correct.

> > What kind of problem does this actually want to prevent? Could you be
> > more specific please?
> 
> e.g.
> 
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3688,6 +3688,7 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
>          * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
>          * allocation which will never fail due to oom_lock already held.
>          */
> +       kfree(kmalloc(PAGE_SIZE, GFP_NOIO));
>         page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
>                                       ~__GFP_DIRECT_RECLAIM, order,
>                                       ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
> 
> 
> Since https://lore.kernel.org/lkml/20190308013134.GB4063@jagdpanzerIV/T/#u made me
> worry that we might by error introduce such dependency in near future, I propose
> this change as a proactive protection.

OK, that makes sense to me. I cannot judge the implementation because I
am not really familiar with lockdep machinery. Could you explain how it
doesn't trigger for all other allocations?

Also why it is not sufficient to add the lockdep annotation prior to the
trylock in __alloc_pages_may_oom?
Michal Hocko March 8, 2019, 11:58 a.m. UTC | #4
On Fri 08-03-19 12:54:13, Michal Hocko wrote:
> [Cc Petr for the lockdep part - the patch is
> http://lkml.kernel.org/r/1552040522-9085-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp]

now for real.

> On Fri 08-03-19 20:29:46, Tetsuo Handa wrote:
> > On 2019/03/08 20:03, Michal Hocko wrote:
> > > On Fri 08-03-19 19:22:02, Tetsuo Handa wrote:
> > >> Since we are not allowed to depend on blocking memory allocations when
> > >> oom_lock is already held, teach lockdep to consider that blocking memory
> > >> allocations might wait for oom_lock at as early location as possible, and
> > >> teach lockdep to consider that oom_lock is held by mutex_lock() than by
> > >> mutex_trylock().
> > > 
> > > I do not understand this. It is quite likely that we will have multiple
> > > allocations hitting this path while somebody else might hold the oom
> > > lock.
> > 
> > The thread who succeeded to hold oom_lock must not involve blocking memory
> > allocations. It is explained in the comment before get_page_from_freelist().
> 
> Yes this is correct.
> 
> > > What kind of problem does this actually want to prevent? Could you be
> > > more specific please?
> > 
> > e.g.
> > 
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -3688,6 +3688,7 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
> >          * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
> >          * allocation which will never fail due to oom_lock already held.
> >          */
> > +       kfree(kmalloc(PAGE_SIZE, GFP_NOIO));
> >         page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
> >                                       ~__GFP_DIRECT_RECLAIM, order,
> >                                       ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
> > 
> > 
> > Since https://lore.kernel.org/lkml/20190308013134.GB4063@jagdpanzerIV/T/#u made me
> > worry that we might by error introduce such dependency in near future, I propose
> > this change as a proactive protection.
> 
> OK, that makes sense to me. I cannot judge the implementation because I
> am not really familiar with lockdep machinery. Could you explain how it
> doesn't trigger for all other allocations?
> 
> Also why it is not sufficient to add the lockdep annotation prior to the
> trylock in __alloc_pages_may_oom?
> 
> -- 
> Michal Hocko
> SUSE Labs
Peter Zijlstra March 8, 2019, 3:01 p.m. UTC | #5
On Fri, Mar 08, 2019 at 12:58:02PM +0100, Michal Hocko wrote:
> On Fri 08-03-19 12:54:13, Michal Hocko wrote:
> > [Cc Petr for the lockdep part - the patch is
> > http://lkml.kernel.org/r/1552040522-9085-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp]
> 
> now for real.

That really wants a few more comments; I _think_ it is ok, but *shees*.
Michal Hocko March 8, 2019, 3:13 p.m. UTC | #6
On Fri 08-03-19 16:01:05, Peter Zijlstra wrote:
> On Fri, Mar 08, 2019 at 12:58:02PM +0100, Michal Hocko wrote:
> > On Fri 08-03-19 12:54:13, Michal Hocko wrote:
> > > [Cc Petr for the lockdep part - the patch is
> > > http://lkml.kernel.org/r/1552040522-9085-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp]
> > 
> > now for real.
> 
> That really wants a few more comments; I _think_ it is ok, but *shees*.

It would be also great to pull it out of the code flow and hide it
behind a helper static inline. Something like
lockdep_track_oom_alloc_reentrant or alike.
Tetsuo Handa March 9, 2019, 6:02 a.m. UTC | #7
On 2019/03/08 20:58, Michal Hocko wrote:
> OK, that makes sense to me. I cannot judge the implementation because I
> am not really familiar with lockdep machinery. Could you explain how it
> doesn't trigger for all other allocations?

This is the same reason why fs_reclaim_acquire()/fs_reclaim_release() doesn't
trigger for all other allocations. Any allocation request which might involve
__GFP_FS reclaim passes "struct lockdep_map __fs_reclaim_map", and lockdep records it.
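
For comparison, the existing fs_reclaim machinery in mm/page_alloc.c of this
era looks roughly like the sketch below (simplified; the real code has
additional conditions such as PF_MEMALLOC and __GFP_NOLOCKDEP handling).

static struct lockdep_map __fs_reclaim_map =
	STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);

static bool __need_fs_reclaim(gfp_t gfp_mask)
{
	/* Only allocations which may enter __GFP_FS reclaim "take" the map. */
	return (gfp_mask & __GFP_DIRECT_RECLAIM) && (gfp_mask & __GFP_FS);
}

void fs_reclaim_acquire(gfp_t gfp_mask)
{
	if (__need_fs_reclaim(gfp_mask))
		lock_map_acquire(&__fs_reclaim_map);
}

void fs_reclaim_release(gfp_t gfp_mask)
{
	if (__need_fs_reclaim(gfp_mask))
		lock_map_release(&__fs_reclaim_map);
}

Real __GFP_FS reclaim "holds" the map for the duration of reclaim, while every
allocation that could reclaim touches it once up front, so lockdep learns the
ordering either way; the oom_lock annotation in this patch follows the same
pattern, except that it reuses oom_lock's own dep_map instead of a dedicated one.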

>
> Also why it is not sufficient to add the lockdep annotation prior to the
> trylock in __alloc_pages_may_oom?

This is the same reason why fs_reclaim_acquire()/fs_reclaim_release() is called
from prepare_alloc_pages(). If an allocation request which might involve __GFP_FS
reclaim via __perform_reclaim() succeeds before actually calling
__perform_reclaim(), we fail to pass "struct lockdep_map __fs_reclaim_map"
(which makes it difficult to check whether there is a possibility of deadlock).
Likewise, if an allocation request which might call __alloc_pages_may_oom()
succeeds before actually calling __alloc_pages_may_oom(), we fail to pass
oom_lock.lockdep_map (which makes it difficult to check whether there is a
possibility of deadlock).
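
In call-flow terms (a rough sketch of the allocator around this kernel
version, simplified):

/*
 * __alloc_pages_nodemask()
 *   prepare_alloc_pages()        <- dummy fs_reclaim and (with this patch)
 *                                   oom_lock acquire/release: "this request
 *                                   could end up waiting here", recorded even
 *                                   if the request never gets that far
 *   get_page_from_freelist()     <- fast path; may succeed and return
 *   __alloc_pages_slowpath()
 *     __perform_reclaim()        <- real fs_reclaim_acquire()/release()
 *     __alloc_pages_may_oom()    <- the real mutex_trylock(&oom_lock)
 *
 * Annotating only inside __alloc_pages_may_oom() would miss every allocation
 * that happened to be satisfied from the free lists, so lockdep would never
 * learn that most allocation sites are potential oom_lock waiters.
 */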

Strictly speaking, there is

	if (tsk_is_oom_victim(current) &&
	    (alloc_flags == ALLOC_OOM ||
	     (gfp_mask & __GFP_NOMEMALLOC)))
		goto nopage;

a case where failing to hold oom_lock at __alloc_pages_may_oom() does not
cause a problem. But I think that we should not check tsk_is_oom_victim()
at prepare_alloc_pages().

> It would be also great to pull it out of the code flow and hide it
> behind a helper static inline. Something like
> lockdep_track_oom_alloc_reentrant or a like.

OK. Here is the v2 patch.



From ec8d0accf15b4566c065ca8c63a4e1185f0a0c78 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Sat, 9 Mar 2019 09:55:08 +0900
Subject: [PATCH v2] mm,oom: Teach lockdep about oom_lock.

Since a thread which has succeeded in holding oom_lock must not involve blocking
memory allocations, teach lockdep to consider that blocking memory
allocations might wait for oom_lock at as early a location as possible, and
teach lockdep to consider that oom_lock is held by mutex_lock() rather than by
mutex_trylock().

Also, since the OOM killer is disabled until the OOM reaper or exit_mmap()
sets MMF_OOM_SKIP, teach lockdep to consider that oom_lock is held when
__oom_reap_task_mm() is called.

This patch should not cause lockdep splats unless there is somebody doing
dangerous things (e.g. from OOM notifiers, from the OOM reaper).

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
 include/linux/oom.h | 16 ++++++++++++++++
 mm/oom_kill.c       |  9 ++++++++-
 mm/page_alloc.c     |  5 +++++
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/include/linux/oom.h b/include/linux/oom.h
index d079920..8544c23 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -56,6 +56,22 @@ struct oom_control {
 
 extern struct mutex oom_lock;
 
+static inline void oom_reclaim_acquire(gfp_t gfp_mask, unsigned int order)
+{
+	if ((gfp_mask & __GFP_DIRECT_RECLAIM) && !(gfp_mask & __GFP_NORETRY) &&
+	    (!(gfp_mask & __GFP_RETRY_MAYFAIL) ||
+	     order <= PAGE_ALLOC_COSTLY_ORDER))
+		mutex_acquire(&oom_lock.dep_map, 0, 0, _THIS_IP_);
+}
+
+static inline void oom_reclaim_release(gfp_t gfp_mask, unsigned int order)
+{
+	if ((gfp_mask & __GFP_DIRECT_RECLAIM) && !(gfp_mask & __GFP_NORETRY) &&
+	    (!(gfp_mask & __GFP_RETRY_MAYFAIL) ||
+	     order <= PAGE_ALLOC_COSTLY_ORDER))
+		mutex_release(&oom_lock.dep_map, 1, _THIS_IP_);
+}
+
 static inline void set_current_oom_origin(void)
 {
 	current->signal->oom_flag_origin = true;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 3a24848..11be7da 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -513,6 +513,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
 	 */
 	set_bit(MMF_UNSTABLE, &mm->flags);
 
+	oom_reclaim_acquire(GFP_KERNEL, 0);
 	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
 		if (!can_madv_dontneed_vma(vma))
 			continue;
@@ -544,6 +545,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
 			tlb_finish_mmu(&tlb, range.start, range.end);
 		}
 	}
+	oom_reclaim_release(GFP_KERNEL, 0);
 
 	return ret;
 }
@@ -1120,8 +1122,13 @@ void pagefault_out_of_memory(void)
 	if (mem_cgroup_oom_synchronize(true))
 		return;
 
-	if (!mutex_trylock(&oom_lock))
+	if (!mutex_trylock(&oom_lock)) {
+		oom_reclaim_acquire(GFP_KERNEL, 0);
+		oom_reclaim_release(GFP_KERNEL, 0);
 		return;
+	}
+	oom_reclaim_release(GFP_KERNEL, 0);
+	oom_reclaim_acquire(GFP_KERNEL, 0);
 	out_of_memory(&oc);
 	mutex_unlock(&oom_lock);
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6d0fa5b..e8853a19 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3793,6 +3793,8 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
 		schedule_timeout_uninterruptible(1);
 		return NULL;
 	}
+	oom_reclaim_release(gfp_mask, order);
+	oom_reclaim_acquire(gfp_mask, order);
 
 	/*
 	 * Go through the zonelist yet one more time, keep very high watermark
@@ -4651,6 +4653,9 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
 	fs_reclaim_acquire(gfp_mask);
 	fs_reclaim_release(gfp_mask);
 
+	oom_reclaim_acquire(gfp_mask, order);
+	oom_reclaim_release(gfp_mask, order);
+
 	might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
 
 	if (should_fail_alloc_page(gfp_mask, order))
Michal Hocko March 11, 2019, 10:30 a.m. UTC | #8
On Sat 09-03-19 15:02:22, Tetsuo Handa wrote:
> On 2019/03/08 20:58, Michal Hocko wrote:
> > OK, that makes sense to me. I cannot judge the implementation because I
> > am not really familiar with lockdep machinery. Could you explain how it
> > doesn't trigger for all other allocations?
> 
> This is same with why fs_reclaim_acquire()/fs_reclaim_release() doesn't trigger
> for all other allocations. Any allocation request which might involve __GFP_FS
> reclaim passes "struct lockdep_map __fs_reclaim_map", and lockdep records it.

Yes, exactly. NOFS handling made me go and ask. It uses some nasty hacks
on a dedicated lockdep_map and it was not really clear to me whether
reusing oom_lock's one is always safe.

> > Also why it is not sufficient to add the lockdep annotation prior to the
> > trylock in __alloc_pages_may_oom?
> 
> This is same with why fs_reclaim_acquire()/fs_reclaim_release() is called from
> prepare_alloc_pages(). If an allocation request which might involve __GFP_FS
> __perform_reclaim() succeeded before actually calling __perform_reclaim(), we
> fail to pass "struct lockdep_map __fs_reclaim_map" (which makes it difficult to
> check whether there is possibility of deadlock). Likewise, if an allocation
> request which might call __alloc_pages_may_oom() succeeded before actually
> calling __alloc_pages_may_oom(), we fail to pass oom_lock.lockdep_map (which
> makes it difficult to check whether there is possibility of deadlock).

OK, makes sense to me.

> Strictly speaking, there is
> 
> 	if (tsk_is_oom_victim(current) &&
> 	    (alloc_flags == ALLOC_OOM ||
> 	     (gfp_mask & __GFP_NOMEMALLOC)))
> 		goto nopage;
> 
> case where failing to hold oom_lock at __alloc_pages_may_oom() does not
> cause a problem. But I think that we should not check tsk_is_oom_victim()
> at prepare_alloc_pages().

Yes, I would just prefer the check to be as simple as possible. There
shouldn't be any real reason to put all those conditions in and it would
increase the maintenance burden because anytime we reconsider those oom
rules this would have to be in sync. If the allocation is sleepable then
we should warn because such an allocation is dubious at best.

> > It would be also great to pull it out of the code flow and hide it
> > behind a helper static inline. Something like
> > lockdep_track_oom_alloc_reentrant or a like.
> 
> OK. Here is v2 patch.
> 
> >From ec8d0accf15b4566c065ca8c63a4e1185f0a0c78 Mon Sep 17 00:00:00 2001
> From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> Date: Sat, 9 Mar 2019 09:55:08 +0900
> Subject: [PATCH v2] mm,oom: Teach lockdep about oom_lock.
> 
> Since a thread which succeeded to hold oom_lock must not involve blocking
> memory allocations, teach lockdep to consider that blocking memory
> allocations might wait for oom_lock at as early location as possible, and
> teach lockdep to consider that oom_lock is held by mutex_lock() than by
> mutex_trylock().

This is still really hard to understand. Especially the last part of the
sentence. The lockdep will know that the lock is held even when going
via trylock. I guess you meant to say that
	mutex_lock(oom_lock)
	  allocation
	    mutex_trylock(oom_lock)
is not caught by the lockdep, right?
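
Concretely, the nesting being described would look something like the following
hypothetical fragment (not code from the kernel tree; buf and size are just
placeholders):

	mutex_lock(&oom_lock);			/* some path taking oom_lock outright */
	buf = kmalloc(size, GFP_KERNEL);	/* nested blocking allocation */
	/*
	 * Under memory pressure the kmalloc() ends up in
	 * __alloc_pages_may_oom(), where mutex_trylock(&oom_lock) fails and
	 * the allocation just sleeps and retries -- forever, because we are
	 * the ones holding oom_lock.  Stock lockdep stays silent here: a
	 * trylock cannot block, so no oom_lock -> oom_lock dependency is ever
	 * recorded.  The dummy non-trylock acquire/release added by the patch
	 * in the allocation path is what turns this into a reportable
	 * recursive-locking dependency.
	 */
	mutex_unlock(&oom_lock);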

> Also, since the OOM killer is disabled until the OOM reaper or exit_mmap()
> sets MMF_OOM_SKIP, teach lockdep to consider that oom_lock is held when
> __oom_reap_task_mm() is called.

It would be good to mention that the oom reaper acts as a guarantee of a
forward progress and as such it cannot depend on any memory allocation
and that is why this context is marked. This would be easier to
understand IMHO.

> This patch should not cause lockdep splats unless there is somebody doing
> dangerous things (e.g. from OOM notifiers, from the OOM reaper).
> 
> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> ---
>  include/linux/oom.h | 16 ++++++++++++++++
>  mm/oom_kill.c       |  9 ++++++++-
>  mm/page_alloc.c     |  5 +++++
>  3 files changed, 29 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/oom.h b/include/linux/oom.h
> index d079920..8544c23 100644
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -56,6 +56,22 @@ struct oom_control {
>  
>  extern struct mutex oom_lock;
>  
> +static inline void oom_reclaim_acquire(gfp_t gfp_mask, unsigned int order)
> +{
> +	if ((gfp_mask & __GFP_DIRECT_RECLAIM) && !(gfp_mask & __GFP_NORETRY) &&
> +	    (!(gfp_mask & __GFP_RETRY_MAYFAIL) ||
> +	     order <= PAGE_ALLOC_COSTLY_ORDER))
> +		mutex_acquire(&oom_lock.dep_map, 0, 0, _THIS_IP_);
> +}

That being said, why not simply make this

	if (gfp_mask & __GFP_DIRECT_RECLAIM)
		mutex_acquire(....)
Peter Zijlstra March 12, 2019, 8:24 a.m. UTC | #9
On Sat, Mar 09, 2019 at 03:02:22PM +0900, Tetsuo Handa wrote:
> @@ -1120,8 +1122,13 @@ void pagefault_out_of_memory(void)
>  	if (mem_cgroup_oom_synchronize(true))
>  		return;
>  
> -	if (!mutex_trylock(&oom_lock))
> +	if (!mutex_trylock(&oom_lock)) {
		/*
		 * Explain why we still need this on the fail back.
		 */
> +		oom_reclaim_acquire(GFP_KERNEL, 0);
> +		oom_reclaim_release(GFP_KERNEL, 0);
>  		return;
> +	}

	/*
	 * This changes the try-lock to a regular lock; because .....
	 * text goes here.
	 */

> +	oom_reclaim_release(GFP_KERNEL, 0);
> +	oom_reclaim_acquire(GFP_KERNEL, 0);
>  	out_of_memory(&oc);
>  	mutex_unlock(&oom_lock);
>  }
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 6d0fa5b..e8853a19 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3793,6 +3793,8 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
>  		schedule_timeout_uninterruptible(1);
>  		return NULL;
>  	}

idem

> +	oom_reclaim_release(gfp_mask, order);
> +	oom_reclaim_acquire(gfp_mask, order);
>  
>  	/*
>  	 * Go through the zonelist yet one more time, keep very high watermark

these things might seem 'obvious' now, but I'm sure that the next time I
see this I'll go WTF?!
Tetsuo Handa March 12, 2019, 2:06 p.m. UTC | #10
On 2019/03/11 19:30, Michal Hocko wrote:
> On Sat 09-03-19 15:02:22, Tetsuo Handa wrote:
>> Since a thread which succeeded to hold oom_lock must not involve blocking
>> memory allocations, teach lockdep to consider that blocking memory
>> allocations might wait for oom_lock at as early location as possible, and
>> teach lockdep to consider that oom_lock is held by mutex_lock() than by
>> mutex_trylock().
> 
> This is still really hard to understand. Especially the last part of the
> sentence. The lockdep will know that the lock is held even when going
> via trylock. I guess you meant to say that
> 	mutex_lock(oom_lock)
> 	  allocation
> 	    mutex_trylock(oom_lock)
> is not caught by the lockdep, right?

Right.

> 
>> Also, since the OOM killer is disabled until the OOM reaper or exit_mmap()
>> sets MMF_OOM_SKIP, teach lockdep to consider that oom_lock is held when
>> __oom_reap_task_mm() is called.
> 
> It would be good to mention that the oom reaper acts as a guarantee of a
> forward progress and as such it cannot depend on any memory allocation
> and that is why this context is marked. This would be easier to
> understand IMHO.

OK. Here is the v3 patch.

From 250bbe28bc3e9946992d960bb90a351a896a543b Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Tue, 12 Mar 2019 22:58:41 +0900
Subject: [PATCH v3] mm,oom: Teach lockdep about oom_lock.

Since a thread which has succeeded in holding oom_lock must not involve blocking
memory allocations, teach lockdep to consider that blocking memory
allocations might wait for oom_lock at as early a location as possible.

Lockdep can't detect the possibility of deadlock when mutex_trylock(&oom_lock)
fails, for we assume that somebody else is still able to make forward
progress. Thus, teach lockdep to treat mutex_trylock(&oom_lock) as
mutex_lock(&oom_lock).

Since the OOM killer is disabled when __oom_reap_task_mm() is in progress,
a thread which is calling __oom_reap_task_mm() must not involve blocking
memory allocations. Thus, teach lockdep about that.

This patch should not cause lockdep splats unless there is somebody doing
dangerous things (e.g. from OOM notifiers, from the OOM reaper).

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
 include/linux/oom.h | 12 ++++++++++++
 mm/oom_kill.c       | 28 +++++++++++++++++++++++++++-
 mm/page_alloc.c     | 16 ++++++++++++++++
 3 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/include/linux/oom.h b/include/linux/oom.h
index d079920..04aa46b 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -56,6 +56,18 @@ struct oom_control {
 
 extern struct mutex oom_lock;
 
+static inline void oom_reclaim_acquire(gfp_t gfp_mask)
+{
+	if (gfp_mask & __GFP_DIRECT_RECLAIM)
+		mutex_acquire(&oom_lock.dep_map, 0, 0, _THIS_IP_);
+}
+
+static inline void oom_reclaim_release(gfp_t gfp_mask)
+{
+	if (gfp_mask & __GFP_DIRECT_RECLAIM)
+		mutex_release(&oom_lock.dep_map, 1, _THIS_IP_);
+}
+
 static inline void set_current_oom_origin(void)
 {
 	current->signal->oom_flag_origin = true;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 3a24848..6f53bb6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -513,6 +513,14 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
 	 */
 	set_bit(MMF_UNSTABLE, &mm->flags);
 
+	/*
+	 * Since this function acts as a guarantee of a forward progress,
+	 * current thread is not allowed to involve (even indirectly via
+	 * dependency) __GFP_DIRECT_RECLAIM && !__GFP_NORETRY allocation from
+	 * this function, for such allocation will have to wait for this
+	 * function to complete when __alloc_pages_may_oom() is called.
+	 */
+	oom_reclaim_acquire(GFP_KERNEL);
 	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
 		if (!can_madv_dontneed_vma(vma))
 			continue;
@@ -544,6 +552,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
 			tlb_finish_mmu(&tlb, range.start, range.end);
 		}
 	}
+	oom_reclaim_release(GFP_KERNEL);
 
 	return ret;
 }
@@ -1120,8 +1129,25 @@ void pagefault_out_of_memory(void)
 	if (mem_cgroup_oom_synchronize(true))
 		return;
 
-	if (!mutex_trylock(&oom_lock))
+	if (!mutex_trylock(&oom_lock)) {
+		/*
+		 * This corresponds to prepare_alloc_pages(). Lockdep will
+		 * complain if e.g. OOM notifier for global OOM by error
+		 * triggered pagefault OOM path.
+		 */
+		oom_reclaim_acquire(GFP_KERNEL);
+		oom_reclaim_release(GFP_KERNEL);
 		return;
+	}
+	/*
+	 * Teach lockdep to consider that current thread is not allowed to
+	 * involve (even indirectly via dependency) __GFP_DIRECT_RECLAIM &&
+	 * !__GFP_NORETRY allocation from this function, for such allocation
+	 * will have to wait for completion of this function when
+	 * __alloc_pages_may_oom() is called.
+	 */
+	oom_reclaim_release(GFP_KERNEL);
+	oom_reclaim_acquire(GFP_KERNEL);
 	out_of_memory(&oc);
 	mutex_unlock(&oom_lock);
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6d0fa5b..c23ae76d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3793,6 +3793,14 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
 		schedule_timeout_uninterruptible(1);
 		return NULL;
 	}
+	/*
+	 * Teach lockdep to consider that current thread is not allowed to
+	 * involve (even indirectly via dependency) __GFP_DIRECT_RECLAIM &&
+	 * !__GFP_NORETRY allocation from this context, for such allocation
+	 * will have to wait for this function to complete.
+	 */
+	oom_reclaim_release(gfp_mask);
+	oom_reclaim_acquire(gfp_mask);
 
 	/*
 	 * Go through the zonelist yet one more time, keep very high watermark
@@ -4651,6 +4659,14 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
 	fs_reclaim_acquire(gfp_mask);
 	fs_reclaim_release(gfp_mask);
 
+	/*
+	 * Since __GFP_DIRECT_RECLAIM && !__GFP_NORETRY allocation might call
+	 * __alloc_pages_may_oom(), teach lockdep to record that current thread
+	 * might forever retry until holding oom_lock succeeds.
+	 */
+	oom_reclaim_acquire(gfp_mask);
+	oom_reclaim_release(gfp_mask);
+
 	might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
 
 	if (should_fail_alloc_page(gfp_mask, order))
Michal Hocko March 12, 2019, 3:31 p.m. UTC | #11
On Tue 12-03-19 23:06:33, Tetsuo Handa wrote:
[...]
> >From 250bbe28bc3e9946992d960bb90a351a896a543b Mon Sep 17 00:00:00 2001
> From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> Date: Tue, 12 Mar 2019 22:58:41 +0900
> Subject: [PATCH v3] mm,oom: Teach lockdep about oom_lock.
> 
> Since a thread which succeeded to hold oom_lock must not involve blocking
> memory allocations, teach lockdep to consider that blocking memory
> allocations might wait for oom_lock at as early location as possible.
>
> Lockdep can't detect possibility of deadlock when mutex_trylock(&oom_lock)
> failed, for we assume that somebody else is still able to make a forward
> progress. Thus, teach lockdep to consider that mutex_trylock(&oom_lock) as
> mutex_lock(&oom_lock).
> 
> Since the OOM killer is disabled when __oom_reap_task_mm() is in progress,
> a thread which is calling __oom_reap_task_mm() must not involve blocking
> memory allocations. Thus, teach lockdep about that.
> 
> This patch should not cause lockdep splats unless there is somebody doing
> dangerous things (e.g. from OOM notifiers, from the OOM reaper).

This is obviously subjective but I find this still really hard to
understand.  What about
"
The OOM killer path which holds oom_lock is not allowed to invoke any
blocking allocation, because that might lead to a deadlock: any forward
progress is blocked, since __alloc_pages_may_oom() always bails out on the
trylock and the current lockdep implementation doesn't recognize that as a
potential deadlock. Teach the lockdep infrastructure about this dependency
and warn as early in the allocation path as possible.

The implementation basically mimics the GFP_NOFS/GFP_NOIO lockdep
implementation, except that the oom_lock dependency map is used directly.

Please note that the oom_reaper guarantees forward progress in case the
oom victim cannot exit on its own, and as such it cannot depend on any
blocking allocation either. Therefore mark its execution as if it were
holding the oom_lock as well.
"
> 
> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> ---
>  include/linux/oom.h | 12 ++++++++++++
>  mm/oom_kill.c       | 28 +++++++++++++++++++++++++++-
>  mm/page_alloc.c     | 16 ++++++++++++++++
>  3 files changed, 55 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/oom.h b/include/linux/oom.h
> index d079920..04aa46b 100644
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -56,6 +56,18 @@ struct oom_control {
>  
>  extern struct mutex oom_lock;
>  
> +static inline void oom_reclaim_acquire(gfp_t gfp_mask)
> +{
> +	if (gfp_mask & __GFP_DIRECT_RECLAIM)
> +		mutex_acquire(&oom_lock.dep_map, 0, 0, _THIS_IP_);
> +}
> +
> +static inline void oom_reclaim_release(gfp_t gfp_mask)
> +{
> +	if (gfp_mask & __GFP_DIRECT_RECLAIM)
> +		mutex_release(&oom_lock.dep_map, 1, _THIS_IP_);
> +}
> +
>  static inline void set_current_oom_origin(void)
>  {
>  	current->signal->oom_flag_origin = true;
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index 3a24848..6f53bb6 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -513,6 +513,14 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
>  	 */
>  	set_bit(MMF_UNSTABLE, &mm->flags);
>  
> +	/*
> +	 * Since this function acts as a guarantee of a forward progress,
> +	 * current thread is not allowed to involve (even indirectly via
> +	 * dependency) __GFP_DIRECT_RECLAIM && !__GFP_NORETRY allocation from

As already pointed out in the previous email, the less specific about
gfp flags you are the better longterm. I would stick with a "blocking
allocation"

> +	 * this function, for such allocation will have to wait for this
> +	 * function to complete when __alloc_pages_may_oom() is called.
> +	 */
> +	oom_reclaim_acquire(GFP_KERNEL);
>  	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
>  		if (!can_madv_dontneed_vma(vma))
>  			continue;
> @@ -544,6 +552,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
>  			tlb_finish_mmu(&tlb, range.start, range.end);
>  		}
>  	}
> +	oom_reclaim_release(GFP_KERNEL);
>  
>  	return ret;
>  }
> @@ -1120,8 +1129,25 @@ void pagefault_out_of_memory(void)
>  	if (mem_cgroup_oom_synchronize(true))
>  		return;
>  
> -	if (!mutex_trylock(&oom_lock))
> +	if (!mutex_trylock(&oom_lock)) {
> +		/*
> +		 * This corresponds to prepare_alloc_pages(). Lockdep will
> +		 * complain if e.g. OOM notifier for global OOM by error
> +		 * triggered pagefault OOM path.
> +		 */
> +		oom_reclaim_acquire(GFP_KERNEL);
> +		oom_reclaim_release(GFP_KERNEL);
>  		return;
> +	}
> +	/*
> +	 * Teach lockdep to consider that current thread is not allowed to
> +	 * involve (even indirectly via dependency) __GFP_DIRECT_RECLAIM &&
> +	 * !__GFP_NORETRY allocation from this function, for such allocation
> +	 * will have to wait for completion of this function when
> +	 * __alloc_pages_may_oom() is called.
> +	 */
> +	oom_reclaim_release(GFP_KERNEL);
> +	oom_reclaim_acquire(GFP_KERNEL);

This part is not really clear to me. Why do you release&acquire when
mutex_trylock just acquire the lock? If this is really needed then this
should be put into the comment.

>  	out_of_memory(&oc);
>  	mutex_unlock(&oom_lock);
>  }
Tetsuo Handa March 14, 2019, 1:55 p.m. UTC | #12
On 2019/03/13 0:31, Michal Hocko wrote:
>> @@ -1120,8 +1129,25 @@ void pagefault_out_of_memory(void)
>>  	if (mem_cgroup_oom_synchronize(true))
>>  		return;
>>  
>> -	if (!mutex_trylock(&oom_lock))
>> +	if (!mutex_trylock(&oom_lock)) {
>> +		/*
>> +		 * This corresponds to prepare_alloc_pages(). Lockdep will
>> +		 * complain if e.g. OOM notifier for global OOM by error
>> +		 * triggered pagefault OOM path.
>> +		 */
>> +		oom_reclaim_acquire(GFP_KERNEL);
>> +		oom_reclaim_release(GFP_KERNEL);
>>  		return;
>> +	}
>> +	/*
>> +	 * Teach lockdep to consider that current thread is not allowed to
>> +	 * involve (even indirectly via dependency) __GFP_DIRECT_RECLAIM &&
>> +	 * !__GFP_NORETRY allocation from this function, for such allocation
>> +	 * will have to wait for completion of this function when
>> +	 * __alloc_pages_may_oom() is called.
>> +	 */
>> +	oom_reclaim_release(GFP_KERNEL);
>> +	oom_reclaim_acquire(GFP_KERNEL);
> 
> This part is not really clear to me. Why do you release&acquire when
> mutex_trylock just acquire the lock? If this is really needed then this
> should be put into the comment.

I think there is a reason lockdep needs to distinguish trylock and lock.
I don't know how lockdep utilizes "trylock or lock" information upon validation, but
explicitly telling lockdep that "oom_lock acts as if held by lock" should do no harm.

#define mutex_acquire(l, s, t, i)               lock_acquire_exclusive(l, s, t, NULL, i)
#define lock_acquire_exclusive(l, s, t, n, i)           lock_acquire(l, s, t, 0, 1, n, i)
void lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, struct lockdep_map *nest_lock, unsigned long ip);
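
Reading those macros, the effective difference is the trylock argument to
lock_acquire(); the summary below is my understanding of the intent, not
authoritative lockdep documentation:

	/*
	 * mutex_lock()    -> lock_acquire(..., trylock = 0, ...): a potentially
	 *                    blocking acquisition; lockdep adds it to the
	 *                    dependency chains and validates lock ordering.
	 * mutex_trylock() -> lock_acquire(..., trylock = 1, ...): the lock is
	 *                    recorded as held, but since a trylock can never
	 *                    block, lockdep skips the dependency validation.
	 *
	 * Hence the swap after a successful trylock in the patch:
	 */
	if (mutex_trylock(&oom_lock)) {
		mutex_release(&oom_lock.dep_map, 1, _THIS_IP_);    /* drop the trylock annotation */
		mutex_acquire(&oom_lock.dep_map, 0, 0, _THIS_IP_); /* re-annotate as mutex_lock()-style */
		/* ... out_of_memory() etc. ... */
		mutex_unlock(&oom_lock);
	}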

> 
>>  	out_of_memory(&oc);
>>  	mutex_unlock(&oom_lock);
>>  }
>

Patch

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 3a24848..759aa4e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -513,6 +513,7 @@  bool __oom_reap_task_mm(struct mm_struct *mm)
 	 */
 	set_bit(MMF_UNSTABLE, &mm->flags);
 
+	mutex_acquire(&oom_lock.dep_map, 0, 0, _THIS_IP_);
 	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
 		if (!can_madv_dontneed_vma(vma))
 			continue;
@@ -544,6 +545,7 @@  bool __oom_reap_task_mm(struct mm_struct *mm)
 			tlb_finish_mmu(&tlb, range.start, range.end);
 		}
 	}
+	mutex_release(&oom_lock.dep_map, 1, _THIS_IP_);
 
 	return ret;
 }
@@ -1120,8 +1122,13 @@  void pagefault_out_of_memory(void)
 	if (mem_cgroup_oom_synchronize(true))
 		return;
 
-	if (!mutex_trylock(&oom_lock))
+	if (!mutex_trylock(&oom_lock)) {
+		mutex_acquire(&oom_lock.dep_map, 0, 0, _THIS_IP_);
+		mutex_release(&oom_lock.dep_map, 1, _THIS_IP_);
 		return;
+	}
+	mutex_release(&oom_lock.dep_map, 1, _THIS_IP_);
+	mutex_acquire(&oom_lock.dep_map, 0, 0, _THIS_IP_);
 	out_of_memory(&oc);
 	mutex_unlock(&oom_lock);
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6d0fa5b..25533214 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3793,6 +3793,8 @@  void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
 		schedule_timeout_uninterruptible(1);
 		return NULL;
 	}
+	mutex_release(&oom_lock.dep_map, 1, _THIS_IP_);
+	mutex_acquire(&oom_lock.dep_map, 0, 0, _THIS_IP_);
 
 	/*
 	 * Go through the zonelist yet one more time, keep very high watermark
@@ -4651,6 +4653,17 @@  static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
 	fs_reclaim_acquire(gfp_mask);
 	fs_reclaim_release(gfp_mask);
 
+	/*
+	 * Allocation requests which can call __alloc_pages_may_oom() might
+	 * fail to bail out due to waiting for oom_lock.
+	 */
+	if ((gfp_mask & __GFP_DIRECT_RECLAIM) && !(gfp_mask & __GFP_NORETRY) &&
+	    (!(gfp_mask & __GFP_RETRY_MAYFAIL) ||
+	     order <= PAGE_ALLOC_COSTLY_ORDER)) {
+		mutex_acquire(&oom_lock.dep_map, 0, 0, _THIS_IP_);
+		mutex_release(&oom_lock.dep_map, 1, _THIS_IP_);
+	}
+
 	might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
 
 	if (should_fail_alloc_page(gfp_mask, order))