diff mbox series

[RESEND,v2,2/2] mm: delete unused MMF_OOM_VICTIM flag

Message ID 20220531223100.510392-2-surenb@google.com (mailing list archive)
State Accepted
Commit b3541d912a84dc40cabb516f2deeac9ae6fa30da
Headers show
Series [RESEND,v2,1/2] mm: drop oom code from exit_mmap | expand

Commit Message

Suren Baghdasaryan May 31, 2022, 10:31 p.m. UTC
With the last usage of MMF_OOM_VICTIM in exit_mmap gone, this flag is
now unused and can be removed.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
---
 include/linux/oom.h            | 9 ---------
 include/linux/sched/coredump.h | 7 +++----
 mm/oom_kill.c                  | 4 +---
 3 files changed, 4 insertions(+), 16 deletions(-)

Comments

Andrew Morton Aug. 22, 2022, 10:21 p.m. UTC | #1
On Tue, 31 May 2022 15:31:00 -0700 Suren Baghdasaryan <surenb@google.com> wrote:

> With the last usage of MMF_OOM_VICTIM in exit_mmap gone, this flag is
> now unused and can be removed.
> 
> ...
>
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -77,15 +77,6 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk)
>  	return tsk->signal->oom_mm;
>  }
>  
> -/*
> - * Use this helper if tsk->mm != mm and the victim mm needs a special
> - * handling. This is guaranteed to stay true after once set.
> - */
> -static inline bool mm_is_oom_victim(struct mm_struct *mm)
> -{
> -	return test_bit(MMF_OOM_VICTIM, &mm->flags);
> -}
> -

The patch "mm: multi-gen LRU: support page table walks" from the MGLRU
series
(https://lkml.kernel.org/r/20220815071332.627393-9-yuzhao@google.com)
adds two calls to mm_is_oom_victim(), so my build broke.

I assume the fix is simply

--- a/mm/vmscan.c~mm-delete-unused-mmf_oom_victim-flag-fix
+++ a/mm/vmscan.c
@@ -3429,9 +3429,6 @@ static bool should_skip_mm(struct mm_str
 	if (size < MIN_LRU_BATCH)
 		return true;
 
-	if (mm_is_oom_victim(mm))
-		return true;
-
 	return !mmget_not_zero(mm);
 }
 
@@ -4127,9 +4124,6 @@ restart:
 
 		walk_pmd_range(&val, addr, next, args);
 
-		if (mm_is_oom_victim(args->mm))
-			return 1;
-
 		/* a racy check to curtail the waiting time */
 		if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
 			return 1;
Yu Zhao Aug. 22, 2022, 10:33 p.m. UTC | #2
On Mon, Aug 22, 2022 at 4:21 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Tue, 31 May 2022 15:31:00 -0700 Suren Baghdasaryan <surenb@google.com> wrote:
>
> > With the last usage of MMF_OOM_VICTIM in exit_mmap gone, this flag is
> > now unused and can be removed.
> >
> > ...
> >
> > --- a/include/linux/oom.h
> > +++ b/include/linux/oom.h
> > @@ -77,15 +77,6 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk)
> >       return tsk->signal->oom_mm;
> >  }
> >
> > -/*
> > - * Use this helper if tsk->mm != mm and the victim mm needs a special
> > - * handling. This is guaranteed to stay true after once set.
> > - */
> > -static inline bool mm_is_oom_victim(struct mm_struct *mm)
> > -{
> > -     return test_bit(MMF_OOM_VICTIM, &mm->flags);
> > -}
> > -
>
> The patch "mm: multi-gen LRU: support page table walks" from the MGLRU
> series
> (https://lkml.kernel.org/r/20220815071332.627393-9-yuzhao@google.com)
> adds two calls to mm_is_oom_victim(), so my build broke.
>
> I assume the fix is simply
>
> --- a/mm/vmscan.c~mm-delete-unused-mmf_oom_victim-flag-fix
> +++ a/mm/vmscan.c
> @@ -3429,9 +3429,6 @@ static bool should_skip_mm(struct mm_str
>         if (size < MIN_LRU_BATCH)
>                 return true;
>
> -       if (mm_is_oom_victim(mm))
> -               return true;
> -
>         return !mmget_not_zero(mm);
>  }
>
> @@ -4127,9 +4124,6 @@ restart:
>
>                 walk_pmd_range(&val, addr, next, args);
>
> -               if (mm_is_oom_victim(args->mm))
> -                       return 1;
> -
>                 /* a racy check to curtail the waiting time */
>                 if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
>                         return 1;
> _
>
> Please confirm?

LGTM.  The deleted checks are not about correctness.

I've queued

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3402,7 +3402,7 @@ static bool should_skip_mm(struct mm_struct *mm,
struct lru_gen_mm_walk *walk)
        if (size < MIN_LRU_BATCH)
                return true;

-       if (mm_is_oom_victim(mm))
+       if (test_bit(MMF_OOM_REAP_QUEUED, &mm->flags))
                return true;

        return !mmget_not_zero(mm);
@@ -4109,7 +4109,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned
long start, unsigned long end,

                walk_pmd_range(&val, addr, next, args);

-               if (mm_is_oom_victim(args->mm))
+               if (test_bit(MMF_OOM_REAP_QUEUED, &args->mm->flags))
                        return 1;

                /* a racy check to curtail the waiting time */
Andrew Morton Aug. 22, 2022, 10:48 p.m. UTC | #3
On Mon, 22 Aug 2022 16:33:51 -0600 Yu Zhao <yuzhao@google.com> wrote:

> > --- a/mm/vmscan.c~mm-delete-unused-mmf_oom_victim-flag-fix
> > +++ a/mm/vmscan.c
> > @@ -3429,9 +3429,6 @@ static bool should_skip_mm(struct mm_str
> >         if (size < MIN_LRU_BATCH)
> >                 return true;
> >
> > -       if (mm_is_oom_victim(mm))
> > -               return true;
> > -
> >         return !mmget_not_zero(mm);
> >  }
> >
> > @@ -4127,9 +4124,6 @@ restart:
> >
> >                 walk_pmd_range(&val, addr, next, args);
> >
> > -               if (mm_is_oom_victim(args->mm))
> > -                       return 1;
> > -
> >                 /* a racy check to curtail the waiting time */
> >                 if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
> >                         return 1;
> > _
> >
> > Please confirm?
> 
> LGTM.  The deleted checks are not about correctness.

OK, for now.

> I've queued
> 
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -3402,7 +3402,7 @@ static bool should_skip_mm(struct mm_struct *mm,
> struct lru_gen_mm_walk *walk)
>         if (size < MIN_LRU_BATCH)
>                 return true;
> 
> -       if (mm_is_oom_victim(mm))
> +       if (test_bit(MMF_OOM_REAP_QUEUED, &mm->flags))
>                 return true;
> 
>         return !mmget_not_zero(mm);
> @@ -4109,7 +4109,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned
> long start, unsigned long end,
> 
>                 walk_pmd_range(&val, addr, next, args);
> 
> -               if (mm_is_oom_victim(args->mm))
> +               if (test_bit(MMF_OOM_REAP_QUEUED, &args->mm->flags))
>                         return 1;
> 
>                 /* a racy check to curtail the waiting time */

Oh.  Why?  What does this change do?
Yu Zhao Aug. 22, 2022, 10:59 p.m. UTC | #4
On Mon, Aug 22, 2022 at 4:48 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Mon, 22 Aug 2022 16:33:51 -0600 Yu Zhao <yuzhao@google.com> wrote:
>
> > > --- a/mm/vmscan.c~mm-delete-unused-mmf_oom_victim-flag-fix
> > > +++ a/mm/vmscan.c
> > > @@ -3429,9 +3429,6 @@ static bool should_skip_mm(struct mm_str
> > >         if (size < MIN_LRU_BATCH)
> > >                 return true;
> > >
> > > -       if (mm_is_oom_victim(mm))
> > > -               return true;
> > > -
> > >         return !mmget_not_zero(mm);
> > >  }
> > >
> > > @@ -4127,9 +4124,6 @@ restart:
> > >
> > >                 walk_pmd_range(&val, addr, next, args);
> > >
> > > -               if (mm_is_oom_victim(args->mm))
> > > -                       return 1;
> > > -
> > >                 /* a racy check to curtail the waiting time */
> > >                 if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
> > >                         return 1;
> > > _
> > >
> > > Please confirm?
> >
> > LGTM.  The deleted checks are not about correctness.
>
> OK, for now.
>
> > I've queued
> >
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -3402,7 +3402,7 @@ static bool should_skip_mm(struct mm_struct *mm,
> > struct lru_gen_mm_walk *walk)
> >         if (size < MIN_LRU_BATCH)
> >                 return true;
> >
> > -       if (mm_is_oom_victim(mm))
> > +       if (test_bit(MMF_OOM_REAP_QUEUED, &mm->flags))
> >                 return true;
> >
> >         return !mmget_not_zero(mm);
> > @@ -4109,7 +4109,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned
> > long start, unsigned long end,
> >
> >                 walk_pmd_range(&val, addr, next, args);
> >
> > -               if (mm_is_oom_victim(args->mm))
> > +               if (test_bit(MMF_OOM_REAP_QUEUED, &args->mm->flags))
> >                         return 1;
> >
> >                 /* a racy check to curtail the waiting time */
>
> Oh.  Why?  What does this change do?

The MMF_OOM_REAP_QUEUED flag is similar to the deleted MMF_OOM_VICTIM
flag, but it's set at a later stage during an OOM kill.

When either is set, the OOM reaper is probably already freeing the
memory of this mm_struct, or at least it's going to. So there is no
need to dwell on it in the reclaim path, hence not about correctness.
Andrew Morton Aug. 22, 2022, 11:16 p.m. UTC | #5
On Mon, 22 Aug 2022 16:59:29 -0600 Yu Zhao <yuzhao@google.com> wrote:

> > > @@ -4109,7 +4109,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned
> > > long start, unsigned long end,
> > >
> > >                 walk_pmd_range(&val, addr, next, args);
> > >
> > > -               if (mm_is_oom_victim(args->mm))
> > > +               if (test_bit(MMF_OOM_REAP_QUEUED, &args->mm->flags))
> > >                         return 1;
> > >
> > >                 /* a racy check to curtail the waiting time */
> >
> > Oh.  Why?  What does this change do?
> 
> The MMF_OOM_REAP_QUEUED flag is similar to the deleted MMF_OOM_VICTIM
> flag, but it's set at a later stage during an OOM kill.
> 
> When either is set, the OOM reaper is probably already freeing the
> memory of this mm_struct, or at least it's going to. So there is no
> need to dwell on it in the reclaim path, hence not about correctness.

Thanks.  That sounds worthy of some code comments?
Yu Zhao Aug. 22, 2022, 11:20 p.m. UTC | #6
On Mon, Aug 22, 2022 at 5:16 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Mon, 22 Aug 2022 16:59:29 -0600 Yu Zhao <yuzhao@google.com> wrote:
>
> > > > @@ -4109,7 +4109,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned
> > > > long start, unsigned long end,
> > > >
> > > >                 walk_pmd_range(&val, addr, next, args);
> > > >
> > > > -               if (mm_is_oom_victim(args->mm))
> > > > +               if (test_bit(MMF_OOM_REAP_QUEUED, &args->mm->flags))
> > > >                         return 1;
> > > >
> > > >                 /* a racy check to curtail the waiting time */
> > >
> > > Oh.  Why?  What does this change do?
> >
> > The MMF_OOM_REAP_QUEUED flag is similar to the deleted MMF_OOM_VICTIM
> > flag, but it's set at a later stage during an OOM kill.
> >
> > When either is set, the OOM reaper is probably already freeing the
> > memory of this mm_struct, or at least it's going to. So there is no
> > need to dwell on it in the reclaim path, hence not about correctness.
>
> Thanks.  That sounds worthy of some code comments?

Will do. Thanks.
Michal Hocko Aug. 23, 2022, 8:36 a.m. UTC | #7
On Mon 22-08-22 17:20:17, Yu Zhao wrote:
> On Mon, Aug 22, 2022 at 5:16 PM Andrew Morton <akpm@linux-foundation.org> wrote:
> >
> > On Mon, 22 Aug 2022 16:59:29 -0600 Yu Zhao <yuzhao@google.com> wrote:
> >
> > > > > @@ -4109,7 +4109,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned
> > > > > long start, unsigned long end,
> > > > >
> > > > >                 walk_pmd_range(&val, addr, next, args);
> > > > >
> > > > > -               if (mm_is_oom_victim(args->mm))
> > > > > +               if (test_bit(MMF_OOM_REAP_QUEUED, &args->mm->flags))
> > > > >                         return 1;
> > > > >
> > > > >                 /* a racy check to curtail the waiting time */
> > > >
> > > > Oh.  Why?  What does this change do?
> > >
> > > The MMF_OOM_REAP_QUEUED flag is similar to the deleted MMF_OOM_VICTIM
> > > flag, but it's set at a later stage during an OOM kill.
> > >
> > > When either is set, the OOM reaper is probably already freeing the
> > > memory of this mm_struct, or at least it's going to. So there is no
> > > need to dwell on it in the reclaim path, hence not about correctness.
> >
> > Thanks.  That sounds worthy of some code comments?
> 
> Will do. Thanks.

I would rather not see this abuse. You cannot really make any
assumptions about oom_reaper and how quickly it is going to free the
memory. If this is really worth it (and I have to say I doubt it) then
it should be a separate patch with numbers justifying it.
Yu Zhao Aug. 28, 2022, 7:50 p.m. UTC | #8
On Tue, Aug 23, 2022 at 2:36 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Mon 22-08-22 17:20:17, Yu Zhao wrote:
> > On Mon, Aug 22, 2022 at 5:16 PM Andrew Morton <akpm@linux-foundation.org> wrote:
> > >
> > > On Mon, 22 Aug 2022 16:59:29 -0600 Yu Zhao <yuzhao@google.com> wrote:
> > >
> > > > > > @@ -4109,7 +4109,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned
> > > > > > long start, unsigned long end,
> > > > > >
> > > > > >                 walk_pmd_range(&val, addr, next, args);
> > > > > >
> > > > > > -               if (mm_is_oom_victim(args->mm))
> > > > > > +               if (test_bit(MMF_OOM_REAP_QUEUED, &args->mm->flags))
> > > > > >                         return 1;
> > > > > >
> > > > > >                 /* a racy check to curtail the waiting time */
> > > > >
> > > > > Oh.  Why?  What does this change do?
> > > >
> > > > The MMF_OOM_REAP_QUEUED flag is similar to the deleted MMF_OOM_VICTIM
> > > > flag, but it's set at a later stage during an OOM kill.
> > > >
> > > > When either is set, the OOM reaper is probably already freeing the
> > > > memory of this mm_struct, or at least it's going to. So there is no
> > > > need to dwell on it in the reclaim path, hence not about correctness.
> > >
> > > Thanks.  That sounds worthy of some code comments?
> >
> > Will do. Thanks.
>
> I would rather not see this abuse.

I understand where you're coming from, however, I don't share this
POV. I see it as cooperation -- the page reclaim and the oom/reaper
can't (or at least shouldn't) operate in isolation.

> You cannot really make any
> assumptions about oom_reaper and how quickly it is going to free the
> memory.

Agreed. But here we are talking about heuristics, not dependencies on
certain behaviors. Assume we are playing a guessing game: there are
multiple mm_structs available for reclaim, would the oom-killed ones
be more profitable on average? I'd say no, because I assume it's more
likely than unlikely that the oom reaper is doing/to do its work. Note
that the assumption is about likelihood, hence arguably valid.

> If this is really worth it (and I have to say I doubt it) then
> it should be a separate patch with numbers justifying it.

I definitely can artificially create a test case that runs oom a few
times per second, to prove this two-liner is beneficial to that
scenario. Then there is the question how much it would benefit the
real-world scenarios.

I'd recommend keeping this two-liner if we still had
mm_is_oom_victim(), because it's simple, clear and intuitive. With
MMF_OOM_REAP_QUEUED, I don't have a strong opinion. Since you do, I'll
just delete it.
Michal Hocko Aug. 29, 2022, 10:40 a.m. UTC | #9
On Sun 28-08-22 13:50:09, Yu Zhao wrote:
> On Tue, Aug 23, 2022 at 2:36 AM Michal Hocko <mhocko@suse.com> wrote:
[...]
> > You cannot really make any
> > assumptions about oom_reaper and how quickly it is going to free the
> > memory.
> 
> Agreed. But here we are talking about heuristics, not dependencies on
> certain behaviors. Assume we are playing a guessing game: there are
> multiple mm_structs available for reclaim, would the oom-killed ones
> be more profitable on average? I'd say no, because I assume it's more
> likely than unlikely that the oom reaper is doing/to do its work. Note
> that the assumption is about likelihood, hence arguably valid.

Well, my main counter argument would be that we do not really want to
carve last resort mechanism (which the oom reaper is) into any heuristic
because any future changes into that mechanism will be much harder to
justify and change. There is a cost of the maintenance that should be
considered. While you might be right that this change would be
beneficial, there is no actual proof of that. Historically we've had
several examples of such a behavior which was really hard to change
later on because the effect would be really hard to evaluate.
Michal Hocko Aug. 29, 2022, 10:45 a.m. UTC | #10
On Mon 29-08-22 12:40:05, Michal Hocko wrote:
> On Sun 28-08-22 13:50:09, Yu Zhao wrote:
> > On Tue, Aug 23, 2022 at 2:36 AM Michal Hocko <mhocko@suse.com> wrote:
> [...]
> > > You cannot really make any
> > > assumptions about oom_reaper and how quickly it is going to free the
> > > memory.
> > 
> > Agreed. But here we are talking about heuristics, not dependencies on
> > certain behaviors. Assume we are playing a guessing game: there are
> > multiple mm_structs available for reclaim, would the oom-killed ones
> > be more profitable on average? I'd say no, because I assume it's more
> > likely than unlikely that the oom reaper is doing/to do its work. Note
> > that the assumption is about likelihood, hence arguably valid.
> 
> Well, my main counter argument would be that we do not really want to
> carve last resort mechanism (which the oom reaper is) into any heuristic
> because any future changes into that mechanism will be much harder to
> justify and change. There is a cost of the maintenance that should be
> considered. While you might be right that this change would be
> beneficial, there is no actual proof of that. Historically we've had
> several examples of such a behavior which was really hard to change
> later on because the effect would be really hard to evaluate.

Forgot to mention the recent change as a clear example of a change
which would have a higher burden to evaluate. e4a38402c36e
("oom_kill.c: futex: delay the OOM reaper to allow time for proper futex
cleanup") has changed the wake up logic to be triggered after a timeout.
This means that the task will be sitting there on the queue without any
actual reclaim done on it. The timeout itself can be changed in the
future and I would really hate to argue that changing it from $FOO to
$FOO + epsilon breaks a very subtle dependency somewhere deep in the
reclaim path. From the oom reaper POV any timeout is reasonable because
this is the _last_ resort to resolve OOM stall/deadlock when the victim
cannot exit on its own for whatever reason. This is a considerably
different objective from "we want to optimize which tasks to scan to
reclaim efficiently".

See my point?
diff mbox series

Patch

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 6cdde62b078b..7d0c9c48a0c5 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -77,15 +77,6 @@  static inline bool tsk_is_oom_victim(struct task_struct * tsk)
 	return tsk->signal->oom_mm;
 }
 
-/*
- * Use this helper if tsk->mm != mm and the victim mm needs a special
- * handling. This is guaranteed to stay true after once set.
- */
-static inline bool mm_is_oom_victim(struct mm_struct *mm)
-{
-	return test_bit(MMF_OOM_VICTIM, &mm->flags);
-}
-
 /*
  * Checks whether a page fault on the given mm is still reliable.
  * This is no longer true if the oom reaper started to reap the
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 4d0a5be28b70..8270ad7ae14c 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -71,9 +71,8 @@  static inline int get_dumpable(struct mm_struct *mm)
 #define MMF_UNSTABLE		22	/* mm is unstable for copy_from_user */
 #define MMF_HUGE_ZERO_PAGE	23      /* mm has ever used the global huge zero page */
 #define MMF_DISABLE_THP		24	/* disable THP for all VMAs */
-#define MMF_OOM_VICTIM		25	/* mm is the oom victim */
-#define MMF_OOM_REAP_QUEUED	26	/* mm was queued for oom_reaper */
-#define MMF_MULTIPROCESS	27	/* mm is shared between processes */
+#define MMF_OOM_REAP_QUEUED	25	/* mm was queued for oom_reaper */
+#define MMF_MULTIPROCESS	26	/* mm is shared between processes */
 /*
  * MMF_HAS_PINNED: Whether this mm has pinned any pages.  This can be either
  * replaced in the future by mm.pinned_vm when it becomes stable, or grow into
@@ -81,7 +80,7 @@  static inline int get_dumpable(struct mm_struct *mm)
  * pinned pages were unpinned later on, we'll still keep this bit set for the
  * lifecycle of this mm, just for simplicity.
  */
-#define MMF_HAS_PINNED		28	/* FOLL_PIN has run, never cleared */
+#define MMF_HAS_PINNED		27	/* FOLL_PIN has run, never cleared */
 #define MMF_DISABLE_THP_MASK	(1 << MMF_DISABLE_THP)
 
 #define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 98dca2b42357..c6c76c313b39 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -764,10 +764,8 @@  static void mark_oom_victim(struct task_struct *tsk)
 		return;
 
 	/* oom_mm is bound to the signal struct life time. */
-	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
+	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
 		mmgrab(tsk->signal->oom_mm);
-		set_bit(MMF_OOM_VICTIM, &mm->flags);
-	}
 
 	/*
 	 * Make sure that the task is woken up from uninterruptible sleep