diff mbox series

[RFC,v2,3/3] mm, oom: hand over MMF_OOM_SKIP to exit path if it is guranteed to finish

Message ID 20181025082403.3806-4-mhocko@kernel.org (mailing list archive)
State New, archived
Headers show
Series oom: rework oom_reaper vs. exit_mmap handoff | expand

Commit Message

Michal Hocko Oct. 25, 2018, 8:24 a.m. UTC
From: Michal Hocko <mhocko@suse.com>

David Rientjes has noted that certain user space memory allocators leave
a lot of page tables behind and the current implementation of oom_reaper
doesn't deal with those workloads very well. In order to improve these
workloads define a point when exit_mmap is guaranteed to finish the tear
down without any further blocking etc. This is right after we unlink
vmas (those still depend on locks which are held while performing memory
allocations from other contexts) and before we start releasing page
tables.

Opencode free_pgtables and explicitly unlink all vmas first. Then set
mm->mmap to NULL (there shouldn't be anybody looking at it at this
stage) and check for mm->mmap in the oom_reaper path. If the mm->mmap
is NULL we rely on the exit path and won't set MMF_OOM_SKIP from the
reaper.

Changes since RFC
- the task is still visible to the OOM killer after exit_mmap terminates
  so we should set MMF_OOM_SKIP from that path to be sure the oom killer
  doesn't get stuck on this task (see d7a94e7e11badf84 for more context)
  - per Tetsuo
- split free_pgtables into unlinking and actual freeing part. We cannot
  rely on free_pgd_range because of hugetlb pages on ppc resp. sparc
  which do their own tear down

Signed-off-by: Michal Hocko <mhocko@suse.com>
---
 mm/internal.h |  3 +++
 mm/memory.c   | 28 ++++++++++++++++++----------
 mm/mmap.c     | 25 +++++++++++++++++++++----
 mm/oom_kill.c | 13 +++++++------
 4 files changed, 49 insertions(+), 20 deletions(-)

Comments

Tetsuo Handa Oct. 30, 2018, 4:45 a.m. UTC | #1
Michal Hocko wrote:
> @@ -3156,6 +3166,13 @@ void exit_mmap(struct mm_struct *mm)
>                 vma = remove_vma(vma);
>         }
>         vm_unacct_memory(nr_accounted);
> +
> +       /*
> +        * Now that the full address space is torn down, make sure the
> +        * OOM killer skips over this task
> +        */
> +       if (oom)
> +               set_bit(MMF_OOM_SKIP, &mm->flags);
>  }
> 
>  /* Insert vm structure into process list sorted by address

I don't like setting MMF_OOF_SKIP after remove_vma() loop. 50 users might
call vma->vm_ops->close() from remove_vma(). Some of them are doing fs
writeback, some of them might be doing GFP_KERNEL allocation from
vma->vm_ops->open() with a lock also held by vma->vm_ops->close().

I don't think that waiting for completion of remove_vma() loop is safe.
And my patch is safe.

 drivers/android/binder.c                          |    2 +-
 drivers/gpu/drm/drm_gem_cma_helper.c              |    2 +-
 drivers/gpu/drm/drm_vm.c                          |    8 ++++----
 drivers/gpu/drm/gma500/framebuffer.c              |    2 +-
 drivers/gpu/drm/gma500/psb_drv.c                  |    2 +-
 drivers/gpu/drm/i915/i915_drv.c                   |    2 +-
 drivers/gpu/drm/ttm/ttm_bo_vm.c                   |    2 +-
 drivers/gpu/drm/udl/udl_drv.c                     |    2 +-
 drivers/gpu/drm/v3d/v3d_drv.c                     |    2 +-
 drivers/gpu/drm/vc4/vc4_drv.c                     |    2 +-
 drivers/gpu/drm/vgem/vgem_drv.c                   |    2 +-
 drivers/gpu/drm/vkms/vkms_drv.c                   |    2 +-
 drivers/gpu/drm/xen/xen_drm_front.c               |    2 +-
 drivers/hwtracing/intel_th/msu.c                  |    2 +-
 drivers/hwtracing/stm/core.c                      |    2 +-
 drivers/infiniband/core/uverbs_main.c             |    2 +-
 drivers/infiniband/sw/rdmavt/mmap.c               |    2 +-
 drivers/infiniband/sw/rxe/rxe_mmap.c              |    2 +-
 drivers/media/common/videobuf2/videobuf2-memops.c |    2 +-
 drivers/media/pci/meye/meye.c                     |    2 +-
 drivers/media/platform/omap/omap_vout.c           |    2 +-
 drivers/media/usb/stkwebcam/stk-webcam.c          |    2 +-
 drivers/media/v4l2-core/videobuf-dma-contig.c     |    2 +-
 drivers/media/v4l2-core/videobuf-dma-sg.c         |    2 +-
 drivers/media/v4l2-core/videobuf-vmalloc.c        |    2 +-
 drivers/misc/genwqe/card_dev.c                    |    2 +-
 drivers/misc/mic/scif/scif_mmap.c                 |    2 +-
 drivers/misc/sgi-gru/grufile.c                    |    2 +-
 drivers/rapidio/devices/rio_mport_cdev.c          |    2 +-
 drivers/staging/comedi/comedi_fops.c              |    2 +-
 drivers/staging/media/zoran/zoran_driver.c        |    2 +-
 drivers/staging/vme/devices/vme_user.c            |    2 +-
 drivers/usb/core/devio.c                          |    2 +-
 drivers/usb/mon/mon_bin.c                         |    2 +-
 drivers/video/fbdev/omap2/omapfb/omapfb-main.c    |    2 +-
 drivers/xen/gntalloc.c                            |    2 +-
 drivers/xen/gntdev.c                              |    2 +-
 drivers/xen/privcmd-buf.c                         |    2 +-
 drivers/xen/privcmd.c                             |    2 +-
 fs/9p/vfs_file.c                                  |    2 +-
 fs/fuse/file.c                                    |    2 +-
 fs/kernfs/file.c                                  |    2 +-
 include/linux/mm.h                                |    2 +-
 ipc/shm.c                                         |    2 +-
 kernel/events/core.c                              |    2 +-
 kernel/relay.c                                    |    2 +-
 mm/hugetlb.c                                      |    2 +-
 mm/mmap.c                                         |   14 +++++++-------
 net/packet/af_packet.c                            |    2 +-
 sound/core/pcm_native.c                           |    4 ++--
 sound/usb/usx2y/us122l.c                          |    2 +-
 sound/usb/usx2y/usx2yhwdeppcm.c                   |    2 +-
 52 files changed, 62 insertions(+), 62 deletions(-)
Michal Hocko Oct. 30, 2018, 6:31 a.m. UTC | #2
On Tue 30-10-18 13:45:22, Tetsuo Handa wrote:
> Michal Hocko wrote:
> > @@ -3156,6 +3166,13 @@ void exit_mmap(struct mm_struct *mm)
> >                 vma = remove_vma(vma);
> >         }
> >         vm_unacct_memory(nr_accounted);
> > +
> > +       /*
> > +        * Now that the full address space is torn down, make sure the
> > +        * OOM killer skips over this task
> > +        */
> > +       if (oom)
> > +               set_bit(MMF_OOM_SKIP, &mm->flags);
> >  }
> > 
> >  /* Insert vm structure into process list sorted by address
> 
> I don't like setting MMF_OOF_SKIP after remove_vma() loop. 50 users might
> call vma->vm_ops->close() from remove_vma(). Some of them are doing fs
> writeback, some of them might be doing GFP_KERNEL allocation from
> vma->vm_ops->open() with a lock also held by vma->vm_ops->close().
> 
> I don't think that waiting for completion of remove_vma() loop is safe.

What do you mean by 'safe' here?
Tetsuo Handa Oct. 30, 2018, 9:47 a.m. UTC | #3
On 2018/10/30 15:31, Michal Hocko wrote:
> On Tue 30-10-18 13:45:22, Tetsuo Handa wrote:
>> Michal Hocko wrote:
>>> @@ -3156,6 +3166,13 @@ void exit_mmap(struct mm_struct *mm)
>>>                 vma = remove_vma(vma);
>>>         }
>>>         vm_unacct_memory(nr_accounted);
>>> +
>>> +       /*
>>> +        * Now that the full address space is torn down, make sure the
>>> +        * OOM killer skips over this task
>>> +        */
>>> +       if (oom)
>>> +               set_bit(MMF_OOM_SKIP, &mm->flags);
>>>  }
>>>
>>>  /* Insert vm structure into process list sorted by address
>>
>> I don't like setting MMF_OOF_SKIP after remove_vma() loop. 50 users might
>> call vma->vm_ops->close() from remove_vma(). Some of them are doing fs
>> writeback, some of them might be doing GFP_KERNEL allocation from
>> vma->vm_ops->open() with a lock also held by vma->vm_ops->close().
>>
>> I don't think that waiting for completion of remove_vma() loop is safe.
> 
> What do you mean by 'safe' here?
> 

safe = "Does not cause OOM lockup."

remove_vma() is allowed to sleep, and some users might depend on memory
allocation when the OOM killer is waiting for remove_vma() to complete.
Michal Hocko Oct. 30, 2018, 11:39 a.m. UTC | #4
On Tue 30-10-18 18:47:43, Tetsuo Handa wrote:
> On 2018/10/30 15:31, Michal Hocko wrote:
> > On Tue 30-10-18 13:45:22, Tetsuo Handa wrote:
> >> Michal Hocko wrote:
> >>> @@ -3156,6 +3166,13 @@ void exit_mmap(struct mm_struct *mm)
> >>>                 vma = remove_vma(vma);
> >>>         }
> >>>         vm_unacct_memory(nr_accounted);
> >>> +
> >>> +       /*
> >>> +        * Now that the full address space is torn down, make sure the
> >>> +        * OOM killer skips over this task
> >>> +        */
> >>> +       if (oom)
> >>> +               set_bit(MMF_OOM_SKIP, &mm->flags);
> >>>  }
> >>>
> >>>  /* Insert vm structure into process list sorted by address
> >>
> >> I don't like setting MMF_OOF_SKIP after remove_vma() loop. 50 users might
> >> call vma->vm_ops->close() from remove_vma(). Some of them are doing fs
> >> writeback, some of them might be doing GFP_KERNEL allocation from
> >> vma->vm_ops->open() with a lock also held by vma->vm_ops->close().
> >>
> >> I don't think that waiting for completion of remove_vma() loop is safe.
> > 
> > What do you mean by 'safe' here?
> > 
> 
> safe = "Does not cause OOM lockup."
> 
> remove_vma() is allowed to sleep, and some users might depend on memory
> allocation when the OOM killer is waiting for remove_vma() to complete.

But MMF_OOF_SKIP is set after we are done with remove_vma. In fact it is
the very last thing in exit_mmap. So I do not follow what you mean.
Tetsuo Handa Oct. 30, 2018, 12:02 p.m. UTC | #5
On 2018/10/30 20:39, Michal Hocko wrote:
> On Tue 30-10-18 18:47:43, Tetsuo Handa wrote:
>> On 2018/10/30 15:31, Michal Hocko wrote:
>>> On Tue 30-10-18 13:45:22, Tetsuo Handa wrote:
>>>> Michal Hocko wrote:
>>>>> @@ -3156,6 +3166,13 @@ void exit_mmap(struct mm_struct *mm)
>>>>>                 vma = remove_vma(vma);
>>>>>         }
>>>>>         vm_unacct_memory(nr_accounted);
>>>>> +
>>>>> +       /*
>>>>> +        * Now that the full address space is torn down, make sure the
>>>>> +        * OOM killer skips over this task
>>>>> +        */
>>>>> +       if (oom)
>>>>> +               set_bit(MMF_OOM_SKIP, &mm->flags);
>>>>>  }
>>>>>
>>>>>  /* Insert vm structure into process list sorted by address
>>>>
>>>> I don't like setting MMF_OOF_SKIP after remove_vma() loop. 50 users might
>>>> call vma->vm_ops->close() from remove_vma(). Some of them are doing fs
>>>> writeback, some of them might be doing GFP_KERNEL allocation from
>>>> vma->vm_ops->open() with a lock also held by vma->vm_ops->close().
>>>>
>>>> I don't think that waiting for completion of remove_vma() loop is safe.
>>>
>>> What do you mean by 'safe' here?
>>>
>>
>> safe = "Does not cause OOM lockup."
>>
>> remove_vma() is allowed to sleep, and some users might depend on memory
>> allocation when the OOM killer is waiting for remove_vma() to complete.
> 
> But MMF_OOF_SKIP is set after we are done with remove_vma. In fact it is
> the very last thing in exit_mmap. So I do not follow what you mean.
> 

So what? Think the worst case. Quite obvious bug here.

What happens if memory reclaimed by up to __free_pgtables() was consumed by somebody
else, and then some vma->vm_ops->close() started waiting for memory allocation due to
dependency? It is called "OOM lockup" because the OOM killer cannot be enabled because
MMF_OOM_SKIP cannot be set because vma->vm_ops->close() is waiting for the OOM killer
due to memory allocation dependency in vma->vm_ops->close() from remove_vma()...
Michal Hocko Oct. 30, 2018, 12:10 p.m. UTC | #6
On Tue 30-10-18 21:02:40, Tetsuo Handa wrote:
> On 2018/10/30 20:39, Michal Hocko wrote:
> > On Tue 30-10-18 18:47:43, Tetsuo Handa wrote:
> >> On 2018/10/30 15:31, Michal Hocko wrote:
> >>> On Tue 30-10-18 13:45:22, Tetsuo Handa wrote:
> >>>> Michal Hocko wrote:
> >>>>> @@ -3156,6 +3166,13 @@ void exit_mmap(struct mm_struct *mm)
> >>>>>                 vma = remove_vma(vma);
> >>>>>         }
> >>>>>         vm_unacct_memory(nr_accounted);
> >>>>> +
> >>>>> +       /*
> >>>>> +        * Now that the full address space is torn down, make sure the
> >>>>> +        * OOM killer skips over this task
> >>>>> +        */
> >>>>> +       if (oom)
> >>>>> +               set_bit(MMF_OOM_SKIP, &mm->flags);
> >>>>>  }
> >>>>>
> >>>>>  /* Insert vm structure into process list sorted by address
> >>>>
> >>>> I don't like setting MMF_OOF_SKIP after remove_vma() loop. 50 users might
> >>>> call vma->vm_ops->close() from remove_vma(). Some of them are doing fs
> >>>> writeback, some of them might be doing GFP_KERNEL allocation from
> >>>> vma->vm_ops->open() with a lock also held by vma->vm_ops->close().
> >>>>
> >>>> I don't think that waiting for completion of remove_vma() loop is safe.
> >>>
> >>> What do you mean by 'safe' here?
> >>>
> >>
> >> safe = "Does not cause OOM lockup."
> >>
> >> remove_vma() is allowed to sleep, and some users might depend on memory
> >> allocation when the OOM killer is waiting for remove_vma() to complete.
> > 
> > But MMF_OOF_SKIP is set after we are done with remove_vma. In fact it is
> > the very last thing in exit_mmap. So I do not follow what you mean.
> > 
> 
> So what? Think the worst case. Quite obvious bug here.

I misunderstood your concern. oom_reaper would back off without
MMF_OOF_SKIP as well. You are right we cannot assume anything about
close callbacks so MMF_OOM_SKIP has to come before that. I will move it
behind the pagetable freeing.
Tetsuo Handa Oct. 30, 2018, 1:57 p.m. UTC | #7
On 2018/10/30 21:10, Michal Hocko wrote:
> I misunderstood your concern. oom_reaper would back off without
> MMF_OOF_SKIP as well. You are right we cannot assume anything about
> close callbacks so MMF_OOM_SKIP has to come before that. I will move it
> behind the pagetable freeing.
> 

And at that point, your patch can at best wait for only __free_pgtables(),
at the cost/risk of complicating exit_mmap() and arch specific code. Also,
you are asking for comments to wrong audiences. It is arch maintainers who
need to precisely understand the OOM behavior / possibility of OOM lockup,
and you must persuade them about restricting/complicating future changes in
their arch code due to your wish to allow handover. Without "up-to-dated
big fat comments to all relevant functions affected by your change" and
"acks from all arch maintainers", I'm sure that people keep making
errors/mistakes/overlooks.

My patch can wait for completion of (not only exit_mmap() but also) __mmput(),
by using simple polling approach. My patch can allow NOMMU kernels to avoid
possibility of OOM lockup by setting MMF_OOM_SKIP at __mmput() (and future
patch will implement timeout based back off for NOMMU kernels), and allows you
to get rid of TIF_MEMDIE (which you recently added to your TODO list) by getting
rid of conditional handling of oom_reserves_allowed() and ALLOC_OOM.

Your "refusing timeout based next OOM victim selection" keeps everyone unable
to safely make forward progress. OOM handling is too much complicated, and
nobody can become free from errors/mistakes/overlooks. Look at the reality!
Michal Hocko Oct. 30, 2018, 2:23 p.m. UTC | #8
On Tue 30-10-18 22:57:37, Tetsuo Handa wrote:
> On 2018/10/30 21:10, Michal Hocko wrote:
> > I misunderstood your concern. oom_reaper would back off without
> > MMF_OOF_SKIP as well. You are right we cannot assume anything about
> > close callbacks so MMF_OOM_SKIP has to come before that. I will move it
> > behind the pagetable freeing.
> > 
> 
> And at that point, your patch can at best wait for only __free_pgtables(),

Yes, mostly on the grounds that oom victims are mostly sitting on mapped
memory and page tables. I can see how last ->close() can release some
memory as well but a) we do not consider that memory when selecting a
victim and b) it shouldn't be a large memory consumer on its own.

> at the cost/risk of complicating exit_mmap() and arch specific code.Also,
> you are asking for comments to wrong audiences. It is arch maintainers who
> need to precisely understand the OOM behavior / possibility of OOM lockup,
> and you must persuade them about restricting/complicating future changes in
> their arch code due to your wish to allow handover. Without "up-to-dated
> big fat comments to all relevant functions affected by your change" and
> "acks from all arch maintainers", I'm sure that people keep making
> errors/mistakes/overlooks.

Are you talking about arch_exit_mmap or which part of the arch code?

> My patch can wait for completion of (not only exit_mmap() but also) __mmput(),
> by using simple polling approach. My patch can allow NOMMU kernels to avoid
> possibility of OOM lockup by setting MMF_OOM_SKIP at __mmput() (and future
> patch will implement timeout based back off for NOMMU kernels), and allows you
> to get rid of TIF_MEMDIE (which you recently added to your TODO list) by getting
> rid of conditional handling of oom_reserves_allowed() and ALLOC_OOM.

OK, let's settle on a simple fact. I would like to discuss _this_
approach here. Bringing up _yours_ all the time is not productive much.
You might have noticed that I have posted this for discussion (hence the
RGC) and as such I would appreciate staying on the topic.

What is the best approach in the end is a matter of discsussion of
course. At thise stage it is quite clear we can only agree to disagree
which approach is better and discussing the same set of points back and
forth is not going to get us anywhere. Therefore we would have to rely on the
maintainer to decide.
diff mbox series

Patch

diff --git a/mm/internal.h b/mm/internal.h
index 87256ae1bef8..35adbfec4935 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -40,6 +40,9 @@  void page_writeback_init(void);
 
 vm_fault_t do_swap_page(struct vm_fault *vmf);
 
+void __unlink_vmas(struct vm_area_struct *vma);
+void __free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
+		unsigned long floor, unsigned long ceiling);
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
 		unsigned long floor, unsigned long ceiling);
 
diff --git a/mm/memory.c b/mm/memory.c
index c467102a5cbc..cf910ed5f283 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -612,20 +612,23 @@  void free_pgd_range(struct mmu_gather *tlb,
 	} while (pgd++, addr = next, addr != end);
 }
 
-void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
+void __unlink_vmas(struct vm_area_struct *vma)
+{
+	while (vma) {
+		unlink_anon_vmas(vma);
+		unlink_file_vma(vma);
+		vma = vma->vm_next;
+	}
+}
+
+/* expects that __unlink_vmas has been called already */
+void __free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		unsigned long floor, unsigned long ceiling)
 {
 	while (vma) {
 		struct vm_area_struct *next = vma->vm_next;
 		unsigned long addr = vma->vm_start;
 
-		/*
-		 * Hide vma from rmap and truncate_pagecache before freeing
-		 * pgtables
-		 */
-		unlink_anon_vmas(vma);
-		unlink_file_vma(vma);
-
 		if (is_vm_hugetlb_page(vma)) {
 			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
 				floor, next ? next->vm_start : ceiling);
@@ -637,8 +640,6 @@  void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			       && !is_vm_hugetlb_page(next)) {
 				vma = next;
 				next = vma->vm_next;
-				unlink_anon_vmas(vma);
-				unlink_file_vma(vma);
 			}
 			free_pgd_range(tlb, addr, vma->vm_end,
 				floor, next ? next->vm_start : ceiling);
@@ -647,6 +648,13 @@  void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	}
 }
 
+void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
+		unsigned long floor, unsigned long ceiling)
+{
+	__unlink_vmas(vma);
+	__free_pgtables(tlb, vma, floor, ceiling);
+}
+
 int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 {
 	spinlock_t *ptl;
diff --git a/mm/mmap.c b/mm/mmap.c
index a02b314c0546..f4b562e21764 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3082,13 +3082,26 @@  void exit_mmap(struct mm_struct *mm)
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
 	unmap_vmas(&tlb, vma, 0, -1);
 
-	/* oom_reaper cannot race with the page tables teardown */
+	/*
+	 * oom_reaper cannot race with the page tables teardown but we
+	 * want to make sure that the exit path can take over the full
+	 * tear down when it is safe to do so
+	 */
 	if (oom) {
 		down_write(&mm->mmap_sem);
-		set_bit(MMF_OOM_SKIP, &mm->flags);
+		__unlink_vmas(vma);
+		/*
+		 * the exit path is guaranteed to finish the memory tear down
+		 * without any unbound blocking at this stage so make it clear
+		 * to the oom_reaper
+		 */
+		mm->mmap = NULL;
+		up_write(&mm->mmap_sem);
+		__free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
+	} else {
+		free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
 	}
 
-	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
 	tlb_finish_mmu(&tlb, 0, -1);
 
 	/*
@@ -3102,8 +3115,12 @@  void exit_mmap(struct mm_struct *mm)
 	}
 	vm_unacct_memory(nr_accounted);
 
+	/*
+	 * Now that the full address space is torn down, make sure the
+	 * OOM killer skips over this task
+	 */
 	if (oom)
-		up_write(&mm->mmap_sem);
+		set_bit(MMF_OOM_SKIP, &mm->flags);
 }
 
 /* Insert vm structure into process list sorted by address
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ab42717661dc..db1ebb45c66a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -570,12 +570,10 @@  static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm, unsi
 	}
 
 	/*
-	 * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
-	 * work on the mm anymore. The check for MMF_OOM_SKIP must run
-	 * under mmap_sem for reading because it serializes against the
-	 * down_write() in exit_mmap().
+	 * If exit path clear mm->mmap then we know it will finish the tear down
+	 * and we can go and bail out here.
 	 */
-	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
+	if (!mm->mmap) {
 		trace_skip_task_reaping(tsk->pid);
 		goto out_unlock;
 	}
@@ -625,8 +623,11 @@  static void oom_reap_task(struct task_struct *tsk)
 	/*
 	 * Hide this mm from OOM killer because it has been either reaped or
 	 * somebody can't call up_write(mmap_sem).
+	 * Leave the MMF_OOM_SKIP to the exit path if it managed to reach the
+	 * point it is guaranteed to finish without any blocking
 	 */
-	set_bit(MMF_OOM_SKIP, &mm->flags);
+	if (mm->mmap)
+		set_bit(MMF_OOM_SKIP, &mm->flags);
 
 	/* Drop a reference taken by wake_oom_reaper */
 	put_task_struct(tsk);