
[1/7] mm/thp: fix __split_huge_pmd_locked() on shmem migration entry

Message ID alpine.LSU.2.11.2106011403540.2148@eggly.anvils (mailing list archive)
State New, archived
Series mm/thp: fix THP splitting unmap BUGs and related

Commit Message

Hugh Dickins June 1, 2021, 9:05 p.m. UTC
Stressing huge tmpfs page migration racing hole punch often crashed on the
VM_BUG_ON(!pmd_present) in pmdp_huge_clear_flush(), with DEBUG_VM=y kernel;
or shortly afterwards, on a bad dereference in __split_huge_pmd_locked()
when DEBUG_VM=n.  They forgot to allow for pmd migration entries in the
non-anonymous case.

Full disclosure: those particular experiments were on a kernel with more
relaxed mmap_lock and i_mmap_rwsem locking, and were not repeated on the
vanilla kernel: it is conceivable that stricter locking happens to avoid
those cases, or makes them less likely; but __split_huge_pmd_locked()
already allowed for pmd migration entries when handling anonymous THPs,
so this commit brings the shmem and file THP handling into line.

Are there more places that need to be careful about pmd migration entries?
None hit in practice, but several of those is_huge_zero_pmd() tests were
done without checking pmd_present() first: I believe a pmd migration entry
could end up satisfying that test.  Ah, the inversion of swap offset, to
protect against L1TF, makes that impossible on x86; but other arches need
the pmd_present() check, and even x86 ought not to apply pmd_page() to a
swap-like pmd.  Fix those instances; __split_huge_pmd_locked() was not
wrong to be checking with pmd_trans_huge() instead, but I think it's
clearer to use pmd_present() in each instance.

And while there: make it clearer to the eye that the !vma_is_anonymous()
and is_huge_zero_pmd() blocks make early returns (and don't return void).

Fixes: e71769ae5260 ("mm: enable thp migration for shmem thp")
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: <stable@vger.kernel.org>
---
 mm/huge_memory.c     | 38 ++++++++++++++++++++++++--------------
 mm/pgtable-generic.c |  5 ++---
 2 files changed, 26 insertions(+), 17 deletions(-)
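
(For context: the helper the commit message refers to looked roughly like
this at the time, a simplified sketch of include/linux/huge_mm.h rather
than the exact tree contents.  Nothing in it checks pmd_present(), so a
caller that has not already done so ends up applying pmd_page() to a
swap-like pmd:)

	extern struct page *huge_zero_page;

	static inline bool is_huge_zero_page(struct page *page)
	{
		return READ_ONCE(huge_zero_page) == page;
	}

	static inline bool is_huge_zero_pmd(pmd_t pmd)
	{
		/*
		 * pmd_page() assumes a present pmd: on a migration entry it
		 * manufactures a bogus struct page pointer, which on some
		 * architectures can happen to match huge_zero_page.
		 */
		return is_huge_zero_page(pmd_page(pmd));
	}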

Comments

Yang Shi June 3, 2021, 9:26 p.m. UTC | #1
On Tue, Jun 1, 2021 at 2:05 PM Hugh Dickins <hughd@google.com> wrote:
>
> Stressing huge tmpfs page migration racing hole punch often crashed on the
> VM_BUG_ON(!pmd_present) in pmdp_huge_clear_flush(), with DEBUG_VM=y kernel;
> or shortly afterwards, on a bad dereference in __split_huge_pmd_locked()
> when DEBUG_VM=n.  They forgot to allow for pmd migration entries in the
> non-anonymous case.
>
> Full disclosure: those particular experiments were on a kernel with more
> relaxed mmap_lock and i_mmap_rwsem locking, and were not repeated on the
> vanilla kernel: it is conceivable that stricter locking happens to avoid
> those cases, or makes them less likely; but __split_huge_pmd_locked()
> already allowed for pmd migration entries when handling anonymous THPs,
> so this commit brings the shmem and file THP handling into line.
>
> Are there more places that need to be careful about pmd migration entries?
> None hit in practice, but several of those is_huge_zero_pmd() tests were
> done without checking pmd_present() first: I believe a pmd migration entry
> could end up satisfying that test.  Ah, the inversion of swap offset, to
> protect against L1TF, makes that impossible on x86; but other arches need
> the pmd_present() check, and even x86 ought not to apply pmd_page() to a
> swap-like pmd.  Fix those instances; __split_huge_pmd_locked() was not
> wrong to be checking with pmd_trans_huge() instead, but I think it's
> clearer to use pmd_present() in each instance.
>
> And while there: make it clearer to the eye that the !vma_is_anonymous()
> and is_huge_zero_pmd() blocks make early returns (and don't return void).
>
> Fixes: e71769ae5260 ("mm: enable thp migration for shmem thp")
> Signed-off-by: Hugh Dickins <hughd@google.com>
> Cc: <stable@vger.kernel.org>
> ---
>  mm/huge_memory.c     | 38 ++++++++++++++++++++++++--------------
>  mm/pgtable-generic.c |  5 ++---
>  2 files changed, 26 insertions(+), 17 deletions(-)
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 63ed6b25deaa..9fb7b47da87e 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1587,9 +1587,6 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>                 goto out_unlocked;
>
>         orig_pmd = *pmd;
> -       if (is_huge_zero_pmd(orig_pmd))
> -               goto out;
> -
>         if (unlikely(!pmd_present(orig_pmd))) {
>                 VM_BUG_ON(thp_migration_supported() &&
>                                   !is_pmd_migration_entry(orig_pmd));
> @@ -1597,6 +1594,9 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>         }
>
>         page = pmd_page(orig_pmd);
> +       if (is_huge_zero_page(page))
> +               goto out;
> +
>         /*
>          * If other processes are mapping this page, we couldn't discard
>          * the page unless they all do MADV_FREE so let's skip the page.
> @@ -1676,7 +1676,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>                 spin_unlock(ptl);
>                 if (is_huge_zero_pmd(orig_pmd))
>                         tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
> -       } else if (is_huge_zero_pmd(orig_pmd)) {
> +       } else if (pmd_present(orig_pmd) && is_huge_zero_pmd(orig_pmd)) {

If it is a huge zero migration entry, the code would fall back to the
"else". But IIUC the "else" case doesn't handle the huge zero page
correctly. It may mess up the rss counter.

>                 zap_deposited_table(tlb->mm, pmd);
>                 spin_unlock(ptl);
>                 tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
> @@ -2044,7 +2044,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>         count_vm_event(THP_SPLIT_PMD);
>
>         if (!vma_is_anonymous(vma)) {
> -               _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
> +               old_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
>                 /*
>                  * We are going to unmap this huge page. So
>                  * just go ahead and zap it
> @@ -2053,16 +2053,25 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>                         zap_deposited_table(mm, pmd);
>                 if (vma_is_special_huge(vma))
>                         return;
> -               page = pmd_page(_pmd);
> -               if (!PageDirty(page) && pmd_dirty(_pmd))
> -                       set_page_dirty(page);
> -               if (!PageReferenced(page) && pmd_young(_pmd))
> -                       SetPageReferenced(page);
> -               page_remove_rmap(page, true);
> -               put_page(page);
> +               if (unlikely(is_pmd_migration_entry(old_pmd))) {
> +                       swp_entry_t entry;
> +
> +                       entry = pmd_to_swp_entry(old_pmd);
> +                       page = migration_entry_to_page(entry);
> +               } else {
> +                       page = pmd_page(old_pmd);
> +                       if (!PageDirty(page) && pmd_dirty(old_pmd))
> +                               set_page_dirty(page);
> +                       if (!PageReferenced(page) && pmd_young(old_pmd))
> +                               SetPageReferenced(page);
> +                       page_remove_rmap(page, true);
> +                       put_page(page);
> +               }
>                 add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
>                 return;
> -       } else if (pmd_trans_huge(*pmd) && is_huge_zero_pmd(*pmd)) {
> +       }
> +
> +       if (pmd_present(*pmd) && is_huge_zero_pmd(*pmd)) {
>                 /*
>                  * FIXME: Do we want to invalidate secondary mmu by calling
>                  * mmu_notifier_invalidate_range() see comments below inside
> @@ -2072,7 +2081,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>                  * small page also write protected so it does not seems useful
>                  * to invalidate secondary mmu at this time.
>                  */
> -               return __split_huge_zero_page_pmd(vma, haddr, pmd);
> +               __split_huge_zero_page_pmd(vma, haddr, pmd);
> +               return;
>         }
>
>         /*
> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
> index c2210e1cdb51..4e640baf9794 100644
> --- a/mm/pgtable-generic.c
> +++ b/mm/pgtable-generic.c
> @@ -135,9 +135,8 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
>  {
>         pmd_t pmd;
>         VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> -       VM_BUG_ON(!pmd_present(*pmdp));
> -       /* Below assumes pmd_present() is true */
> -       VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
> +       VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
> +                          !pmd_devmap(*pmdp));
>         pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
>         flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
>         return pmd;
> --
> 2.32.0.rc0.204.g9fa02ecfa5-goog
>
Hugh Dickins June 4, 2021, 2:22 a.m. UTC | #2
On Thu, 3 Jun 2021, Yang Shi wrote:
> On Tue, Jun 1, 2021 at 2:05 PM Hugh Dickins <hughd@google.com> wrote:
> >
> > Are there more places that need to be careful about pmd migration entries?
> > None hit in practice, but several of those is_huge_zero_pmd() tests were
> > done without checking pmd_present() first: I believe a pmd migration entry
> > could end up satisfying that test.  Ah, the inversion of swap offset, to
> > protect against L1TF, makes that impossible on x86; but other arches need
> > the pmd_present() check, and even x86 ought not to apply pmd_page() to a
> > swap-like pmd.  Fix those instances; __split_huge_pmd_locked() was not
> > wrong to be checking with pmd_trans_huge() instead, but I think it's
> > clearer to use pmd_present() in each instance.
...
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 63ed6b25deaa..9fb7b47da87e 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -1676,7 +1676,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
> >                 spin_unlock(ptl);
> >                 if (is_huge_zero_pmd(orig_pmd))
> >                         tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
> > -       } else if (is_huge_zero_pmd(orig_pmd)) {
> > +       } else if (pmd_present(orig_pmd) && is_huge_zero_pmd(orig_pmd)) {
> 
> If it is a huge zero migration entry, the code would fall back to the
> "else". But IIUC the "else" case doesn't handle the huge zero page
> correctly. It may mess up the rss counter.

A huge zero migration entry?  I hope that's not something special
that I've missed.

Do we ever migrate a huge zero page - and how do we find where it's
mapped, to insert the migration entries?  But if we do, I thought it
would use the usual kind of pmd migration entry; and the first check
in is_pmd_migration_entry() is !pmd_present(pmd).
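
(For reference, that helper, roughly as defined in include/linux/swapops.h
when CONFIG_ARCH_ENABLE_THP_MIGRATION is enabled; a sketch from memory,
not a quote of the tree:)

	static inline int is_pmd_migration_entry(pmd_t pmd)
	{
		/* Only a non-present pmd can be a migration entry */
		return !pmd_present(pmd) &&
		       is_migration_entry(pmd_to_swp_entry(pmd));
	}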

(I have to be rather careful to check such details, after getting
burnt once by pmd_present(): which includes the "huge" bit even when
not otherwise present, to permit races with pmdp_invalidate().
I mentioned in private mail that I'd dropped one of my "fixes" because
it was harmless but mistaken: I had misunderstood pmd_present().)

The point here (see commit message above) is that some unrelated pmd
migration entry could pass the is_huge_zero_pmd() test, which rushes
off to use pmd_page() without even checking pmd_present() first.  And
most of its users have, one way or another, checked pmd_present() first;
but this place and a couple of others had not.

I'm just verifying that it's really a huge zero pmd before handling
its case; the "else" still does not need to handle the huge zero page.

Hugh
Kirill A. Shutemov June 4, 2021, 3:34 p.m. UTC | #3
On Tue, Jun 01, 2021 at 02:05:45PM -0700, Hugh Dickins wrote:
> Stressing huge tmpfs page migration racing hole punch often crashed on the
> VM_BUG_ON(!pmd_present) in pmdp_huge_clear_flush(), with DEBUG_VM=y kernel;
> or shortly afterwards, on a bad dereference in __split_huge_pmd_locked()
> when DEBUG_VM=n.  They forgot to allow for pmd migration entries in the
> non-anonymous case.
> 
> Full disclosure: those particular experiments were on a kernel with more
> relaxed mmap_lock and i_mmap_rwsem locking, and were not repeated on the
> vanilla kernel: it is conceivable that stricter locking happens to avoid
> those cases, or makes them less likely; but __split_huge_pmd_locked()
> already allowed for pmd migration entries when handling anonymous THPs,
> so this commit brings the shmem and file THP handling into line.
> 
> Are there more places that need to be careful about pmd migration entries?
> None hit in practice, but several of those is_huge_zero_pmd() tests were
> done without checking pmd_present() first: I believe a pmd migration entry
> could end up satisfying that test.  Ah, the inversion of swap offset, to
> protect against L1TF, makes that impossible on x86; but other arches need
> the pmd_present() check, and even x86 ought not to apply pmd_page() to a
> swap-like pmd.  Fix those instances; __split_huge_pmd_locked() was not
> wrong to be checking with pmd_trans_huge() instead, but I think it's
> clearer to use pmd_present() in each instance.
> 
> And while there: make it clearer to the eye that the !vma_is_anonymous()
> and is_huge_zero_pmd() blocks make early returns (and don't return void).
> 
> Fixes: e71769ae5260 ("mm: enable thp migration for shmem thp")
> Signed-off-by: Hugh Dickins <hughd@google.com>
> Cc: <stable@vger.kernel.org>

Looks like two fixes got squashed into one patch. The zero-page fix and
migration entries in __split_huge_pmd_locked() deserve separate patches,
no?

Maybe add VM_BUG_ON(!pmd_present()) in is_huge_zero_pmd()?

Also I wonder how much code we could remove if we did not establish
migration ptes for file pages. We can make these page table entries 'none'
on migration.
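
(The suggested VM_BUG_ON would be a one-line addition to the existing
helper; an illustrative sketch only, not part of the posted patch:)

	static inline bool is_huge_zero_pmd(pmd_t pmd)
	{
		VM_BUG_ON(!pmd_present(pmd));	/* suggested assertion */
		return is_huge_zero_page(pmd_page(pmd));
	}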
Yang Shi June 4, 2021, 6:03 p.m. UTC | #4
On Thu, Jun 3, 2021 at 7:23 PM Hugh Dickins <hughd@google.com> wrote:
>
> On Thu, 3 Jun 2021, Yang Shi wrote:
> > On Tue, Jun 1, 2021 at 2:05 PM Hugh Dickins <hughd@google.com> wrote:
> > >
> > > Are there more places that need to be careful about pmd migration entries?
> > > None hit in practice, but several of those is_huge_zero_pmd() tests were
> > > done without checking pmd_present() first: I believe a pmd migration entry
> > > could end up satisfying that test.  Ah, the inversion of swap offset, to
> > > protect against L1TF, makes that impossible on x86; but other arches need
> > > the pmd_present() check, and even x86 ought not to apply pmd_page() to a
> > > swap-like pmd.  Fix those instances; __split_huge_pmd_locked() was not
> > > wrong to be checking with pmd_trans_huge() instead, but I think it's
> > > clearer to use pmd_present() in each instance.
> ...
> > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > > index 63ed6b25deaa..9fb7b47da87e 100644
> > > --- a/mm/huge_memory.c
> > > +++ b/mm/huge_memory.c
> > > @@ -1676,7 +1676,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
> > >                 spin_unlock(ptl);
> > >                 if (is_huge_zero_pmd(orig_pmd))
> > >                         tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
> > > -       } else if (is_huge_zero_pmd(orig_pmd)) {
> > > +       } else if (pmd_present(orig_pmd) && is_huge_zero_pmd(orig_pmd)) {
> >
> > If it is a huge zero migration entry, the code would fall back to the
> > "else". But IIUC the "else" case doesn't handle the huge zero page
> > correctly. It may mess up the rss counter.
>
> A huge zero migration entry?  I hope that's not something special
> that I've missed.
>
> Do we ever migrate a huge zero page - and how do we find where it's
> mapped, to insert the migration entries?  But if we do, I thought it
> would use the usual kind of pmd migration entry; and the first check
> in is_pmd_migration_entry() is !pmd_present(pmd).

I overlooked whether the huge zero page is migratable when I was writing
the comment; I just focused on the if/else if/else conditions.

From a quick look, I don't think the huge zero page is migratable, since:
* mempolicy and numa hinting skip huge zero pmd
* other migration callsites just try to migrate LRU pages

>
> (I have to be rather careful to check such details, after getting
> burnt once by pmd_present(): which includes the "huge" bit even when
> not otherwise present, to permit races with pmdp_invalidate().
> I mentioned in private mail that I'd dropped one of my "fixes" because
> it was harmless but mistaken: I had misunderstood pmd_present().)
>
> The point here (see commit message above) is that some unrelated pmd
> migration entry could pass the is_huge_zero_pmd() test, which rushes
> off to use pmd_page() without even checking pmd_present() first.  And
> most of its users have, one way or another, checked pmd_present() first;
> but this place and a couple of others had not.

Thanks for the elaboration. I wonder whether we'd better add some
comments in the code? Otherwise someone may submit a "fix" patch in the
future, based on visual inspection, after missing these points.

>
> I'm just verifying that it's really a huge zero pmd before handling
> its case; the "else" still does not need to handle the huge zero page.
>
> Hugh
Hugh Dickins June 4, 2021, 9:29 p.m. UTC | #5
On Fri, 4 Jun 2021, Kirill A. Shutemov wrote:
> On Tue, Jun 01, 2021 at 02:05:45PM -0700, Hugh Dickins wrote:
> > Stressing huge tmpfs page migration racing hole punch often crashed on the
> > VM_BUG_ON(!pmd_present) in pmdp_huge_clear_flush(), with DEBUG_VM=y kernel;
> > or shortly afterwards, on a bad dereference in __split_huge_pmd_locked()
> > when DEBUG_VM=n.  They forgot to allow for pmd migration entries in the
> > non-anonymous case.
> > 
> > Full disclosure: those particular experiments were on a kernel with more
> > relaxed mmap_lock and i_mmap_rwsem locking, and were not repeated on the
> > vanilla kernel: it is conceivable that stricter locking happens to avoid
> > those cases, or makes them less likely; but __split_huge_pmd_locked()
> > already allowed for pmd migration entries when handling anonymous THPs,
> > so this commit brings the shmem and file THP handling into line.
> > 
> > Are there more places that need to be careful about pmd migration entries?
> > None hit in practice, but several of those is_huge_zero_pmd() tests were
> > done without checking pmd_present() first: I believe a pmd migration entry
> > could end up satisfying that test.  Ah, the inversion of swap offset, to
> > protect against L1TF, makes that impossible on x86; but other arches need
> > the pmd_present() check, and even x86 ought not to apply pmd_page() to a
> > swap-like pmd.  Fix those instances; __split_huge_pmd_locked() was not
> > wrong to be checking with pmd_trans_huge() instead, but I think it's
> > clearer to use pmd_present() in each instance.
> > 
> > And while there: make it clearer to the eye that the !vma_is_anonymous()
> > and is_huge_zero_pmd() blocks make early returns (and don't return void).
> > 
> > Fixes: e71769ae5260 ("mm: enable thp migration for shmem thp")
> > Signed-off-by: Hugh Dickins <hughd@google.com>
> > Cc: <stable@vger.kernel.org>
> 
> Looks like two fixes got squashed into one patch. The zero-page fix and
> migration entries in __split_huge_pmd_locked() deserve separate patches,
> no?

Okay, I'll divide it in two (and probably lose the "don't return void"
cleanup; but still keep the clearer separation of those two blocks).

> 
> Maybe add VM_BUG_ON(!pmd_present()) in is_huge_zero_pmd()?

Certainly not as part of any patch I'm aiming at stable!  But
I've remembered another approach, I'll say in response to Yang.

> 
> Also I wonder how much code we could remove if we did not establish
> migration ptes for file pages. We can make these page table entries 'none'
> on migration.

I'm not sure how far you're wanting to go with that (just in the THP
case, or file ptes generally?). But you may recall that I disagree,
especially on mlocked vmas, where we break the contract by not using
migration entries.  Anyway, not something to get into here.

Thanks a lot for all your reviews, I'll get on with it.

Hugh
Hugh Dickins June 4, 2021, 9:52 p.m. UTC | #6
On Fri, 4 Jun 2021, Yang Shi wrote:
> On Thu, Jun 3, 2021 at 7:23 PM Hugh Dickins <hughd@google.com> wrote:
> > On Thu, 3 Jun 2021, Yang Shi wrote:
> > > On Tue, Jun 1, 2021 at 2:05 PM Hugh Dickins <hughd@google.com> wrote:
> >
> > The point here (see commit message above) is that some unrelated pmd
> > migration entry could pass the is_huge_zero_pmd() test, which rushes
> > off to use pmd_page() without even checking pmd_present() first.  And
> > most of its users have, one way or another, checked pmd_present() first;
> > but this place and a couple of others had not.
> 
> Thanks for the elaboration. Wondering whether we'd better add some
> comments in the code? Someone may submit a fix patch by visual
> inspection in the future due to missing these points.

I don't really want to add a comment on this, there in zap_huge_pmd():
I think it would be too much of a distraction from that dense code
sequence.  And the comment will be more obvious in the commit message,
once I split these is_huge_zero_pmd() fixes off from
__split_huge_pmd_locked() as Kirill asked.

But... now I think I'll scrap these parts of the patch, and instead
just add a pmd_present() check into is_huge_zero_pmd() itself.
pmd_present() is quick, but pmd_page() may not be: I may convert it
to use a __read_only huge_pmd_pfn, or may not: I'll see how that goes.

Hugh
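
(A sketch of the direction described above: a pmd_present() check inside
is_huge_zero_pmd() itself, comparing against a cached pfn instead of
calling pmd_page().  The huge_zero_pfn name and exact form here are
illustrative, not necessarily what any later patch ends up using:)

	extern unsigned long huge_zero_pfn;	/* hypothetical cached pfn */

	static inline bool is_huge_zero_pmd(pmd_t pmd)
	{
		/*
		 * Check presence first: the pfn bits of a migration entry
		 * are meaningless and must not be trusted.
		 */
		return pmd_present(pmd) &&
		       pmd_pfn(pmd) == READ_ONCE(huge_zero_pfn);
	}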

Patch

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 63ed6b25deaa..9fb7b47da87e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1587,9 +1587,6 @@  bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		goto out_unlocked;
 
 	orig_pmd = *pmd;
-	if (is_huge_zero_pmd(orig_pmd))
-		goto out;
-
 	if (unlikely(!pmd_present(orig_pmd))) {
 		VM_BUG_ON(thp_migration_supported() &&
 				  !is_pmd_migration_entry(orig_pmd));
@@ -1597,6 +1594,9 @@  bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	}
 
 	page = pmd_page(orig_pmd);
+	if (is_huge_zero_page(page))
+		goto out;
+
 	/*
 	 * If other processes are mapping this page, we couldn't discard
 	 * the page unless they all do MADV_FREE so let's skip the page.
@@ -1676,7 +1676,7 @@  int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		spin_unlock(ptl);
 		if (is_huge_zero_pmd(orig_pmd))
 			tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
-	} else if (is_huge_zero_pmd(orig_pmd)) {
+	} else if (pmd_present(orig_pmd) && is_huge_zero_pmd(orig_pmd)) {
 		zap_deposited_table(tlb->mm, pmd);
 		spin_unlock(ptl);
 		tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
@@ -2044,7 +2044,7 @@  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 	count_vm_event(THP_SPLIT_PMD);
 
 	if (!vma_is_anonymous(vma)) {
-		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+		old_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
 		/*
 		 * We are going to unmap this huge page. So
 		 * just go ahead and zap it
@@ -2053,16 +2053,25 @@  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 			zap_deposited_table(mm, pmd);
 		if (vma_is_special_huge(vma))
 			return;
-		page = pmd_page(_pmd);
-		if (!PageDirty(page) && pmd_dirty(_pmd))
-			set_page_dirty(page);
-		if (!PageReferenced(page) && pmd_young(_pmd))
-			SetPageReferenced(page);
-		page_remove_rmap(page, true);
-		put_page(page);
+		if (unlikely(is_pmd_migration_entry(old_pmd))) {
+			swp_entry_t entry;
+
+			entry = pmd_to_swp_entry(old_pmd);
+			page = migration_entry_to_page(entry);
+		} else {
+			page = pmd_page(old_pmd);
+			if (!PageDirty(page) && pmd_dirty(old_pmd))
+				set_page_dirty(page);
+			if (!PageReferenced(page) && pmd_young(old_pmd))
+				SetPageReferenced(page);
+			page_remove_rmap(page, true);
+			put_page(page);
+		}
 		add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
 		return;
-	} else if (pmd_trans_huge(*pmd) && is_huge_zero_pmd(*pmd)) {
+	}
+
+	if (pmd_present(*pmd) && is_huge_zero_pmd(*pmd)) {
 		/*
 		 * FIXME: Do we want to invalidate secondary mmu by calling
 		 * mmu_notifier_invalidate_range() see comments below inside
@@ -2072,7 +2081,8 @@  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		 * small page also write protected so it does not seems useful
 		 * to invalidate secondary mmu at this time.
 		 */
-		return __split_huge_zero_page_pmd(vma, haddr, pmd);
+		__split_huge_zero_page_pmd(vma, haddr, pmd);
+		return;
 	}
 
 	/*
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index c2210e1cdb51..4e640baf9794 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -135,9 +135,8 @@  pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
 {
 	pmd_t pmd;
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-	VM_BUG_ON(!pmd_present(*pmdp));
-	/* Below assumes pmd_present() is true */
-	VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
+	VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
+			   !pmd_devmap(*pmdp));
 	pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
 	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 	return pmd;