
[v6,02/15] mm/khugepaged: record SCAN_PMD_MAPPED when scan_pmd() finds THP

Message ID 20220604004004.954674-3-zokeefe@google.com (mailing list archive)
State New
Series mm: userspace hugepage collapse

Commit Message

Zach O'Keefe June 4, 2022, 12:39 a.m. UTC
When scanning an anon pmd to see if it's eligible for collapse, return
SCAN_PMD_MAPPED if the pmd already maps a THP.  Note that
SCAN_PMD_MAPPED is different from SCAN_PAGE_COMPOUND used in the
file-collapse path, since the latter might identify pte-mapped compound
pages.  This is required by MADV_COLLAPSE which necessarily needs to
know what hugepage-aligned/sized regions are already pmd-mapped.

Signed-off-by: Zach O'Keefe <zokeefe@google.com>
---
 include/trace/events/huge_memory.h |  1 +
 mm/internal.h                      |  1 +
 mm/khugepaged.c                    | 32 ++++++++++++++++++++++++++----
 mm/rmap.c                          | 15 ++++++++++++--
 4 files changed, 43 insertions(+), 6 deletions(-)
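
To make the "pmd-mapped THP" vs "pte-mapped compound page" distinction
concrete, here is a minimal illustrative sketch (not part of the patch;
the two wrapper functions are hypothetical, and locking and error
handling are elided):

/* Sketch: the case the anon path reports as SCAN_PMD_MAPPED. */
static bool sketch_pmd_maps_thp(pmd_t *pmd)
{
	pmd_t pmde = pmd_read_atomic(pmd);

	/* The pmd entry itself maps a huge page. */
	return pmd_present(pmde) && pmd_trans_huge(pmde);
}

/* Sketch: the case SCAN_PAGE_COMPOUND can describe in the file path. */
static bool sketch_ptes_map_compound(struct vm_area_struct *vma,
				     pmd_t *pmd, unsigned long address)
{
	pte_t *pte = pte_offset_map(pmd, address);
	struct page *page = vm_normal_page(vma, address, *pte);
	/*
	 * A compound page mapped pte-by-pte; a pmd-level mapping may
	 * still be installed for it later, so collapse work can remain.
	 */
	bool ret = page && PageCompound(page);

	pte_unmap(pte);
	return ret;
}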

Comments

Yang Shi June 6, 2022, 8:45 p.m. UTC | #1
On Fri, Jun 3, 2022 at 5:40 PM Zach O'Keefe <zokeefe@google.com> wrote:
>
> When scanning an anon pmd to see if it's eligible for collapse, return
> SCAN_PMD_MAPPED if the pmd already maps a THP.  Note that
> SCAN_PMD_MAPPED is different from SCAN_PAGE_COMPOUND used in the
> file-collapse path, since the latter might identify pte-mapped compound
> pages.  This is required by MADV_COLLAPSE which necessarily needs to
> know what hugepage-aligned/sized regions are already pmd-mapped.
>
> Signed-off-by: Zach O'Keefe <zokeefe@google.com>
> ---
>  include/trace/events/huge_memory.h |  1 +
>  mm/internal.h                      |  1 +
>  mm/khugepaged.c                    | 32 ++++++++++++++++++++++++++----
>  mm/rmap.c                          | 15 ++++++++++++--
>  4 files changed, 43 insertions(+), 6 deletions(-)
>
> diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
> index d651f3437367..55392bf30a03 100644
> --- a/include/trace/events/huge_memory.h
> +++ b/include/trace/events/huge_memory.h
> @@ -11,6 +11,7 @@
>         EM( SCAN_FAIL,                  "failed")                       \
>         EM( SCAN_SUCCEED,               "succeeded")                    \
>         EM( SCAN_PMD_NULL,              "pmd_null")                     \
> +       EM( SCAN_PMD_MAPPED,            "page_pmd_mapped")              \
>         EM( SCAN_EXCEED_NONE_PTE,       "exceed_none_pte")              \
>         EM( SCAN_EXCEED_SWAP_PTE,       "exceed_swap_pte")              \
>         EM( SCAN_EXCEED_SHARED_PTE,     "exceed_shared_pte")            \
> diff --git a/mm/internal.h b/mm/internal.h
> index 6e14749ad1e5..f768c7fae668 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -188,6 +188,7 @@ extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason
>  /*
>   * in mm/rmap.c:
>   */
> +pmd_t *mm_find_pmd_raw(struct mm_struct *mm, unsigned long address);
>  extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
>
>  /*
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index cc3d6fb446d5..7a914ca19e96 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -28,6 +28,7 @@ enum scan_result {
>         SCAN_FAIL,
>         SCAN_SUCCEED,
>         SCAN_PMD_NULL,
> +       SCAN_PMD_MAPPED,
>         SCAN_EXCEED_NONE_PTE,
>         SCAN_EXCEED_SWAP_PTE,
>         SCAN_EXCEED_SHARED_PTE,
> @@ -901,6 +902,31 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
>         return 0;
>  }
>
> +static int find_pmd_or_thp_or_none(struct mm_struct *mm,
> +                                  unsigned long address,
> +                                  pmd_t **pmd)
> +{
> +       pmd_t pmde;
> +
> +       *pmd = mm_find_pmd_raw(mm, address);
> +       if (!*pmd)
> +               return SCAN_PMD_NULL;
> +
> +       pmde = pmd_read_atomic(*pmd);
> +
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +       /* See comments in pmd_none_or_trans_huge_or_clear_bad() */
> +       barrier();
> +#endif
> +       if (!pmd_present(pmde))
> +               return SCAN_PMD_NULL;
> +       if (pmd_trans_huge(pmde))
> +               return SCAN_PMD_MAPPED;
> +       if (pmd_bad(pmde))
> +               return SCAN_FAIL;

khugepaged didn't handle pmd_bad before; IIRC it may just return
SCAN_SUCCEED if everything else is good. It is fine to add the check,
but it may be better to return SCAN_PMD_NULL?


> +       return SCAN_SUCCEED;
> +}
> +
>  /*
>   * Bring missing pages in from swap, to complete THP collapse.
>   * Only done if khugepaged_scan_pmd believes it is worthwhile.
> @@ -1146,11 +1172,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>
>         VM_BUG_ON(address & ~HPAGE_PMD_MASK);
>
> -       pmd = mm_find_pmd(mm, address);
> -       if (!pmd) {
> -               result = SCAN_PMD_NULL;
> +       result = find_pmd_or_thp_or_none(mm, address, &pmd);
> +       if (result != SCAN_SUCCEED)

There are a couple of other callsites of mm_find_pmd(); you may need
to change all of them to find_pmd_or_thp_or_none() for MADV_COLLAPSE,
since khugepaged may collapse the area before MADV_COLLAPSE reacquires
mmap_lock, IIUC, and MADV_COLLAPSE does care about this case. It is
fine w/o MADV_COLLAPSE since khugepaged doesn't care whether it is PMD
mapped or not.

So it may be better to move this patch right before MADV_COLLAPSE is introduced?
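
As an illustration of that callsite conversion, another khugepaged user
of mm_find_pmd() such as collapse_huge_page() would presumably end up
with something like this (sketch only; the error label and surrounding
locking are elided):

/* Detect a THP installed by a racing collapse, not just a NULL pmd. */
result = find_pmd_or_thp_or_none(mm, address, &pmd);
if (result != SCAN_SUCCEED)
	goto out;	/* actual label in collapse_huge_page() elided */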

>                 goto out;
> -       }
>
>         memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
>         pte = pte_offset_map_lock(mm, pmd, address, &ptl);
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 04fac1af870b..c9979c6ad7a1 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -767,13 +767,12 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
>         return vma_address(page, vma);
>  }
>
> -pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
> +pmd_t *mm_find_pmd_raw(struct mm_struct *mm, unsigned long address)

May be better to have some notes for mm_find_pmd_raw() and mm_find_pmd().

>  {
>         pgd_t *pgd;
>         p4d_t *p4d;
>         pud_t *pud;
>         pmd_t *pmd = NULL;
> -       pmd_t pmde;
>
>         pgd = pgd_offset(mm, address);
>         if (!pgd_present(*pgd))
> @@ -788,6 +787,18 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
>                 goto out;
>
>         pmd = pmd_offset(pud, address);
> +out:
> +       return pmd;
> +}
> +
> +pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
> +{
> +       pmd_t pmde;
> +       pmd_t *pmd;
> +
> +       pmd = mm_find_pmd_raw(mm, address);
> +       if (!pmd)
> +               goto out;
>         /*
>          * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
>          * without holding anon_vma lock for write.  So when looking for a
> --
> 2.36.1.255.ge46751e96f-goog
>
Zach O'Keefe June 7, 2022, 4:01 p.m. UTC | #2
On Mon, Jun 6, 2022 at 1:46 PM Yang Shi <shy828301@gmail.com> wrote:
>
> On Fri, Jun 3, 2022 at 5:40 PM Zach O'Keefe <zokeefe@google.com> wrote:
> > [...]
> > +       if (!pmd_present(pmde))
> > +               return SCAN_PMD_NULL;
> > +       if (pmd_trans_huge(pmde))
> > +               return SCAN_PMD_MAPPED;
> > +       if (pmd_bad(pmde))
> > +               return SCAN_FAIL;
>
> khugepaged didn't handle pmd_bad before; IIRC it may just return
> SCAN_SUCCEED if everything else is good. It is fine to add the check,
> but it may be better to return SCAN_PMD_NULL?

Correct, pmd_bad() wasn't handled before. I actually don't know how a
bad pmd might arise in the wild (would love to actually know this),
but I don't see the check hurting (might be overly conservative
though).  Conversely, I'm not sure where things go astray currently if
the pmd is bad. I guess it depends on how the flags are mutated.
Returning SCAN_PMD_NULL SGTM.
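
For concreteness, a sketch of the tail of find_pmd_or_thp_or_none()
with that agreed change:

if (!pmd_present(pmde))
	return SCAN_PMD_NULL;
if (pmd_trans_huge(pmde))
	return SCAN_PMD_MAPPED;
if (pmd_bad(pmde))
	return SCAN_PMD_NULL;	/* was SCAN_FAIL in v6 */
return SCAN_SUCCEED;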

>
> > [...]
> > -       pmd = mm_find_pmd(mm, address);
> > -       if (!pmd) {
> > -               result = SCAN_PMD_NULL;
> > +       result = find_pmd_or_thp_or_none(mm, address, &pmd);
> > +       if (result != SCAN_SUCCEED)
>
> There are a couple of other callsites of mm_find_pmd(); you may need
> to change all of them to find_pmd_or_thp_or_none() for MADV_COLLAPSE,
> since khugepaged may collapse the area before MADV_COLLAPSE reacquires
> mmap_lock, IIUC, and MADV_COLLAPSE does care about this case. It is
> fine w/o MADV_COLLAPSE since khugepaged doesn't care whether it is PMD
> mapped or not.

Ya, I was just questioning the same thing after responding above - at
least w.r.t. whether the pmd_bad() check also needs to be in these
callsites (the check for pmd mapping that you mention is, I think,
definitely necessary). Thanks for catching this!

> So it may be better to move this patch right before MADV_COLLAPSE is introduced?

I think this should be ok - I'll give it a try at least.

Again, thank you for taking the time to thoroughly review this.

Best,
Zach

Zach O'Keefe June 7, 2022, 7:32 p.m. UTC | #3
On Tue, Jun 7, 2022 at 9:01 AM Zach O'Keefe <zokeefe@google.com> wrote:
>
> On Mon, Jun 6, 2022 at 1:46 PM Yang Shi <shy828301@gmail.com> wrote:
> > > [...]
> > > -pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
> > > +pmd_t *mm_find_pmd_raw(struct mm_struct *mm, unsigned long address)
> >
> > May be better to have some notes for mm_find_pmd_raw() and mm_find_pmd().
> >

Agreed. Looking over this code again, there are only 3 users of mm_find_pmd():

1) khugepaged
2) ksm (replace_page())
3) split_huge_pmd_address()

Once khugepaged codepaths care about THP-pmds, ksm is the only
remaining user that really wants a pte-mapping pmd.

I've gone and consolidated the open-coded page-table walk in
split_huge_pmd_address() to use mm_find_pmd_raw().

I've also done a name switch:

mm_find_pmd() -> mm_find_pte_pmd()
mm_find_pmd_raw() -> mm_find_pmd()

This basically reverts mm_find_pmd() to its behavior before commit
f72e7dcdd252 ("mm: let mm_find_pmd fix buggy race with THP fault"),
and special-cases (what will be, after MADV_COLLAPSE file support)
the only remaining callsite which *doesn't* care about THP-pmds (ksm).
The naming here is a little more meaningful than "*raw", and IMHO more
readable.


Yang Shi June 7, 2022, 9:27 p.m. UTC | #4
On Tue, Jun 7, 2022 at 12:33 PM Zach O'Keefe <zokeefe@google.com> wrote:
>
> > > > [...]
> > > > -pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
> > > > +pmd_t *mm_find_pmd_raw(struct mm_struct *mm, unsigned long address)
> > >
> > > May be better to have some notes for mm_find_pmd_raw() and mm_find_pmd().
> > >
>
> Agreed. Looking over this code again, there are only 3 users of mm_find_pmd():
>
> 1) khugepaged
> 2) ksm (replace_page())
> 3) split_huge_pmd_address()
>
> Once khugepaged codepaths care about THP-pmds, ksm is the only
> remaining user that really wants a pte-mapping pmd.
>
> I've gone and consolidated the open-coded page-table walk in
> split_huge_pmd_address() to use mm_find_pmd_raw().
>
> I've also done a name switch:
>
> mm_find_pmd() -> mm_find_pte_pmd()
> mm_find_pmd_raw() -> mm_find_pmd()

If ksm is the only user of *current* mm_find_pmd(), I think you should
be able to open code it w/o introducing mm_find_pte_pmd() and revert
mm_find_pmd() to its *old* behavior.

Zach O'Keefe June 8, 2022, 12:27 a.m. UTC | #5
On Tue, Jun 7, 2022 at 2:27 PM Yang Shi <shy828301@gmail.com> wrote:
>
> > [...]
> > I've also done a name switch:
> >
> > mm_find_pmd() -> mm_find_pte_pmd()
> > mm_find_pmd_raw() -> mm_find_pmd()
>
> If ksm is the only user of *current* mm_find_pmd(), I think you should
> be able to open code it w/o introducing mm_find_pte_pmd() and revert
> mm_find_pmd() to its *old* behavior.

SGTM. Tried it out and it looks fine. Thanks for the suggestion.
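
A rough sketch of the resulting open-coded check in ksm's
replace_page() (reusing the logic from the current mm_find_pmd() body;
exact placement assumed):

pmd_t *pmd = mm_find_pmd(mm, addr);	/* now just the raw walk */
pmd_t pmde;

if (!pmd)
	goto out;
/*
 * ksm replaces a page mapped by a pte, so a pmd-mapped THP (including
 * one installed by a racing THP fault) must now be rejected here
 * instead of inside mm_find_pmd().
 */
pmde = pmd_read_atomic(pmd);
if (!pmd_present(pmde) || pmd_trans_huge(pmde))
	goto out;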


Patch

diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index d651f3437367..55392bf30a03 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -11,6 +11,7 @@ 
 	EM( SCAN_FAIL,			"failed")			\
 	EM( SCAN_SUCCEED,		"succeeded")			\
 	EM( SCAN_PMD_NULL,		"pmd_null")			\
+	EM( SCAN_PMD_MAPPED,		"page_pmd_mapped")		\
 	EM( SCAN_EXCEED_NONE_PTE,	"exceed_none_pte")		\
 	EM( SCAN_EXCEED_SWAP_PTE,	"exceed_swap_pte")		\
 	EM( SCAN_EXCEED_SHARED_PTE,	"exceed_shared_pte")		\
diff --git a/mm/internal.h b/mm/internal.h
index 6e14749ad1e5..f768c7fae668 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -188,6 +188,7 @@  extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason
 /*
  * in mm/rmap.c:
  */
+pmd_t *mm_find_pmd_raw(struct mm_struct *mm, unsigned long address);
 extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
 
 /*
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index cc3d6fb446d5..7a914ca19e96 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -28,6 +28,7 @@  enum scan_result {
 	SCAN_FAIL,
 	SCAN_SUCCEED,
 	SCAN_PMD_NULL,
+	SCAN_PMD_MAPPED,
 	SCAN_EXCEED_NONE_PTE,
 	SCAN_EXCEED_SWAP_PTE,
 	SCAN_EXCEED_SHARED_PTE,
@@ -901,6 +902,31 @@  static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
 	return 0;
 }
 
+static int find_pmd_or_thp_or_none(struct mm_struct *mm,
+				   unsigned long address,
+				   pmd_t **pmd)
+{
+	pmd_t pmde;
+
+	*pmd = mm_find_pmd_raw(mm, address);
+	if (!*pmd)
+		return SCAN_PMD_NULL;
+
+	pmde = pmd_read_atomic(*pmd);
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	/* See comments in pmd_none_or_trans_huge_or_clear_bad() */
+	barrier();
+#endif
+	if (!pmd_present(pmde))
+		return SCAN_PMD_NULL;
+	if (pmd_trans_huge(pmde))
+		return SCAN_PMD_MAPPED;
+	if (pmd_bad(pmde))
+		return SCAN_FAIL;
+	return SCAN_SUCCEED;
+}
+
 /*
  * Bring missing pages in from swap, to complete THP collapse.
  * Only done if khugepaged_scan_pmd believes it is worthwhile.
@@ -1146,11 +1172,9 @@  static int khugepaged_scan_pmd(struct mm_struct *mm,
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
-	pmd = mm_find_pmd(mm, address);
-	if (!pmd) {
-		result = SCAN_PMD_NULL;
+	result = find_pmd_or_thp_or_none(mm, address, &pmd);
+	if (result != SCAN_SUCCEED)
 		goto out;
-	}
 
 	memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
diff --git a/mm/rmap.c b/mm/rmap.c
index 04fac1af870b..c9979c6ad7a1 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -767,13 +767,12 @@  unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
 	return vma_address(page, vma);
 }
 
-pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
+pmd_t *mm_find_pmd_raw(struct mm_struct *mm, unsigned long address)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd = NULL;
-	pmd_t pmde;
 
 	pgd = pgd_offset(mm, address);
 	if (!pgd_present(*pgd))
@@ -788,6 +787,18 @@  pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
 		goto out;
 
 	pmd = pmd_offset(pud, address);
+out:
+	return pmd;
+}
+
+pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
+{
+	pmd_t pmde;
+	pmd_t *pmd;
+
+	pmd = mm_find_pmd_raw(mm, address);
+	if (!pmd)
+		goto out;
 	/*
 	 * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
 	 * without holding anon_vma lock for write.  So when looking for a