
[2/7] mm/thp: try_to_unmap() use TTU_SYNC for safe DEBUG_VM splitting

Message ID alpine.LSU.2.11.2106011405510.2148@eggly.anvils (mailing list archive)
State New, archived
Series mm/thp: fix THP splitting unmap BUGs and related

Commit Message

Hugh Dickins June 1, 2021, 9:07 p.m. UTC
Stressing huge tmpfs often crashed on unmap_page()'s VM_BUG_ON_PAGE
(!unmap_success): with dump_page() showing mapcount:1, but then its
raw struct page output showing _mapcount ffffffff i.e. mapcount 0.

And even if that particular VM_BUG_ON_PAGE(!unmap_success) is removed,
it is immediately followed by a VM_BUG_ON_PAGE(compound_mapcount(head)),
and further down an IS_ENABLED(CONFIG_DEBUG_VM) total_mapcount BUG():
all indicative of some mapcount difficulty in development here perhaps.
But the !CONFIG_DEBUG_VM path handles the failures correctly and silently.

I believe the problem is that once a racing unmap has cleared pte or pmd,
try_to_unmap_one() may skip taking the page table lock, and emerge from
try_to_unmap() before the racing task has reached decrementing mapcount.
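
A sketch of the suspected interleaving (hypothetical, reconstructed from
the description above rather than from a captured trace):

	CPU 0: zap_pte_range()			CPU 1: try_to_unmap_one()
	----------------------			-------------------------
	spin_lock(ptl);
	ptep_get_and_clear_full(...);
						sees pte already cleared, so
						skips taking ptl and returns;
						try_to_unmap() then still reads
						page_mapcount(page) == 1
	page_remove_rmap(page, ...);
	spin_unlock(ptl);

With TTU_SYNC, CPU 1 instead waits on the page table lock, so it cannot
emerge until CPU 0 has called page_remove_rmap() and dropped the lock.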

Instead of abandoning the unsafe VM_BUG_ON_PAGE(), and the ones that
follow, use PVMW_SYNC in try_to_unmap_one() in this case: adding TTU_SYNC
to the options, and passing that from unmap_page() when CONFIG_DEBUG_VM=y.
It could be passed in the non-debug case too, but that would sometimes add
a little overhead, whereas it's rare for this race to result in failure.

mm/memory-failure.c:hwpoison_user_mappings() should probably use the new
TTU_SYNC option too, just in case this race coincides with its attempts to
unmap a failing page (THP or not); but this commit does not add that.
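
Purely as illustration, that possible follow-up might be the one-liner
below (a sketch, not part of this series; the ttu flags line in
hwpoison_user_mappings() is assumed from then-current mm/memory-failure.c):

	--- a/mm/memory-failure.c	(sketch only)
	+++ b/mm/memory-failure.c
	@@ hwpoison_user_mappings():
	-	enum ttu_flags ttu = TTU_IGNORE_MLOCK;
	+	enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC;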

Fixes: fec89c109f3a ("thp: rewrite freeze_page()/unfreeze_page() with generic rmap walkers")
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: <stable@vger.kernel.org>
---
 include/linux/rmap.h |  3 ++-
 mm/huge_memory.c     |  4 ++++
 mm/page_vma_mapped.c |  8 ++++++++
 mm/rmap.c            | 17 ++++++++++++++++-
 4 files changed, 30 insertions(+), 2 deletions(-)

Comments

Alistair Popple June 2, 2021, 1:59 a.m. UTC | #1
On Wednesday, 2 June 2021 7:07:53 AM AEST Hugh Dickins wrote:
> 
> Stressing huge tmpfs often crashed on unmap_page()'s VM_BUG_ON_PAGE
> (!unmap_success): with dump_page() showing mapcount:1, but then its
> raw struct page output showing _mapcount ffffffff i.e. mapcount 0.
> 
> And even if that particular VM_BUG_ON_PAGE(!unmap_success) is removed,
> it is immediately followed by a VM_BUG_ON_PAGE(compound_mapcount(head)),
> and further down an IS_ENABLED(CONFIG_DEBUG_VM) total_mapcount BUG():
> all indicative of some mapcount difficulty in development here perhaps.
> But the !CONFIG_DEBUG_VM path handles the failures correctly and silently.
> 
> I believe the problem is that once a racing unmap has cleared pte or pmd,
> try_to_unmap_one() may skip taking the page table lock, and emerge from
> try_to_unmap() before the racing task has reached decrementing mapcount.
> 
> Instead of abandoning the unsafe VM_BUG_ON_PAGE(), and the ones that
> follow, use PVMW_SYNC in try_to_unmap_one() in this case: adding TTU_SYNC
> to the options, and passing that from unmap_page() when CONFIG_DEBUG_VM=y.
> It could be passed in the non-debug case too, but that would sometimes add
> a little overhead, whereas it's rare for this race to result in failure.
> 
> mm/memory-failure.c:hwpoison_user_mappings() should probably use the new
> TTU_SYNC option too, just in case this race coincides with its attempts to
> unmap a failing page (THP or not); but this commit does not add that.
> 
> Fixes: fec89c109f3a ("thp: rewrite freeze_page()/unfreeze_page() with generic rmap walkers")
> Signed-off-by: Hugh Dickins <hughd@google.com>
> Cc: <stable@vger.kernel.org>
> ---
>  include/linux/rmap.h |  3 ++-
>  mm/huge_memory.c     |  4 ++++
>  mm/page_vma_mapped.c |  8 ++++++++
>  mm/rmap.c            | 17 ++++++++++++++++-
>  4 files changed, 30 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index def5c62c93b3..891599a4cb8d 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -97,7 +97,8 @@ enum ttu_flags {
>                                          * do a final flush if necessary */
>         TTU_RMAP_LOCKED         = 0x80, /* do not grab rmap lock:
>                                          * caller holds it */
> -       TTU_SPLIT_FREEZE        = 0x100,                /* freeze pte under splitting thp */
> +       TTU_SPLIT_FREEZE        = 0x100, /* freeze pte under splitting thp */
> +       TTU_SYNC                = 0x200, /* avoid racy checks with PVMW_SYNC */
>  };
> 
>  #ifdef CONFIG_MMU
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 9fb7b47da87e..305f709a7aca 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2357,6 +2357,10 @@ static void unmap_page(struct page *page)
>         if (PageAnon(page))
>                 ttu_flags |= TTU_SPLIT_FREEZE;
> 
> +       /* Make sure that the BUGs will not bite */
> +       if (IS_ENABLED(CONFIG_DEBUG_VM))
> +               ttu_flags |= TTU_SYNC;
> +
>         unmap_success = try_to_unmap(page, ttu_flags);
>         VM_BUG_ON_PAGE(!unmap_success, page);
>  }
> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> index 2cf01d933f13..b45d22738b45 100644
> --- a/mm/page_vma_mapped.c
> +++ b/mm/page_vma_mapped.c
> @@ -212,6 +212,14 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>                         pvmw->ptl = NULL;
>                 }
>         } else if (!pmd_present(pmde)) {
> +               /*
> +                * If PVMW_SYNC, take and drop THP pmd lock so that we
> +                * cannot return prematurely, while zap_huge_pmd() has
> +                * cleared *pmd but not decremented compound_mapcount().
> +                */
> +               if ((pvmw->flags & PVMW_SYNC) &&
> +                   PageTransCompound(pvmw->page))
> +                       spin_unlock(pmd_lock(mm, pvmw->pmd));
>                 return false;
>         }
>         if (!map_pte(pvmw))
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 693a610e181d..07811b4ae793 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1405,6 +1405,15 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
>         struct mmu_notifier_range range;
>         enum ttu_flags flags = (enum ttu_flags)(long)arg;
> 
> +       /*
> +        * When racing against e.g. zap_pte_range() on another cpu,
> +        * in between its ptep_get_and_clear_full() and page_remove_rmap(),
> +        * try_to_unmap() may return false when it is about to become true,
> +        * if page table locking is skipped: use TTU_SYNC to wait for that.
> +        */
> +       if (flags & TTU_SYNC)
> +               pvmw.flags = PVMW_SYNC;
> +

If this gets applied on top of my series then I think we would also need to
add this to the start of try_to_migrate_one(), as I assume you can hit this
bug regardless of whether we are unmapping or installing swap migration
entries.

We would also need to update the flag check at the start of try_to_migrate() 
to allow passing TTU_SYNC.
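
Something like the following, perhaps (a hypothetical sketch against the
shape of that series; the try_to_migrate_one()/try_to_migrate() bodies and
the existing flag check are assumptions):

	@@ try_to_migrate_one():	(sketch only)
	 	enum ttu_flags flags = (enum ttu_flags)(long)arg;

	+	/* as in try_to_unmap_one(): wait out a racing unmap */
	+	if (flags & TTU_SYNC)
	+		pvmw.flags = PVMW_SYNC;
	+
	@@ try_to_migrate():		(sketch only)
	-	if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD)))
	+	if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
	+				   TTU_SYNC)))
	 		return;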

>         /* munlock has nothing to gain from examining un-locked vmas */
>         if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
>                 return true;
> @@ -1777,7 +1786,13 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags)
>         else
>                 rmap_walk(page, &rwc);
> 
> -       return !page_mapcount(page) ? true : false;
> +       /*
> +        * When racing against e.g. zap_pte_range() on another cpu,
> +        * in between its ptep_get_and_clear_full() and page_remove_rmap(),
> +        * try_to_unmap() may return false when it is about to become true,
> +        * if page table locking is skipped: use TTU_SYNC to wait for that.
> +        */
> +       return !page_mapcount(page);
>  }
> 
>  /**
> --
> 2.32.0.rc0.204.g9fa02ecfa5-goog
Yang Shi June 3, 2021, 9:45 p.m. UTC | #2
On Tue, Jun 1, 2021 at 2:07 PM Hugh Dickins <hughd@google.com> wrote:
>
> Stressing huge tmpfs often crashed on unmap_page()'s VM_BUG_ON_PAGE
> (!unmap_success): with dump_page() showing mapcount:1, but then its
> raw struct page output showing _mapcount ffffffff i.e. mapcount 0.
>
> And even if that particular VM_BUG_ON_PAGE(!unmap_success) is removed,
> it is immediately followed by a VM_BUG_ON_PAGE(compound_mapcount(head)),
> and further down an IS_ENABLED(CONFIG_DEBUG_VM) total_mapcount BUG():
> all indicative of some mapcount difficulty in development here perhaps.
> But the !CONFIG_DEBUG_VM path handles the failures correctly and silently.
>
> I believe the problem is that once a racing unmap has cleared pte or pmd,
> try_to_unmap_one() may skip taking the page table lock, and emerge from
> try_to_unmap() before the racing task has reached decrementing mapcount.
>
> Instead of abandoning the unsafe VM_BUG_ON_PAGE(), and the ones that
> follow, use PVMW_SYNC in try_to_unmap_one() in this case: adding TTU_SYNC
> to the options, and passing that from unmap_page() when CONFIG_DEBUG_VM=y.
> It could be passed in the non-debug case too, but that would sometimes add
> a little overhead, whereas it's rare for this race to result in failure.

The above statement makes me feel this patch is just there to relieve the
VM_BUG_ON; but my patch already changed it to VM_WARN, the race sounds
acceptable (at least not fatal), and the splitting code can handle the
failure case as well. So I'm wondering whether we still need this patch
if it is only used to close the race when CONFIG_DEBUG_VM=y.

>
> mm/memory-failure.c:hwpoison_user_mappings() should probably use the new
> TTU_SYNC option too, just in case this race coincides with its attempts to
> unmap a failing page (THP or not); but this commit does not add that.
>
> Fixes: fec89c109f3a ("thp: rewrite freeze_page()/unfreeze_page() with generic rmap walkers")
> Signed-off-by: Hugh Dickins <hughd@google.com>
> Cc: <stable@vger.kernel.org>
> ---
>  include/linux/rmap.h |  3 ++-
>  mm/huge_memory.c     |  4 ++++
>  mm/page_vma_mapped.c |  8 ++++++++
>  mm/rmap.c            | 17 ++++++++++++++++-
>  4 files changed, 30 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index def5c62c93b3..891599a4cb8d 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -97,7 +97,8 @@ enum ttu_flags {
>                                          * do a final flush if necessary */
>         TTU_RMAP_LOCKED         = 0x80, /* do not grab rmap lock:
>                                          * caller holds it */
> -       TTU_SPLIT_FREEZE        = 0x100,                /* freeze pte under splitting thp */
> +       TTU_SPLIT_FREEZE        = 0x100, /* freeze pte under splitting thp */
> +       TTU_SYNC                = 0x200, /* avoid racy checks with PVMW_SYNC */
>  };
>
>  #ifdef CONFIG_MMU
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 9fb7b47da87e..305f709a7aca 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2357,6 +2357,10 @@ static void unmap_page(struct page *page)
>         if (PageAnon(page))
>                 ttu_flags |= TTU_SPLIT_FREEZE;
>
> +       /* Make sure that the BUGs will not bite */
> +       if (IS_ENABLED(CONFIG_DEBUG_VM))
> +               ttu_flags |= TTU_SYNC;
> +
>         unmap_success = try_to_unmap(page, ttu_flags);
>         VM_BUG_ON_PAGE(!unmap_success, page);
>  }
> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> index 2cf01d933f13..b45d22738b45 100644
> --- a/mm/page_vma_mapped.c
> +++ b/mm/page_vma_mapped.c
> @@ -212,6 +212,14 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>                         pvmw->ptl = NULL;
>                 }
>         } else if (!pmd_present(pmde)) {
> +               /*
> +                * If PVMW_SYNC, take and drop THP pmd lock so that we
> +                * cannot return prematurely, while zap_huge_pmd() has
> +                * cleared *pmd but not decremented compound_mapcount().
> +                */
> +               if ((pvmw->flags & PVMW_SYNC) &&
> +                   PageTransCompound(pvmw->page))
> +                       spin_unlock(pmd_lock(mm, pvmw->pmd));
>                 return false;
>         }
>         if (!map_pte(pvmw))
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 693a610e181d..07811b4ae793 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1405,6 +1405,15 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
>         struct mmu_notifier_range range;
>         enum ttu_flags flags = (enum ttu_flags)(long)arg;
>
> +       /*
> +        * When racing against e.g. zap_pte_range() on another cpu,
> +        * in between its ptep_get_and_clear_full() and page_remove_rmap(),
> +        * try_to_unmap() may return false when it is about to become true,
> +        * if page table locking is skipped: use TTU_SYNC to wait for that.
> +        */
> +       if (flags & TTU_SYNC)
> +               pvmw.flags = PVMW_SYNC;
> +
>         /* munlock has nothing to gain from examining un-locked vmas */
>         if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
>                 return true;
> @@ -1777,7 +1786,13 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags)
>         else
>                 rmap_walk(page, &rwc);
>
> -       return !page_mapcount(page) ? true : false;
> +       /*
> +        * When racing against e.g. zap_pte_range() on another cpu,
> +        * in between its ptep_get_and_clear_full() and page_remove_rmap(),
> +        * try_to_unmap() may return false when it is about to become true,
> +        * if page table locking is skipped: use TTU_SYNC to wait for that.
> +        */
> +       return !page_mapcount(page);
>  }
>
>  /**
> --
> 2.32.0.rc0.204.g9fa02ecfa5-goog
>
Peter Xu June 3, 2021, 9:48 p.m. UTC | #3
On Tue, Jun 01, 2021 at 02:07:53PM -0700, Hugh Dickins wrote:
> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> index 2cf01d933f13..b45d22738b45 100644
> --- a/mm/page_vma_mapped.c
> +++ b/mm/page_vma_mapped.c
> @@ -212,6 +212,14 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>  			pvmw->ptl = NULL;
>  		}
>  	} else if (!pmd_present(pmde)) {
> +		/*
> +		 * If PVMW_SYNC, take and drop THP pmd lock so that we
> +		 * cannot return prematurely, while zap_huge_pmd() has
> +		 * cleared *pmd but not decremented compound_mapcount().
> +		 */
> +		if ((pvmw->flags & PVMW_SYNC) &&
> +		    PageTransCompound(pvmw->page))
> +			spin_unlock(pmd_lock(mm, pvmw->pmd));
>  		return false;
>  	}
>  	if (!map_pte(pvmw))

Sorry if I missed something important, but I'm totally confused on how this
unlock is pairing with another lock()..

And.. isn't PVMW_SYNC only meaningful at pte level (as I didn't see a
reference to it outside map_pte)?
Hugh Dickins June 4, 2021, 2:45 a.m. UTC | #4
On Thu, 3 Jun 2021, Yang Shi wrote:
> On Tue, Jun 1, 2021 at 2:07 PM Hugh Dickins <hughd@google.com> wrote:
> >
> > Instead of abandoning the unsafe VM_BUG_ON_PAGE(), and the ones that
> > follow, use PVMW_SYNC in try_to_unmap_one() in this case: adding TTU_SYNC
> > to the options, and passing that from unmap_page() when CONFIG_DEBUG_VM=y.
> > It could be passed in the non-debug case too, but that would sometimes add
> > a little overhead, whereas it's rare for this race to result in failure.
> 
> The above statement makes me feel this patch is just there to relieve the
> VM_BUG_ON; but my patch already changed it to VM_WARN, the race sounds
> acceptable (at least not fatal), and the splitting code can handle the
> failure case as well. So I'm wondering whether we still need this patch
> if it is only used to close the race when CONFIG_DEBUG_VM=y.

I do agree that your 1/2 (what I'm calling 6.1/7) BUG->WARN patch
is the most important of them; but it didn't get marked for stable,
and has got placed behind conflicting mods never intended for stable.

And a lot of the descriptions had been written in terms of the prior
situation, with VM BUG there: it was easier to keep describing that way.

Whether your fix makes mine redundant is arguable (Wang Yugui thinks
not).  It's easier to argue that it makes the racy ones (like this)
redundant, than the persistent ones (like vma_address or pvm_walk).

Since I know of at least one customer who wants all these fixes in 5.10
longterm, I'm fighting to get them that far at least.  But the further
back they go, the less effort I'll make to backport them - will fall
back to porting your BUG->WARN only.

Hugh
Hugh Dickins June 4, 2021, 2:54 a.m. UTC | #5
On Thu, 3 Jun 2021, Peter Xu wrote:
> On Tue, Jun 01, 2021 at 02:07:53PM -0700, Hugh Dickins wrote:
> > diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> > index 2cf01d933f13..b45d22738b45 100644
> > --- a/mm/page_vma_mapped.c
> > +++ b/mm/page_vma_mapped.c
> > @@ -212,6 +212,14 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
> >  			pvmw->ptl = NULL;
> >  		}
> >  	} else if (!pmd_present(pmde)) {
> > +		/*
> > +		 * If PVMW_SYNC, take and drop THP pmd lock so that we
> > +		 * cannot return prematurely, while zap_huge_pmd() has
> > +		 * cleared *pmd but not decremented compound_mapcount().
> > +		 */
> > +		if ((pvmw->flags & PVMW_SYNC) &&
> > +		    PageTransCompound(pvmw->page))
> > +			spin_unlock(pmd_lock(mm, pvmw->pmd));
> >  		return false;
> >  	}
> >  	if (!map_pte(pvmw))
> 
> Sorry if I missed something important, but I'm totally confused on how this
> unlock is pairing with another lock()..

I imagine you're reading that as spin_unlock(pmd_lockptr(blah));
no, the lock is right there, inside spin_unlock(pmd_lock(blah)).

> 
> And.. isn't PVMW_SYNC only meaningful at pte level (as I didn't see a
> reference to it outside map_pte)?

But you are pointing directly to its reference outside map_pte()!

Hugh
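
For context, map_pte() is where PVMW_SYNC took effect before this patch:
without it, the walk may peek at the pte and bail out without ever taking
the pte lock (abridged from mm/page_vma_mapped.c of this era; the
device-private swap entry handling is elided):

	static bool map_pte(struct page_vma_mapped_walk *pvmw)
	{
		pvmw->pte = pte_offset_map(pvmw->pmd, pvmw->address);
		if (!(pvmw->flags & PVMW_SYNC)) {
			/* racy peek: may return without taking pte lock */
			if (pvmw->flags & PVMW_MIGRATION) {
				if (!is_swap_pte(*pvmw->pte))
					return false;
			} else if (!pte_present(*pvmw->pte))
				return false;
		}
		pvmw->ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd);
		spin_lock(pvmw->ptl);
		return true;
	}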
Peter Xu June 4, 2021, 2:48 p.m. UTC | #6
On Thu, Jun 03, 2021 at 07:54:11PM -0700, Hugh Dickins wrote:
> On Thu, 3 Jun 2021, Peter Xu wrote:
> > On Tue, Jun 01, 2021 at 02:07:53PM -0700, Hugh Dickins wrote:
> > > diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> > > index 2cf01d933f13..b45d22738b45 100644
> > > --- a/mm/page_vma_mapped.c
> > > +++ b/mm/page_vma_mapped.c
> > > @@ -212,6 +212,14 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
> > >  			pvmw->ptl = NULL;
> > >  		}
> > >  	} else if (!pmd_present(pmde)) {
> > > +		/*
> > > +		 * If PVMW_SYNC, take and drop THP pmd lock so that we
> > > +		 * cannot return prematurely, while zap_huge_pmd() has
> > > +		 * cleared *pmd but not decremented compound_mapcount().
> > > +		 */
> > > +		if ((pvmw->flags & PVMW_SYNC) &&
> > > +		    PageTransCompound(pvmw->page))
> > > +			spin_unlock(pmd_lock(mm, pvmw->pmd));
> > >  		return false;
> > >  	}
> > >  	if (!map_pte(pvmw))
> > 
> > Sorry if I missed something important, but I'm totally confused on how this
> > unlock is pairing with another lock()..
> 
> I imagine you're reading that as spin_unlock(pmd_lockptr(blah));
> no, the lock is right there, inside spin_unlock(pmd_lock(blah)).

Heh, yeah... Sorry about that.

> 
> > 
> > And.. isn't PVMW_SYNC only meaningful at pte level (as I didn't see a
> > reference to it outside map_pte)?
> 
> But you are pointing directly to its reference outside map_pte()!

Right, I was trying to look for the lock() so I needed to look at all the rest
besides this one. :)

I didn't follow Yang's patch, but if Yang's patch can keep the kernel from
crashing, with fault handling all done well, then I kind of agree with him:
having workaround code (like taking a lock and quickly releasing it..) only
for debug code seems like overkill to me, not to mention that the debug code
will be even more strict after this patch, as it means it's even less likely
that one can reproduce a production host race with DEBUG_VM..  Normally
debugging code affects code execution already, and for this one we're
enlarging that gap "explicitly" - not sure whether that's good.

This also makes me curious: what if we make !DEBUG_VM strict too - how much
perf would we lose?  I haven't tried to think it through in detail, but just
to raise it: is there any chance we could make the debug and non-debug paths
run the same logic (i.e. the SYNC version)?
Kirill A. Shutemov June 4, 2021, 3:47 p.m. UTC | #7
On Thu, Jun 03, 2021 at 07:54:11PM -0700, Hugh Dickins wrote:
> On Thu, 3 Jun 2021, Peter Xu wrote:
> > On Tue, Jun 01, 2021 at 02:07:53PM -0700, Hugh Dickins wrote:
> > > diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> > > index 2cf01d933f13..b45d22738b45 100644
> > > --- a/mm/page_vma_mapped.c
> > > +++ b/mm/page_vma_mapped.c
> > > @@ -212,6 +212,14 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
> > >  			pvmw->ptl = NULL;
> > >  		}
> > >  	} else if (!pmd_present(pmde)) {
> > > +		/*
> > > +		 * If PVMW_SYNC, take and drop THP pmd lock so that we
> > > +		 * cannot return prematurely, while zap_huge_pmd() has
> > > +		 * cleared *pmd but not decremented compound_mapcount().
> > > +		 */
> > > +		if ((pvmw->flags & PVMW_SYNC) &&
> > > +		    PageTransCompound(pvmw->page))
> > > +			spin_unlock(pmd_lock(mm, pvmw->pmd));
> > >  		return false;
> > >  	}
> > >  	if (!map_pte(pvmw))
> > 
> > Sorry if I missed something important, but I'm totally confused on how this
> > unlock is pairing with another lock()..
> 
> I imagine you're reading that as spin_unlock(pmd_lockptr(blah));
> no, the lock is right there, inside spin_unlock(pmd_lock(blah)).

Oh.. It made me scratch my head too. Could you rewrite it in a more readable
way?
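
One more readable spelling names the lock before dropping it (a sketch of
how the same hunk might be respun):

	} else if (!pmd_present(pmde)) {
		/*
		 * If PVMW_SYNC, take and drop THP pmd lock so that we
		 * cannot return prematurely, while zap_huge_pmd() has
		 * cleared *pmd but not decremented compound_mapcount().
		 */
		if ((pvmw->flags & PVMW_SYNC) &&
		    PageTransCompound(pvmw->page)) {
			spinlock_t *ptl = pmd_lock(mm, pvmw->pmd);

			spin_unlock(ptl);
		}
		return false;
	}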
Yang Shi June 4, 2021, 6:24 p.m. UTC | #8
On Thu, Jun 3, 2021 at 7:45 PM Hugh Dickins <hughd@google.com> wrote:
>
> On Thu, 3 Jun 2021, Yang Shi wrote:
> > On Tue, Jun 1, 2021 at 2:07 PM Hugh Dickins <hughd@google.com> wrote:
> > >
> > > Instead of abandoning the unsafe VM_BUG_ON_PAGE(), and the ones that
> > > follow, use PVMW_SYNC in try_to_unmap_one() in this case: adding TTU_SYNC
> > > to the options, and passing that from unmap_page() when CONFIG_DEBUG_VM=y.
> > > It could be passed in the non-debug case too, but that would sometimes add
> > > a little overhead, whereas it's rare for this race to result in failure.
> >
> > The above statement makes me feel this patch is just there to relieve the
> > VM_BUG_ON; but my patch already changed it to VM_WARN, the race sounds
> > acceptable (at least not fatal), and the splitting code can handle the
> > failure case as well. So I'm wondering whether we still need this patch
> > if it is only used to close the race when CONFIG_DEBUG_VM=y.
>
> I do agree that your 1/2 (what I'm calling 6.1/7) BUG->WARN patch
> is the most important of them; but it didn't get marked for stable,
> and has got placed behind conflicting mods never intended for stable.

Sorry for not marking it for stable. I have no objection to having it in
stable; I just forgot to cc stable. :-(

>
> And a lot of the descriptions had been written in terms of the prior
> situation, with VM BUG there: it was easier to keep describing that way.

Understood.

>
> Whether your fix makes mine redundant is arguable (Wang Yugui thinks
> not).  It's easier to argue that it makes the racy ones (like this)
> redundant, than the persistent ones (like vma_address or pvm_walk).

The point is that if we just close the race for the DEBUG_VM case, it
doesn't sound worth it to me IMHO. How about making it apply for !DEBUG_VM
anyway? How big could the overhead be? It looks like the heavy lifting (tlb
shootdown and page free) is done after the pmd lock is released, so it
should not block pvmw for a long time.

>
> Since I know of at least one customer who wants all these fixes in 5.10
> longterm, I'm fighting to get them that far at least.  But the further
> back they go, the less effort I'll make to backport them - will fall
> back to porting your BUG->WARN only.
>
> Hugh
Hugh Dickins June 4, 2021, 10:26 p.m. UTC | #9
On Fri, 4 Jun 2021, Peter Xu wrote:
> On Thu, Jun 03, 2021 at 07:54:11PM -0700, Hugh Dickins wrote:
> > On Thu, 3 Jun 2021, Peter Xu wrote:
> > > On Tue, Jun 01, 2021 at 02:07:53PM -0700, Hugh Dickins wrote:
> > > > diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> > > > index 2cf01d933f13..b45d22738b45 100644
> > > > --- a/mm/page_vma_mapped.c
> > > > +++ b/mm/page_vma_mapped.c
> > > > @@ -212,6 +212,14 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
> > > >  			pvmw->ptl = NULL;
> > > >  		}
> > > >  	} else if (!pmd_present(pmde)) {
> > > > +		/*
> > > > +		 * If PVMW_SYNC, take and drop THP pmd lock so that we
> > > > +		 * cannot return prematurely, while zap_huge_pmd() has
> > > > +		 * cleared *pmd but not decremented compound_mapcount().
> > > > +		 */
> > > > +		if ((pvmw->flags & PVMW_SYNC) &&
> > > > +		    PageTransCompound(pvmw->page))
> > > > +			spin_unlock(pmd_lock(mm, pvmw->pmd));
> > > >  		return false;
> > > >  	}
> > > >  	if (!map_pte(pvmw))
> > > 
> > > Sorry if I missed something important, but I'm totally confused on how this
> > > unlock is pairing with another lock()..
> > 
> > I imagine you're reading that as spin_unlock(pmd_lockptr(blah));
> > no, the lock is right there, inside spin_unlock(pmd_lock(blah)).
> 
> Heh, yeah... Sorry about that.

I'll expand that line, as Kirill asks too.

> 
> > 
> > > 
> > > And.. isn't PVMW_SYNC only meaningful at pte level (as I didn't see a
> > > reference to it outside map_pte)?
> > 
> > But you are pointing directly to its reference outside map_pte()!
> 
> Right, I was trying to look for the lock() so I needed to look at all the rest
> besides this one. :)
> 
> I didn't follow Yang's patch, but if Yang's patch can keep the kernel from
> crashing, with fault handling all done well, then I kind of agree with him:
> having workaround code (like taking a lock and quickly releasing it..) only
> for debug code seems like overkill to me, not to mention that the debug code
> will be even more strict after this patch, as it means it's even less likely
> that one can reproduce a production host race with DEBUG_VM..  Normally
> debugging code affects code execution already, and for this one we're
> enlarging that gap "explicitly" - not sure whether that's good.
> 
> This also makes me curious: what if we make !DEBUG_VM strict too - how much
> perf would we lose?  I haven't tried to think it through in detail, but just
> to raise it: is there any chance we could make the debug and non-debug paths
> run the same logic (i.e. the SYNC version)?

And Yang Shi suggests the same.

Yes, I'm not fond of doing that differently for DEBUG_VM or not;
but could never quite decide which way to jump.

For so long as we worry about whether split_huge_page() is working
correctly (and Wang Yugui still has a case that we have not solved),
we do want the warning; and for so long as we have the warning, we
do need the TTU_SYNC to prevent showing the warning unnecessarily.

How much overhead would be added by doing TTU_SYNC on !DEBUG_VM too?  On any
sensible anon THP case, I don't think it could add overhead at all.
But in some shmem cases (multiply mapped but sparsely populated,
populated differently by different tasks) it could add overhead:
dirtying lock cachelines in tasks which don't have the page mapped.

But we're only talking about huge page splitting, that should not
be #1 on anyone's performance list; and has all sorts of other
overheads of its own.  I think I'll go with your preference, and
make this TTU_SYNC for all.  We can easily revert to DEBUG_VM only
if some regression is noticed.

Hugh
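
With TTU_SYNC applied unconditionally, as decided above, the unmap_page()
hunk would presumably collapse to something like this (a sketch; the
actual respin may differ):

	static void unmap_page(struct page *page)
	{
		enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_SYNC |
			TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
		bool unmap_success;

		VM_BUG_ON_PAGE(!PageHead(page), page);

		if (PageAnon(page))
			ttu_flags |= TTU_SPLIT_FREEZE;

		unmap_success = try_to_unmap(page, ttu_flags);
		VM_BUG_ON_PAGE(!unmap_success, page);
	}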

Patch

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index def5c62c93b3..891599a4cb8d 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -97,7 +97,8 @@  enum ttu_flags {
 					 * do a final flush if necessary */
 	TTU_RMAP_LOCKED		= 0x80,	/* do not grab rmap lock:
 					 * caller holds it */
-	TTU_SPLIT_FREEZE	= 0x100,		/* freeze pte under splitting thp */
+	TTU_SPLIT_FREEZE	= 0x100, /* freeze pte under splitting thp */
+	TTU_SYNC		= 0x200, /* avoid racy checks with PVMW_SYNC */
 };
 
 #ifdef CONFIG_MMU
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9fb7b47da87e..305f709a7aca 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2357,6 +2357,10 @@  static void unmap_page(struct page *page)
 	if (PageAnon(page))
 		ttu_flags |= TTU_SPLIT_FREEZE;
 
+	/* Make sure that the BUGs will not bite */
+	if (IS_ENABLED(CONFIG_DEBUG_VM))
+		ttu_flags |= TTU_SYNC;
+
 	unmap_success = try_to_unmap(page, ttu_flags);
 	VM_BUG_ON_PAGE(!unmap_success, page);
 }
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 2cf01d933f13..b45d22738b45 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -212,6 +212,14 @@  bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
 			pvmw->ptl = NULL;
 		}
 	} else if (!pmd_present(pmde)) {
+		/*
+		 * If PVMW_SYNC, take and drop THP pmd lock so that we
+		 * cannot return prematurely, while zap_huge_pmd() has
+		 * cleared *pmd but not decremented compound_mapcount().
+		 */
+		if ((pvmw->flags & PVMW_SYNC) &&
+		    PageTransCompound(pvmw->page))
+			spin_unlock(pmd_lock(mm, pvmw->pmd));
 		return false;
 	}
 	if (!map_pte(pvmw))
diff --git a/mm/rmap.c b/mm/rmap.c
index 693a610e181d..07811b4ae793 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1405,6 +1405,15 @@  static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	struct mmu_notifier_range range;
 	enum ttu_flags flags = (enum ttu_flags)(long)arg;
 
+	/*
+	 * When racing against e.g. zap_pte_range() on another cpu,
+	 * in between its ptep_get_and_clear_full() and page_remove_rmap(),
+	 * try_to_unmap() may return false when it is about to become true,
+	 * if page table locking is skipped: use TTU_SYNC to wait for that.
+	 */
+	if (flags & TTU_SYNC)
+		pvmw.flags = PVMW_SYNC;
+
 	/* munlock has nothing to gain from examining un-locked vmas */
 	if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
 		return true;
@@ -1777,7 +1786,13 @@  bool try_to_unmap(struct page *page, enum ttu_flags flags)
 	else
 		rmap_walk(page, &rwc);
 
-	return !page_mapcount(page) ? true : false;
+	/*
+	 * When racing against e.g. zap_pte_range() on another cpu,
+	 * in between its ptep_get_and_clear_full() and page_remove_rmap(),
+	 * try_to_unmap() may return false when it is about to become true,
+	 * if page table locking is skipped: use TTU_SYNC to wait for that.
+	 */
+	return !page_mapcount(page);
 }
 
 /**