diff mbox series

[RFC,07/14] mm/khugepaged: add vm_flags_ignore to hugepage_vma_revalidate_pmd_count()

Message ID 20220308213417.1407042-8-zokeefe@google.com (mailing list archive)
State New
Headers show
Series mm: userspace hugepage collapse | expand

Commit Message

Zach O'Keefe March 8, 2022, 9:34 p.m. UTC
In madvise collapse context, we optionally want to be able to ignore
advice from MADV_NOHUGEPAGE-marked regions.

Add a vm_flags_ignore argument to hugepage_vma_revalidate_pmd_count()
which can be used to ignore vm flags used when considering thp
eligibility.

Signed-off-by: Zach O'Keefe <zokeefe@google.com>
---
 mm/khugepaged.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

Comments

Yang Shi March 9, 2022, 11:17 p.m. UTC | #1
On Tue, Mar 8, 2022 at 1:35 PM Zach O'Keefe <zokeefe@google.com> wrote:
>
> In madvise collapse context, we optionally want to be able to ignore
> advice from MADV_NOHUGEPAGE-marked regions.

Could you please elaborate why this usecase is valid? Typically
MADV_NOHUGEPAGE is set when the users really don't want to have THP
for this area. So it doesn't make too much sense to ignore it IMHO.

>
> Add a vm_flags_ignore argument to hugepage_vma_revalidate_pmd_count()
> which can be used to ignore vm flags used when considering thp
> eligibility.
>
> Signed-off-by: Zach O'Keefe <zokeefe@google.com>
> ---
>  mm/khugepaged.c | 18 ++++++++++++------
>  1 file changed, 12 insertions(+), 6 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 1d20be47bcea..ecbd3fc41c80 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -964,10 +964,14 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
>  #endif
>
>  /*
> - * Revalidate a vma's eligibility to collapse nr hugepages.
> + * Revalidate a vma's eligibility to collapse nr hugepages. vm_flags_ignore
> + * can be used to ignore certain vma_flags that would otherwise be checked -
> + * the principal example being VM_NOHUGEPAGE which is ignored in madvise
> + * collapse context.
>   */
>  static int hugepage_vma_revalidate_pmd_count(struct mm_struct *mm,
>                                              unsigned long address, int nr,
> +                                            unsigned long vm_flags_ignore,
>                                              struct vm_area_struct **vmap)
>  {
>         struct vm_area_struct *vma;
> @@ -986,7 +990,7 @@ static int hugepage_vma_revalidate_pmd_count(struct mm_struct *mm,
>         hend = vma->vm_end & HPAGE_PMD_MASK;
>         if (address < hstart || (address + nr * HPAGE_PMD_SIZE) > hend)
>                 return SCAN_ADDRESS_RANGE;
> -       if (!hugepage_vma_check(vma, vma->vm_flags))
> +       if (!hugepage_vma_check(vma, vma->vm_flags & ~vm_flags_ignore))
>                 return SCAN_VMA_CHECK;
>         /* Anon VMA expected */
>         if (!vma->anon_vma || vma->vm_ops)
> @@ -1000,9 +1004,11 @@ static int hugepage_vma_revalidate_pmd_count(struct mm_struct *mm,
>   */
>
>  static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
> +                                  unsigned long vm_flags_ignore,
>                                    struct vm_area_struct **vmap)
>  {
> -       return hugepage_vma_revalidate_pmd_count(mm, address, 1, vmap);
> +       return hugepage_vma_revalidate_pmd_count(mm, address, 1,
> +                       vm_flags_ignore, vmap);
>  }
>
>  /*
> @@ -1043,7 +1049,7 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
>                 /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
>                 if (ret & VM_FAULT_RETRY) {
>                         mmap_read_lock(mm);
> -                       if (hugepage_vma_revalidate(mm, haddr, &vma)) {
> +                       if (hugepage_vma_revalidate(mm, haddr, VM_NONE, &vma)) {
>                                 /* vma is no longer available, don't continue to swapin */
>                                 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
>                                 return false;
> @@ -1200,7 +1206,7 @@ static void collapse_huge_page(struct mm_struct *mm,
>         count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
>
>         mmap_read_lock(mm);
> -       result = hugepage_vma_revalidate(mm, address, &vma);
> +       result = hugepage_vma_revalidate(mm, address, VM_NONE, &vma);
>         if (result) {
>                 mmap_read_unlock(mm);
>                 goto out_nolock;
> @@ -1232,7 +1238,7 @@ static void collapse_huge_page(struct mm_struct *mm,
>          */
>         mmap_write_lock(mm);
>
> -       result = hugepage_vma_revalidate(mm, address, &vma);
> +       result = hugepage_vma_revalidate(mm, address, VM_NONE, &vma);
>         if (result)
>                 goto out_up_write;
>         /* check if the pmd is still valid */
> --
> 2.35.1.616.g0bdcbb4464-goog
>
Zach O'Keefe March 10, 2022, midnight UTC | #2
> On Tue, Mar 8, 2022 at 1:35 PM Zach O'Keefe <zokeefe@google.com> wrote:
> >
> > In madvise collapse context, we optionally want to be able to ignore
> > advice from MADV_NOHUGEPAGE-marked regions.
>
> Could you please elaborate why this usecase is valid? Typically
> MADV_NOHUGEPAGE is set when the users really don't want to have THP
> for this area. So it doesn't make too much sense to ignore it IMHO.
>

Hey Yang, thanks for taking time to review and comment.

Semantically, the way I see it, is that MADV_NOHUGEPAGE is a way for
the user to say "I don't want hugepages here", so that the kernel
knows not to do so when faulting memory, and khugepaged can stay away.
However, in MADV_COLLAPSE, the user is explicitly requesting this be
backed by hugepages - so presumably that is exactly what they want.

IOW, if the user didn't want this memory to be backed by hugepages,
they wouldn't be MADV_COLLAPSE'ing it. If there was a range of memory
the user wanted collapsed, but that had some sub-areas marked
MADV_NOHUGEPAGE, they could always issue multiple MADV_COLLAPSE
operations around the excluded regions.

In terms of use cases, I don't have a concrete example, but a user
could hypothetically choose to exclude regions from management by
khugepaged, but still be able to collapse the memory themselves,
when/if they deem appropriate.

> >
> > Add a vm_flags_ignore argument to hugepage_vma_revalidate_pmd_count()
> > which can be used to ignore vm flags used when considering thp
> > eligibility.
> >
> > Signed-off-by: Zach O'Keefe <zokeefe@google.com>
> > ---
> >  mm/khugepaged.c | 18 ++++++++++++------
> >  1 file changed, 12 insertions(+), 6 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 1d20be47bcea..ecbd3fc41c80 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -964,10 +964,14 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
> >  #endif
> >
> >  /*
> > - * Revalidate a vma's eligibility to collapse nr hugepages.
> > + * Revalidate a vma's eligibility to collapse nr hugepages. vm_flags_ignore
> > + * can be used to ignore certain vma_flags that would otherwise be checked -
> > + * the principal example being VM_NOHUGEPAGE which is ignored in madvise
> > + * collapse context.
> >   */
> >  static int hugepage_vma_revalidate_pmd_count(struct mm_struct *mm,
> >                                              unsigned long address, int nr,
> > +                                            unsigned long vm_flags_ignore,
> >                                              struct vm_area_struct **vmap)
> >  {
> >         struct vm_area_struct *vma;
> > @@ -986,7 +990,7 @@ static int hugepage_vma_revalidate_pmd_count(struct mm_struct *mm,
> >         hend = vma->vm_end & HPAGE_PMD_MASK;
> >         if (address < hstart || (address + nr * HPAGE_PMD_SIZE) > hend)
> >                 return SCAN_ADDRESS_RANGE;
> > -       if (!hugepage_vma_check(vma, vma->vm_flags))
> > +       if (!hugepage_vma_check(vma, vma->vm_flags & ~vm_flags_ignore))
> >                 return SCAN_VMA_CHECK;
> >         /* Anon VMA expected */
> >         if (!vma->anon_vma || vma->vm_ops)
> > @@ -1000,9 +1004,11 @@ static int hugepage_vma_revalidate_pmd_count(struct mm_struct *mm,
> >   */
> >
> >  static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
> > +                                  unsigned long vm_flags_ignore,
> >                                    struct vm_area_struct **vmap)
> >  {
> > -       return hugepage_vma_revalidate_pmd_count(mm, address, 1, vmap);
> > +       return hugepage_vma_revalidate_pmd_count(mm, address, 1,
> > +                       vm_flags_ignore, vmap);
> >  }
> >
> >  /*
> > @@ -1043,7 +1049,7 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
> >                 /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
> >                 if (ret & VM_FAULT_RETRY) {
> >                         mmap_read_lock(mm);
> > -                       if (hugepage_vma_revalidate(mm, haddr, &vma)) {
> > +                       if (hugepage_vma_revalidate(mm, haddr, VM_NONE, &vma)) {
> >                                 /* vma is no longer available, don't continue to swapin */
> >                                 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
> >                                 return false;
> > @@ -1200,7 +1206,7 @@ static void collapse_huge_page(struct mm_struct *mm,
> >         count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
> >
> >         mmap_read_lock(mm);
> > -       result = hugepage_vma_revalidate(mm, address, &vma);
> > +       result = hugepage_vma_revalidate(mm, address, VM_NONE, &vma);
> >         if (result) {
> >                 mmap_read_unlock(mm);
> >                 goto out_nolock;
> > @@ -1232,7 +1238,7 @@ static void collapse_huge_page(struct mm_struct *mm,
> >          */
> >         mmap_write_lock(mm);
> >
> > -       result = hugepage_vma_revalidate(mm, address, &vma);
> > +       result = hugepage_vma_revalidate(mm, address, VM_NONE, &vma);
> >         if (result)
> >                 goto out_up_write;
> >         /* check if the pmd is still valid */
> > --
> > 2.35.1.616.g0bdcbb4464-goog
> >
Yang Shi March 10, 2022, 12:41 a.m. UTC | #3
On Wed, Mar 9, 2022 at 4:01 PM Zach O'Keefe <zokeefe@google.com> wrote:
>
> > On Tue, Mar 8, 2022 at 1:35 PM Zach O'Keefe <zokeefe@google.com> wrote:
> > >
> > > In madvise collapse context, we optionally want to be able to ignore
> > > advice from MADV_NOHUGEPAGE-marked regions.
> >
> > Could you please elaborate why this usecase is valid? Typically
> > MADV_NOHUGEPAGE is set when the users really don't want to have THP
> > for this area. So it doesn't make too much sense to ignore it IMHO.
> >
>
> Hey Yang, thanks for taking time to review and comment.
>
> Semantically, the way I see it, is that MADV_NOHUGEPAGE is a way for
> the user to say "I don't want hugepages here", so that the kernel
> knows not to do so when faulting memory, and khugepaged can stay away.
> However, in MADV_COLLAPSE, the user is explicitly requesting this be
> backed by hugepages - so presumably that is exactly what they want.
>
> IOW, if the user didn't want this memory to be backed by hugepages,
> they wouldn't be MADV_COLLAPSE'ing it. If there was a range of memory
> the user wanted collapsed, but that had some sub-areas marked
> MADV_NOHUGEPAGE, they could always issue multiple MADV_COLLAPSE
> operations around the excluded regions.
>
> In terms of use cases, I don't have a concrete example, but a user
> could hypothetically choose to exclude regions from management from
> khugepaged, but still be able to collapse the memory themselves,
> when/if they deem appropriate.

I see. It seems you thought MADV_COLLAPSE actually unsets
VM_NOHUGEPAGE, and is kind of equal to MADV_HUGEPAGE + doing collapse
right away, right? To some degree, it makes some sense. If this is the
behavior you'd like to achieve, I'd suggest making it more explicit,
for example, setting VM_HUGEPAGE for the MADV_COLLAPSE area rather
than ignore or change vm flags silently. When using madvise mode, but
not having VM_HUGEPAGE set, the vma check should fail in the current
code (I didn't look hard if you already covered this or not).

>
> > >
> > > Add a vm_flags_ignore argument to hugepage_vma_revalidate_pmd_count()
> > > which can be used to ignore vm flags used when considering thp
> > > eligibility.
> > >
> > > Signed-off-by: Zach O'Keefe <zokeefe@google.com>
> > > ---
> > >  mm/khugepaged.c | 18 ++++++++++++------
> > >  1 file changed, 12 insertions(+), 6 deletions(-)
> > >
> > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > index 1d20be47bcea..ecbd3fc41c80 100644
> > > --- a/mm/khugepaged.c
> > > +++ b/mm/khugepaged.c
> > > @@ -964,10 +964,14 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
> > >  #endif
> > >
> > >  /*
> > > - * Revalidate a vma's eligibility to collapse nr hugepages.
> > > + * Revalidate a vma's eligibility to collapse nr hugepages. vm_flags_ignore
> > > + * can be used to ignore certain vma_flags that would otherwise be checked -
> > > + * the principal example being VM_NOHUGEPAGE which is ignored in madvise
> > > + * collapse context.
> > >   */
> > >  static int hugepage_vma_revalidate_pmd_count(struct mm_struct *mm,
> > >                                              unsigned long address, int nr,
> > > +                                            unsigned long vm_flags_ignore,
> > >                                              struct vm_area_struct **vmap)
> > >  {
> > >         struct vm_area_struct *vma;
> > > @@ -986,7 +990,7 @@ static int hugepage_vma_revalidate_pmd_count(struct mm_struct *mm,
> > >         hend = vma->vm_end & HPAGE_PMD_MASK;
> > >         if (address < hstart || (address + nr * HPAGE_PMD_SIZE) > hend)
> > >                 return SCAN_ADDRESS_RANGE;
> > > -       if (!hugepage_vma_check(vma, vma->vm_flags))
> > > +       if (!hugepage_vma_check(vma, vma->vm_flags & ~vm_flags_ignore))
> > >                 return SCAN_VMA_CHECK;
> > >         /* Anon VMA expected */
> > >         if (!vma->anon_vma || vma->vm_ops)
> > > @@ -1000,9 +1004,11 @@ static int hugepage_vma_revalidate_pmd_count(struct mm_struct *mm,
> > >   */
> > >
> > >  static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
> > > +                                  unsigned long vm_flags_ignore,
> > >                                    struct vm_area_struct **vmap)
> > >  {
> > > -       return hugepage_vma_revalidate_pmd_count(mm, address, 1, vmap);
> > > +       return hugepage_vma_revalidate_pmd_count(mm, address, 1,
> > > +                       vm_flags_ignore, vmap);
> > >  }
> > >
> > >  /*
> > > @@ -1043,7 +1049,7 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
> > >                 /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
> > >                 if (ret & VM_FAULT_RETRY) {
> > >                         mmap_read_lock(mm);
> > > -                       if (hugepage_vma_revalidate(mm, haddr, &vma)) {
> > > +                       if (hugepage_vma_revalidate(mm, haddr, VM_NONE, &vma)) {
> > >                                 /* vma is no longer available, don't continue to swapin */
> > >                                 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
> > >                                 return false;
> > > @@ -1200,7 +1206,7 @@ static void collapse_huge_page(struct mm_struct *mm,
> > >         count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
> > >
> > >         mmap_read_lock(mm);
> > > -       result = hugepage_vma_revalidate(mm, address, &vma);
> > > +       result = hugepage_vma_revalidate(mm, address, VM_NONE, &vma);
> > >         if (result) {
> > >                 mmap_read_unlock(mm);
> > >                 goto out_nolock;
> > > @@ -1232,7 +1238,7 @@ static void collapse_huge_page(struct mm_struct *mm,
> > >          */
> > >         mmap_write_lock(mm);
> > >
> > > -       result = hugepage_vma_revalidate(mm, address, &vma);
> > > +       result = hugepage_vma_revalidate(mm, address, VM_NONE, &vma);
> > >         if (result)
> > >                 goto out_up_write;
> > >         /* check if the pmd is still valid */
> > > --
> > > 2.35.1.616.g0bdcbb4464-goog
> > >
Zach O'Keefe March 10, 2022, 1:09 a.m. UTC | #4
On Wed, Mar 9, 2022 at 4:41 PM Yang Shi <shy828301@gmail.com> wrote:
>
> On Wed, Mar 9, 2022 at 4:01 PM Zach O'Keefe <zokeefe@google.com> wrote:
> >
> > > On Tue, Mar 8, 2022 at 1:35 PM Zach O'Keefe <zokeefe@google.com> wrote:
> > > >
> > > > In madvise collapse context, we optionally want to be able to ignore
> > > > advice from MADV_NOHUGEPAGE-marked regions.
> > >
> > > Could you please elaborate why this usecase is valid? Typically
> > > MADV_NOHUGEPAGE is set when the users really don't want to have THP
> > > for this area. So it doesn't make too much sense to ignore it IMHO.
> > >
> >
> > Hey Yang, thanks for taking time to review and comment.
> >
> > Semantically, the way I see it, is that MADV_NOHUGEPAGE is a way for
> > the user to say "I don't want hugepages here", so that the kernel
> > knows not to do so when faulting memory, and khugepaged can stay away.
> > However, in MADV_COLLAPSE, the user is explicitly requesting this be
> > backed by hugepages - so presumably that is exactly what they want.
> >
> > IOW, if the user didn't want this memory to be backed by hugepages,
> > they wouldn't be MADV_COLLAPSE'ing it. If there was a range of memory
> > the user wanted collapsed, but that had some sub-areas marked
> > MADV_NOHUGEPAGE, they could always issue multiple MADV_COLLAPSE
> > operations around the excluded regions.
> >
> > In terms of use cases, I don't have a concrete example, but a user
> > could hypothetically choose to exclude regions from management from
> > khugepaged, but still be able to collapse the memory themselves,
> > when/if they deem appropriate.
>
> I see. It seems you thought MADV_COLLAPSE actually unsets
> VM_NOHUGEPAGE, and is kind of equal to MADV_HUGEPAGE + doing collapse
> right away, right? To some degree, it makes some sense.

Currently, MADV_COLLAPSE doesn't alter the vma flags at all - it just
ignores VM_NOHUGEPAGE, and so it's not really the same as
MADV_HUGEPAGE + MADV_COLLAPSE (which would set VM_HUGEPAGE in addition
to clearing VM_NOHUGEPAGE). If my use case has any merit (and I'm not
sure it does) then we don't want to be altering the vma flags since we
don't want to touch khugepaged behavior.

> If this is the
> behavior you'd like to achieve, I'd suggest making it more explicit,
> for example, setting VM_HUGEPAGE for the MADV_COLLAPSE area rather
> than ignore or change vm flags silently. When using madvise mode, but
> not having VM_HUGEPAGE set, the vma check should fail in the current
> code (I didn't look hard if you already covered this or not).
>

You're correct, this will fail, since it's following the same
semantics as the fault path. I see what you're saying though; that
perhaps this is inconsistent with my above reasoning that "the user
asked to collapse this memory, and so we should do it". If so, then
perhaps MADV_COLLAPSE just ignores madvise mode and VM_[NO]HUGEPAGE
entirely for the purposes of eligibility, and only uses it for the
purposes of determining gfp flags for compaction/reclaim. Pushing that
further, compaction/reclaim could entirely be specified by the user
using a process_madvise(2) flag (later in the series, we do something
like this).


> >
> > > >
> > > > Add a vm_flags_ignore argument to hugepage_vma_revalidate_pmd_count()
> > > > which can be used to ignore vm flags used when considering thp
> > > > eligibility.
> > > >
> > > > Signed-off-by: Zach O'Keefe <zokeefe@google.com>
> > > > ---
> > > >  mm/khugepaged.c | 18 ++++++++++++------
> > > >  1 file changed, 12 insertions(+), 6 deletions(-)
> > > >
> > > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > > index 1d20be47bcea..ecbd3fc41c80 100644
> > > > --- a/mm/khugepaged.c
> > > > +++ b/mm/khugepaged.c
> > > > @@ -964,10 +964,14 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
> > > >  #endif
> > > >
> > > >  /*
> > > > - * Revalidate a vma's eligibility to collapse nr hugepages.
> > > > + * Revalidate a vma's eligibility to collapse nr hugepages. vm_flags_ignore
> > > > + * can be used to ignore certain vma_flags that would otherwise be checked -
> > > > + * the principal example being VM_NOHUGEPAGE which is ignored in madvise
> > > > + * collapse context.
> > > >   */
> > > >  static int hugepage_vma_revalidate_pmd_count(struct mm_struct *mm,
> > > >                                              unsigned long address, int nr,
> > > > +                                            unsigned long vm_flags_ignore,
> > > >                                              struct vm_area_struct **vmap)
> > > >  {
> > > >         struct vm_area_struct *vma;
> > > > @@ -986,7 +990,7 @@ static int hugepage_vma_revalidate_pmd_count(struct mm_struct *mm,
> > > >         hend = vma->vm_end & HPAGE_PMD_MASK;
> > > >         if (address < hstart || (address + nr * HPAGE_PMD_SIZE) > hend)
> > > >                 return SCAN_ADDRESS_RANGE;
> > > > -       if (!hugepage_vma_check(vma, vma->vm_flags))
> > > > +       if (!hugepage_vma_check(vma, vma->vm_flags & ~vm_flags_ignore))
> > > >                 return SCAN_VMA_CHECK;
> > > >         /* Anon VMA expected */
> > > >         if (!vma->anon_vma || vma->vm_ops)
> > > > @@ -1000,9 +1004,11 @@ static int hugepage_vma_revalidate_pmd_count(struct mm_struct *mm,
> > > >   */
> > > >
> > > >  static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
> > > > +                                  unsigned long vm_flags_ignore,
> > > >                                    struct vm_area_struct **vmap)
> > > >  {
> > > > -       return hugepage_vma_revalidate_pmd_count(mm, address, 1, vmap);
> > > > +       return hugepage_vma_revalidate_pmd_count(mm, address, 1,
> > > > +                       vm_flags_ignore, vmap);
> > > >  }
> > > >
> > > >  /*
> > > > @@ -1043,7 +1049,7 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
> > > >                 /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
> > > >                 if (ret & VM_FAULT_RETRY) {
> > > >                         mmap_read_lock(mm);
> > > > -                       if (hugepage_vma_revalidate(mm, haddr, &vma)) {
> > > > +                       if (hugepage_vma_revalidate(mm, haddr, VM_NONE, &vma)) {
> > > >                                 /* vma is no longer available, don't continue to swapin */
> > > >                                 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
> > > >                                 return false;
> > > > @@ -1200,7 +1206,7 @@ static void collapse_huge_page(struct mm_struct *mm,
> > > >         count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
> > > >
> > > >         mmap_read_lock(mm);
> > > > -       result = hugepage_vma_revalidate(mm, address, &vma);
> > > > +       result = hugepage_vma_revalidate(mm, address, VM_NONE, &vma);
> > > >         if (result) {
> > > >                 mmap_read_unlock(mm);
> > > >                 goto out_nolock;
> > > > @@ -1232,7 +1238,7 @@ static void collapse_huge_page(struct mm_struct *mm,
> > > >          */
> > > >         mmap_write_lock(mm);
> > > >
> > > > -       result = hugepage_vma_revalidate(mm, address, &vma);
> > > > +       result = hugepage_vma_revalidate(mm, address, VM_NONE, &vma);
> > > >         if (result)
> > > >                 goto out_up_write;
> > > >         /* check if the pmd is still valid */
> > > > --
> > > > 2.35.1.616.g0bdcbb4464-goog
> > > >
Yang Shi March 10, 2022, 2:16 a.m. UTC | #5
On Wed, Mar 9, 2022 at 5:10 PM Zach O'Keefe <zokeefe@google.com> wrote:
>
> On Wed, Mar 9, 2022 at 4:41 PM Yang Shi <shy828301@gmail.com> wrote:
> >
> > On Wed, Mar 9, 2022 at 4:01 PM Zach O'Keefe <zokeefe@google.com> wrote:
> > >
> > > > On Tue, Mar 8, 2022 at 1:35 PM Zach O'Keefe <zokeefe@google.com> wrote:
> > > > >
> > > > > In madvise collapse context, we optionally want to be able to ignore
> > > > > advice from MADV_NOHUGEPAGE-marked regions.
> > > >
> > > > Could you please elaborate why this usecase is valid? Typically
> > > > MADV_NOHUGEPAGE is set when the users really don't want to have THP
> > > > for this area. So it doesn't make too much sense to ignore it IMHO.
> > > >
> > >
> > > Hey Yang, thanks for taking time to review and comment.
> > >
> > > Semantically, the way I see it, is that MADV_NOHUGEPAGE is a way for
> > > the user to say "I don't want hugepages here", so that the kernel
> > > knows not to do so when faulting memory, and khugepaged can stay away.
> > > However, in MADV_COLLAPSE, the user is explicitly requesting this be
> > > backed by hugepages - so presumably that is exactly what they want.
> > >
> > > IOW, if the user didn't want this memory to be backed by hugepages,
> > > they wouldn't be MADV_COLLAPSE'ing it. If there was a range of memory
> > > the user wanted collapsed, but that had some sub-areas marked
> > > MADV_NOHUGEPAGE, they could always issue multiple MADV_COLLAPSE
> > > operations around the excluded regions.
> > >
> > > In terms of use cases, I don't have a concrete example, but a user
> > > could hypothetically choose to exclude regions from management from
> > > khugepaged, but still be able to collapse the memory themselves,
> > > when/if they deem appropriate.
> >
> > I see. It seems you thought MADV_COLLAPSE actually unsets
> > VM_NOHUGEPAGE, and is kind of equal to MADV_HUGEPAGE + doing collapse
> > right away, right? To some degree, it makes some sense.
>
> Currently, MADV_COLLAPSE doesn't alter the vma flags at all - it just
> ignores VM_NOHUGEPAGE, and so it's not really the same as
> MADV_HUGEPAGE + MADV_COLLAPSE (which would set VM_HUGEPAGE in addition
> to clearing VM_NOHUGEPAGE). If my use case has any merit (and I'm not
> sure it does) then we don't want to be altering the vma flags since we
> don't want to touch khugepaged behavior.
>
> > If this is the
> > behavior you'd like to achieve, I'd suggest making it more explicit,
> > for example, setting VM_HUGEPAGE for the MADV_COLLAPSE area rather
> > than ignore or change vm flags silently. When using madvise mode, but
> > not having VM_HUGEPAGE set, the vma check should fail in the current
> > code (I didn't look hard if you already covered this or not).
> >
>
> You're correct, this will fail, since it's following the same
> semantics as the fault path. I see what you're saying though; that
> perhaps this is inconsistent with my above reasoning that "the user
> asked to collapse this memory, and so we should do it". If so, then
> perhaps MADV_COLLAPSE just ignores madvise mode and VM_[NO]HUGEPAGE
> entirely for the purposes of eligibility, and only uses it for the
> purposes of determining gfp flags for compaction/reclaim. Pushing that
> further, compaction/reclaim could entirely be specified by the user
> using a process_madvise(2) flag (later in the series, we do something
> like this).

Anyway I think we could have two options for MADV_COLLAPSE:

1. Just treat it as a hint (nice to have, best effort). It should obey
all the settings. Skip VM_NOHUGEPAGE vmas or vmas without VM_HUGEPAGE
if madvise mode, etc.

2. Much stronger. It equals MADV_HUGEPAGE + synchronous collapse. It
should set vma flags properly as I suggested.

Either is fine to me. But I don't prefer something in between personally.

>
>
> > >
> > > > >
> > > > > Add a vm_flags_ignore argument to hugepage_vma_revalidate_pmd_count()
> > > > > which can be used to ignore vm flags used when considering thp
> > > > > eligibility.
> > > > >
> > > > > Signed-off-by: Zach O'Keefe <zokeefe@google.com>
> > > > > ---
> > > > >  mm/khugepaged.c | 18 ++++++++++++------
> > > > >  1 file changed, 12 insertions(+), 6 deletions(-)
> > > > >
> > > > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > > > index 1d20be47bcea..ecbd3fc41c80 100644
> > > > > --- a/mm/khugepaged.c
> > > > > +++ b/mm/khugepaged.c
> > > > > @@ -964,10 +964,14 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
> > > > >  #endif
> > > > >
> > > > >  /*
> > > > > - * Revalidate a vma's eligibility to collapse nr hugepages.
> > > > > + * Revalidate a vma's eligibility to collapse nr hugepages. vm_flags_ignore
> > > > > + * can be used to ignore certain vma_flags that would otherwise be checked -
> > > > > + * the principal example being VM_NOHUGEPAGE which is ignored in madvise
> > > > > + * collapse context.
> > > > >   */
> > > > >  static int hugepage_vma_revalidate_pmd_count(struct mm_struct *mm,
> > > > >                                              unsigned long address, int nr,
> > > > > +                                            unsigned long vm_flags_ignore,
> > > > >                                              struct vm_area_struct **vmap)
> > > > >  {
> > > > >         struct vm_area_struct *vma;
> > > > > @@ -986,7 +990,7 @@ static int hugepage_vma_revalidate_pmd_count(struct mm_struct *mm,
> > > > >         hend = vma->vm_end & HPAGE_PMD_MASK;
> > > > >         if (address < hstart || (address + nr * HPAGE_PMD_SIZE) > hend)
> > > > >                 return SCAN_ADDRESS_RANGE;
> > > > > -       if (!hugepage_vma_check(vma, vma->vm_flags))
> > > > > +       if (!hugepage_vma_check(vma, vma->vm_flags & ~vm_flags_ignore))
> > > > >                 return SCAN_VMA_CHECK;
> > > > >         /* Anon VMA expected */
> > > > >         if (!vma->anon_vma || vma->vm_ops)
> > > > > @@ -1000,9 +1004,11 @@ static int hugepage_vma_revalidate_pmd_count(struct mm_struct *mm,
> > > > >   */
> > > > >
> > > > >  static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
> > > > > +                                  unsigned long vm_flags_ignore,
> > > > >                                    struct vm_area_struct **vmap)
> > > > >  {
> > > > > -       return hugepage_vma_revalidate_pmd_count(mm, address, 1, vmap);
> > > > > +       return hugepage_vma_revalidate_pmd_count(mm, address, 1,
> > > > > +                       vm_flags_ignore, vmap);
> > > > >  }
> > > > >
> > > > >  /*
> > > > > @@ -1043,7 +1049,7 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
> > > > >                 /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
> > > > >                 if (ret & VM_FAULT_RETRY) {
> > > > >                         mmap_read_lock(mm);
> > > > > -                       if (hugepage_vma_revalidate(mm, haddr, &vma)) {
> > > > > +                       if (hugepage_vma_revalidate(mm, haddr, VM_NONE, &vma)) {
> > > > >                                 /* vma is no longer available, don't continue to swapin */
> > > > >                                 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
> > > > >                                 return false;
> > > > > @@ -1200,7 +1206,7 @@ static void collapse_huge_page(struct mm_struct *mm,
> > > > >         count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
> > > > >
> > > > >         mmap_read_lock(mm);
> > > > > -       result = hugepage_vma_revalidate(mm, address, &vma);
> > > > > +       result = hugepage_vma_revalidate(mm, address, VM_NONE, &vma);
> > > > >         if (result) {
> > > > >                 mmap_read_unlock(mm);
> > > > >                 goto out_nolock;
> > > > > @@ -1232,7 +1238,7 @@ static void collapse_huge_page(struct mm_struct *mm,
> > > > >          */
> > > > >         mmap_write_lock(mm);
> > > > >
> > > > > -       result = hugepage_vma_revalidate(mm, address, &vma);
> > > > > +       result = hugepage_vma_revalidate(mm, address, VM_NONE, &vma);
> > > > >         if (result)
> > > > >                 goto out_up_write;
> > > > >         /* check if the pmd is still valid */
> > > > > --
> > > > > 2.35.1.616.g0bdcbb4464-goog
> > > > >
Zach O'Keefe March 10, 2022, 3:50 p.m. UTC | #6
On Wed, Mar 9, 2022 at 6:16 PM Yang Shi <shy828301@gmail.com> wrote:
>
> On Wed, Mar 9, 2022 at 5:10 PM Zach O'Keefe <zokeefe@google.com> wrote:
> >
> > On Wed, Mar 9, 2022 at 4:41 PM Yang Shi <shy828301@gmail.com> wrote:
> > >
> > > On Wed, Mar 9, 2022 at 4:01 PM Zach O'Keefe <zokeefe@google.com> wrote:
> > > >
> > > > > On Tue, Mar 8, 2022 at 1:35 PM Zach O'Keefe <zokeefe@google.com> wrote:
> > > > > >
> > > > > > In madvise collapse context, we optionally want to be able to ignore
> > > > > > advice from MADV_NOHUGEPAGE-marked regions.
> > > > >
> > > > > Could you please elaborate why this usecase is valid? Typically
> > > > > MADV_NOHUGEPAGE is set when the users really don't want to have THP
> > > > > for this area. So it doesn't make too much sense to ignore it IMHO.
> > > > >
> > > >
> > > > Hey Yang, thanks for taking time to review and comment.
> > > >
> > > > Semantically, the way I see it, is that MADV_NOHUGEPAGE is a way for
> > > > the user to say "I don't want hugepages here", so that the kernel
> > > > knows not to do so when faulting memory, and khugepaged can stay away.
> > > > However, in MADV_COLLAPSE, the user is explicitly requesting this be
> > > > backed by hugepages - so presumably that is exactly what they want.
> > > >
> > > > IOW, if the user didn't want this memory to be backed by hugepages,
> > > > they wouldn't be MADV_COLLAPSE'ing it. If there was a range of memory
> > > > the user wanted collapsed, but that had some sub-areas marked
> > > > MADV_NOHUGEPAGE, they could always issue multiple MADV_COLLAPSE
> > > > operations around the excluded regions.
> > > >
> > > > In terms of use cases, I don't have a concrete example, but a user
> > > > could hypothetically choose to exclude regions from management from
> > > > khugepaged, but still be able to collapse the memory themselves,
> > > > when/if they deem appropriate.
> > >
> > > I see. It seems you thought MADV_COLLAPSE actually unsets
> > > VM_NOHUGEPAGE, and is kind of equal to MADV_HUGEPAGE + doing collapse
> > > right away, right? To some degree, it makes some sense.
> >
> > Currently, MADV_COLLAPSE doesn't alter the vma flags at all - it just
> > ignores VM_NOHUGEPAGE, and so it's not really the same as
> > MADV_HUGEPAGE + MADV_COLLAPSE (which would set VM_HUGEPAGE in addition
> > to clearing VM_NOHUGEPAGE). If my use case has any merit (and I'm not
> > sure it does) then we don't want to be altering the vma flags since we
> > don't want to touch khugepaged behavior.
> >
> > > If this is the
> > > behavior you'd like to achieve, I'd suggest making it more explicit,
> > > for example, setting VM_HUGEPAGE for the MADV_COLLAPSE area rather
> > > than ignore or change vm flags silently. When using madvise mode, but
> > > not having VM_HUGEPAGE set, the vma check should fail in the current
> > > code (I didn't look hard if you already covered this or not).
> > >
> >
> > You're correct, this will fail, since it's following the same
> > semantics as the fault path. I see what you're saying though; that
> > perhaps this is inconsistent with my above reasoning that "the user
> > asked to collapse this memory, and so we should do it". If so, then
> > perhaps MADV_COLLAPSE just ignores madise mode and VM_[NO]HUGEPAGE
> > entirely for the purposes of eligibility, and only uses it for the
> > purposes of determining gfp flags for compaction/reclaim. Pushing that
> > further, compaction/reclaim could entirely be specified by the user
> > using a process_madvise(2) flag (later in the series, we do something
> > like this).
>
> Anyway I think we could have two options for MADV_COLLAPSE:
>
> 1. Just treat it as a hint (nice to have, best effort). It should obey
> all the settings. Skip VM_NOHUGEPAGE vmas or vmas without VM_HUGEPAGE
> if madvise mode, etc.
>
> 2. Much stronger. It equals MADV_HUGEPAGE + synchronous collapse. It
> should set vma flags properly as I suggested.
>
> Either is fine to me. But I don't prefer something in between personally.
>

Makes sense to be consistent. Of these, #1 seems the most
straightforward to use. Doing an MADV_COLLAPSE on a VM_NOHUGEPAGE vma
seems like a corner case. The more likely scenario is MADV_COLLAPSE on
an unflagged (neither VM_HUGEPAGE or VM_NOHUGEPAGE) vma - in which
case it's less intrusive to not additionally set VM_HUGEPAGE (though
the user can always do so if they wish). It's a little more consistent
with "always" mode, where MADV_HUGEPAGE isn't necessary for
eligibility. It'll also reduce some code complexity.

I'll float one last option your way, however:

3. The collapsed region is always eligible, regardless of vma flags or
thp settings (except "never"?). For process_madvise(2), a flag will
explicitly specify defrag semantics.

This separates "async-hint" vs "sync-explicit" madvise requests.
MADV_[NO]HUGEPAGE are hints, and together with thp settings, advise
the kernel how to treat memory in the future. The kernel uses
VM_[NO]HUGEPAGE to aid with this. MADV_COLLAPSE, as an explicit
request, is free to define its own defrag semantics.

This would allow flexibility to separately define async vs sync thp
policies. For example, highly tuned userspace applications that are
sensitive to unexpected latency might want to manage their hugepages
utilization themselves, and ask khugepaged to stay away. There is no
way in "always" mode to do this without setting VM_NOHUGEPAGE.

> >
> >
> > > >
> > > > > >
> > > > > > Add a vm_flags_ignore argument to hugepage_vma_revalidate_pmd_count()
> > > > > > which can be used to ignore vm flags used when considering thp
> > > > > > eligibility.
> > > > > >
> > > > > > Signed-off-by: Zach O'Keefe <zokeefe@google.com>
> > > > > > ---
> > > > > >  mm/khugepaged.c | 18 ++++++++++++------
> > > > > >  1 file changed, 12 insertions(+), 6 deletions(-)
> > > > > >
> > > > > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > > > > index 1d20be47bcea..ecbd3fc41c80 100644
> > > > > > --- a/mm/khugepaged.c
> > > > > > +++ b/mm/khugepaged.c
> > > > > > @@ -964,10 +964,14 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
> > > > > >  #endif
> > > > > >
> > > > > >  /*
> > > > > > - * Revalidate a vma's eligibility to collapse nr hugepages.
> > > > > > + * Revalidate a vma's eligibility to collapse nr hugepages. vm_flags_ignore
> > > > > > + * can be used to ignore certain vma_flags that would otherwise be checked -
> > > > > > + * the principal example being VM_NOHUGEPAGE which is ignored in madvise
> > > > > > + * collapse context.
> > > > > >   */
> > > > > >  static int hugepage_vma_revalidate_pmd_count(struct mm_struct *mm,
> > > > > >                                              unsigned long address, int nr,
> > > > > > +                                            unsigned long vm_flags_ignore,
> > > > > >                                              struct vm_area_struct **vmap)
> > > > > >  {
> > > > > >         struct vm_area_struct *vma;
> > > > > > @@ -986,7 +990,7 @@ static int hugepage_vma_revalidate_pmd_count(struct mm_struct *mm,
> > > > > >         hend = vma->vm_end & HPAGE_PMD_MASK;
> > > > > >         if (address < hstart || (address + nr * HPAGE_PMD_SIZE) > hend)
> > > > > >                 return SCAN_ADDRESS_RANGE;
> > > > > > -       if (!hugepage_vma_check(vma, vma->vm_flags))
> > > > > > +       if (!hugepage_vma_check(vma, vma->vm_flags & ~vm_flags_ignore))
> > > > > >                 return SCAN_VMA_CHECK;
> > > > > >         /* Anon VMA expected */
> > > > > >         if (!vma->anon_vma || vma->vm_ops)
> > > > > > @@ -1000,9 +1004,11 @@ static int hugepage_vma_revalidate_pmd_count(struct mm_struct *mm,
> > > > > >   */
> > > > > >
> > > > > >  static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
> > > > > > +                                  unsigned long vm_flags_ignore,
> > > > > >                                    struct vm_area_struct **vmap)
> > > > > >  {
> > > > > > -       return hugepage_vma_revalidate_pmd_count(mm, address, 1, vmap);
> > > > > > +       return hugepage_vma_revalidate_pmd_count(mm, address, 1,
> > > > > > +                       vm_flags_ignore, vmap);
> > > > > >  }
> > > > > >
> > > > > >  /*
> > > > > > @@ -1043,7 +1049,7 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
> > > > > >                 /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
> > > > > >                 if (ret & VM_FAULT_RETRY) {
> > > > > >                         mmap_read_lock(mm);
> > > > > > -                       if (hugepage_vma_revalidate(mm, haddr, &vma)) {
> > > > > > +                       if (hugepage_vma_revalidate(mm, haddr, VM_NONE, &vma)) {
> > > > > >                                 /* vma is no longer available, don't continue to swapin */
> > > > > >                                 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
> > > > > >                                 return false;
> > > > > > @@ -1200,7 +1206,7 @@ static void collapse_huge_page(struct mm_struct *mm,
> > > > > >         count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
> > > > > >
> > > > > >         mmap_read_lock(mm);
> > > > > > -       result = hugepage_vma_revalidate(mm, address, &vma);
> > > > > > +       result = hugepage_vma_revalidate(mm, address, VM_NONE, &vma);
> > > > > >         if (result) {
> > > > > >                 mmap_read_unlock(mm);
> > > > > >                 goto out_nolock;
> > > > > > @@ -1232,7 +1238,7 @@ static void collapse_huge_page(struct mm_struct *mm,
> > > > > >          */
> > > > > >         mmap_write_lock(mm);
> > > > > >
> > > > > > -       result = hugepage_vma_revalidate(mm, address, &vma);
> > > > > > +       result = hugepage_vma_revalidate(mm, address, VM_NONE, &vma);
> > > > > >         if (result)
> > > > > >                 goto out_up_write;
> > > > > >         /* check if the pmd is still valid */
> > > > > > --
> > > > > > 2.35.1.616.g0bdcbb4464-goog
> > > > > >
David Hildenbrand March 10, 2022, 3:56 p.m. UTC | #7
On 08.03.22 22:34, Zach O'Keefe wrote:
> In madvise collapse context, we optionally want to be able to ignore
> advice from MADV_NOHUGEPAGE-marked regions.
> 
> Add a vm_flags_ignore argument to hugepage_vma_revalidate_pmd_count()
> which can be used to ignore vm flags used when considering thp
> eligibility.

arch/s390/mm/gmap.c:thp_split_mm() sets VM_NOHUGEPAGE to make sure there
are *really* no thp. Being able to bypass that would break KVM horribly.

Ignoring MADV_NOHUGEPAGE/VM_NOHUGEPAGE feels like the wrong way to go.


What about a prctl instead, to disable any khugepaged activity and just
let that process control it manually?
Yang Shi March 10, 2022, 6:17 p.m. UTC | #8
On Thu, Mar 10, 2022 at 7:51 AM Zach O'Keefe <zokeefe@google.com> wrote:
>
> On Wed, Mar 9, 2022 at 6:16 PM Yang Shi <shy828301@gmail.com> wrote:
> >
> > On Wed, Mar 9, 2022 at 5:10 PM Zach O'Keefe <zokeefe@google.com> wrote:
> > >
> > > On Wed, Mar 9, 2022 at 4:41 PM Yang Shi <shy828301@gmail.com> wrote:
> > > >
> > > > On Wed, Mar 9, 2022 at 4:01 PM Zach O'Keefe <zokeefe@google.com> wrote:
> > > > >
> > > > > > On Tue, Mar 8, 2022 at 1:35 PM Zach O'Keefe <zokeefe@google.com> wrote:
> > > > > > >
> > > > > > > In madvise collapse context, we optionally want to be able to ignore
> > > > > > > advice from MADV_NOHUGEPAGE-marked regions.
> > > > > >
> > > > > > Could you please elaborate why this usecase is valid? Typically
> > > > > > MADV_NOHUGEPAGE is set when the users really don't want to have THP
> > > > > > for this area. So it doesn't make too much sense to ignore it IMHO.
> > > > > >
> > > > >
> > > > > Hey Yang, thanks for taking time to review and comment.
> > > > >
> > > > > Semantically, the way I see it, is that MADV_NOHUGEPAGE is a way for
> > > > > the user to say "I don't want hugepages here", so that the kernel
> > > > > knows not to do so when faulting memory, and khugepaged can stay away.
> > > > > However, in MADV_COLLAPSE, the user is explicitly requesting this be
> > > > > backed by hugepages - so presumably that is exactly what they want.
> > > > >
> > > > > IOW, if the user didn't want this memory to be backed by hugepages,
> > > > > they wouldn't be MADV_COLLAPSE'ing it. If there was a range of memory
> > > > > the user wanted collapsed, but that had some sub-areas marked
> > > > > MADV_NOHUGEPAGE, they could always issue multiple MADV_COLLAPSE
> > > > > operations around the excluded regions.
> > > > >
> > > > > In terms of use cases, I don't have a concrete example, but a user
> > > > > could hypothetically choose to exclude regions from management from
> > > > > khugepaged, but still be able to collapse the memory themselves,
> > > > > when/if they deem appropriate.
> > > >
> > > > I see. It seems you thought MADV_COLLAPSE actually unsets
> > > > VM_NOHUGEPAGE, and is kind of equal to MADV_HUGEPAGE + doing collapse
> > > > right away, right? To some degree, it makes some sense.
> > >
> > > Currently, MADV_COLLAPSE doesn't alter the vma flags at all - it just
> > > ignores VM_NOHUGEPAGE, and so it's not really the same as
> > > MADV_HUGEPAGE + MADV_COLLAPSE (which would set VM_HUGEPAGE in addition
> > > to clearing VM_NOHUGEPAGE). If my use case has any merit (and I'm not
> > > sure it does) then we don't want to be altering the vma flags since we
> > > don't want to touch khugepaged behavior.
> > >
> > > > If this is the
> > > > behavior you'd like to achieve, I'd suggest making it more explicit,
> > > > for example, setting VM_HUGEPAGE for the MADV_COLLAPSE area rather
> > > > than ignore or change vm flags silently. When using madvise mode, but
> > > > not having VM_HUGEPAGE set, the vma check should fail in the current
> > > > code (I didn't look hard if you already covered this or not).
> > > >
> > >
> > > You're correct, this will fail, since it's following the same
> > > semantics as the fault path. I see what you're saying though; that
> > > perhaps this is inconsistent with my above reasoning that "the user
> > > asked to collapse this memory, and so we should do it". If so, then
> > > perhaps MADV_COLLAPSE just ignores madise mode and VM_[NO]HUGEPAGE
> > > entirely for the purposes of eligibility, and only uses it for the
> > > purposes of determining gfp flags for compaction/reclaim. Pushing that
> > > further, compaction/reclaim could entirely be specified by the user
> > > using a process_madvise(2) flag (later in the series, we do something
> > > like this).
> >
> > Anyway I think we could have two options for MADV_COLLAPSE:
> >
> > 1. Just treat it as a hint (nice to have, best effort). It should obey
> > all the settings. Skip VM_NOHUGEPAGE vmas or vmas without VM_HUGEPAGE
> > if madvise mode, etc.
> >
> > 2. Much stronger. It equals MADV_HUGEPAGE + synchronous collapse. It
> > should set vma flags properly as I suggested.
> >
> > Either is fine to me. But I don't prefer something in between personally.
> >
>
> Makes sense to be consistent. Of these, #1 seems the most
> straightforward to use. Doing an MADV_COLLAPSE on a VM_NOHUGEPAGE vma
> seems like a corner case. The more likely scenario is MADV_COLLAPSE on
> an unflagged (neither VM_HUGEPAGE or VM_NOHUGEPAGE) vma - in which
> case it's less intrusive to not additionally set VM_HUGEPAGE (though
> the user can always do so if they wish). It's a little more consistent
> with "always" mode, where MADV_HUGEPAGE isn't necessary for
> eligibility. It'll also reduce some code complexity.
>
> I'll float one last option your way, however:
>
> 3. The collapsed region is always eligible, regardless of vma flags or
> thp settings (except "never"?). For process_madvise(2), a flag will
> explicitly specify defrag semantics.

This is what I meant for #2 IIUC. Defrag could follow the system's
defrag setting rather than the khugepaged's.

But it may break s390 as David pointed out.

>
> This separates "async-hint" vs "sync-explicit" madvise requests.
> MADV_[NO]HUGEPAGE are hints, and together with thp settings, advise
> the kernel how to treat memory in the future. The kernel uses
> VM_[NO]HUGEPAGE to aid with this. MADV_COLLAPSE, as an explicit
> request, is free to define its own defrag semantics.
>
> This would allow flexibility to separately define async vs sync thp
> policies. For example, highly tuned userspace applications that are
> sensitive to unexpected latency might want to manage their hugepages
> utilization themselves, and ask khugepaged to stay away. There is no
> way in "always" mode to do this without setting VM_NOHUGEPAGE.

I don't quite get why you set THP to always but don't want
khugepaged to do its job. It may be slow, I think this is why you
introduce MADV_COLLAPSE, right? But it doesn't mean khugepaged can't
scan the same area, it just doesn't do any real work and waste some
cpu cycles. But I guess MADV_COLLAPSE doesn't prevent the PMD/THP from
being split, right? So khugepaged still plays a role to re-collapse
the area without calling MADV_COLLAPSE over again and again.

>
> > >
> > >
> > > > >
> > > > > > >
> > > > > > > Add a vm_flags_ignore argument to hugepage_vma_revalidate_pmd_count()
> > > > > > > which can be used to ignore vm flags used when considering thp
> > > > > > > eligibility.
> > > > > > >
> > > > > > > Signed-off-by: Zach O'Keefe <zokeefe@google.com>
> > > > > > > ---
> > > > > > >  mm/khugepaged.c | 18 ++++++++++++------
> > > > > > >  1 file changed, 12 insertions(+), 6 deletions(-)
> > > > > > >
> > > > > > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > > > > > index 1d20be47bcea..ecbd3fc41c80 100644
> > > > > > > --- a/mm/khugepaged.c
> > > > > > > +++ b/mm/khugepaged.c
> > > > > > > @@ -964,10 +964,14 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
> > > > > > >  #endif
> > > > > > >
> > > > > > >  /*
> > > > > > > - * Revalidate a vma's eligibility to collapse nr hugepages.
> > > > > > > + * Revalidate a vma's eligibility to collapse nr hugepages. vm_flags_ignore
> > > > > > > + * can be used to ignore certain vma_flags that would otherwise be checked -
> > > > > > > + * the principal example being VM_NOHUGEPAGE which is ignored in madvise
> > > > > > > + * collapse context.
> > > > > > >   */
> > > > > > >  static int hugepage_vma_revalidate_pmd_count(struct mm_struct *mm,
> > > > > > >                                              unsigned long address, int nr,
> > > > > > > +                                            unsigned long vm_flags_ignore,
> > > > > > >                                              struct vm_area_struct **vmap)
> > > > > > >  {
> > > > > > >         struct vm_area_struct *vma;
> > > > > > > @@ -986,7 +990,7 @@ static int hugepage_vma_revalidate_pmd_count(struct mm_struct *mm,
> > > > > > >         hend = vma->vm_end & HPAGE_PMD_MASK;
> > > > > > >         if (address < hstart || (address + nr * HPAGE_PMD_SIZE) > hend)
> > > > > > >                 return SCAN_ADDRESS_RANGE;
> > > > > > > -       if (!hugepage_vma_check(vma, vma->vm_flags))
> > > > > > > +       if (!hugepage_vma_check(vma, vma->vm_flags & ~vm_flags_ignore))
> > > > > > >                 return SCAN_VMA_CHECK;
> > > > > > >         /* Anon VMA expected */
> > > > > > >         if (!vma->anon_vma || vma->vm_ops)
> > > > > > > @@ -1000,9 +1004,11 @@ static int hugepage_vma_revalidate_pmd_count(struct mm_struct *mm,
> > > > > > >   */
> > > > > > >
> > > > > > >  static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
> > > > > > > +                                  unsigned long vm_flags_ignore,
> > > > > > >                                    struct vm_area_struct **vmap)
> > > > > > >  {
> > > > > > > -       return hugepage_vma_revalidate_pmd_count(mm, address, 1, vmap);
> > > > > > > +       return hugepage_vma_revalidate_pmd_count(mm, address, 1,
> > > > > > > +                       vm_flags_ignore, vmap);
> > > > > > >  }
> > > > > > >
> > > > > > >  /*
> > > > > > > @@ -1043,7 +1049,7 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
> > > > > > >                 /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
> > > > > > >                 if (ret & VM_FAULT_RETRY) {
> > > > > > >                         mmap_read_lock(mm);
> > > > > > > -                       if (hugepage_vma_revalidate(mm, haddr, &vma)) {
> > > > > > > +                       if (hugepage_vma_revalidate(mm, haddr, VM_NONE, &vma)) {
> > > > > > >                                 /* vma is no longer available, don't continue to swapin */
> > > > > > >                                 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
> > > > > > >                                 return false;
> > > > > > > @@ -1200,7 +1206,7 @@ static void collapse_huge_page(struct mm_struct *mm,
> > > > > > >         count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
> > > > > > >
> > > > > > >         mmap_read_lock(mm);
> > > > > > > -       result = hugepage_vma_revalidate(mm, address, &vma);
> > > > > > > +       result = hugepage_vma_revalidate(mm, address, VM_NONE, &vma);
> > > > > > >         if (result) {
> > > > > > >                 mmap_read_unlock(mm);
> > > > > > >                 goto out_nolock;
> > > > > > > @@ -1232,7 +1238,7 @@ static void collapse_huge_page(struct mm_struct *mm,
> > > > > > >          */
> > > > > > >         mmap_write_lock(mm);
> > > > > > >
> > > > > > > -       result = hugepage_vma_revalidate(mm, address, &vma);
> > > > > > > +       result = hugepage_vma_revalidate(mm, address, VM_NONE, &vma);
> > > > > > >         if (result)
> > > > > > >                 goto out_up_write;
> > > > > > >         /* check if the pmd is still valid */
> > > > > > > --
> > > > > > > 2.35.1.616.g0bdcbb4464-goog
> > > > > > >
Zach O'Keefe March 10, 2022, 6:39 p.m. UTC | #9
On Thu, Mar 10, 2022 at 7:56 AM David Hildenbrand <david@redhat.com> wrote:
>
> On 08.03.22 22:34, Zach O'Keefe wrote:
> > In madvise collapse context, we optionally want to be able to ignore
> > advice from MADV_NOHUGEPAGE-marked regions.
> >
> > Add a vm_flags_ignore argument to hugepage_vma_revalidate_pmd_count()
> > which can be used to ignore vm flags used when considering thp
> > eligibility.
>
> arch/s390/mm/gmap.c:thp_split_mm() sets VM_NOHUGEPAGE to make sure there
> are *really* no thp. Being able to bypass that would break KVM horribly.
>
> Ignoring MADV_NOHUGEPAGE/VM_NOHUGEPAGE feels like the wrong way to go.
>
>
> What about a prctl instead, to disable any khugepagd activity and just
> let that process control it manually?
>
> --
> Thanks,
>
> David / dhildenb
>

Hey David - thanks for reviewing and commenting, and certainly thank
you for finding this bug.

This seems like good motivation for not ignoring VM_NOHUGEPAGE.
arch/powerpc also uses this flag for its own purposes (though it's
user-directed in that particular case).

prctl(2) sounds very reasonable to me - thanks for the suggestion!

Thanks,
Zach
David Rientjes March 10, 2022, 6:46 p.m. UTC | #10
On Thu, 10 Mar 2022, Yang Shi wrote:

> > This separates "async-hint" vs "sync-explicit" madvise requests.
> > MADV_[NO]HUGEPAGE are hints, and together with thp settings, advise
> > the kernel how to treat memory in the future. The kernel uses
> > VM_[NO]HUGEPAGE to aid with this. MADV_COLLAPSE, as an explicit
> > request, is free to define its own defrag semantics.
> >
> > This would allow flexibility to separately define async vs sync thp
> > policies. For example, highly tuned userspace applications that are
> > sensitive to unexpected latency might want to manage their hugepages
> > utilization themselves, and ask khugepaged to stay away. There is no
> > way in "always" mode to do this without setting VM_NOHUGEPAGE.
> 
> I don't quite get why you set THP to always but don't want to
> khugepaged do its job. It may be slow, I think this is why you
> introduce MADV_COLLAPSE, right? But it doesn't mean khugepaged can't
> scan the same area, it just doesn't do any real work and waste some
> cpu cycles. But I guess MADV_COLLAPSE doesn't prevent the PMD/THP from
> being split, right? So khugepaged still plays a role to re-collapse
> the area without calling MADV_COLLAPSE over again and again.
> 

My only real concern for MADV_COLLAPSE was when the span being collapsed 
includes a mixture of both VM_HUGEPAGE and VM_NOHUGEPAGE.  Does this 
collapse over the eligible memory or does it fail entirely?

I'd think it was the former, that we should respect VM_NOHUGEPAGE and only 
collapse eligible memory when doing MADV_COLLAPSE but now userspace 
struggles to know whether it was a partial collapse because of 
ineligibility or because we just couldn't allocate a hugepage.

It has the information to figure this out on its own, so given the use of 
VM_NOHUGEPAGE for non-MADV_NOHUGEPAGE purposes, I think it makes sense to 
simply ignore these vmas as part of the collapse request.
Zach O'Keefe March 10, 2022, 6:53 p.m. UTC | #11
On Thu, Mar 10, 2022 at 10:17 AM Yang Shi <shy828301@gmail.com> wrote:
>
> On Thu, Mar 10, 2022 at 7:51 AM Zach O'Keefe <zokeefe@google.com> wrote:
> >
> > On Wed, Mar 9, 2022 at 6:16 PM Yang Shi <shy828301@gmail.com> wrote:
> > >
> > > On Wed, Mar 9, 2022 at 5:10 PM Zach O'Keefe <zokeefe@google.com> wrote:
> > > >
> > > > On Wed, Mar 9, 2022 at 4:41 PM Yang Shi <shy828301@gmail.com> wrote:
> > > > >
> > > > > On Wed, Mar 9, 2022 at 4:01 PM Zach O'Keefe <zokeefe@google.com> wrote:
> > > > > >
> > > > > > > On Tue, Mar 8, 2022 at 1:35 PM Zach O'Keefe <zokeefe@google.com> wrote:
> > > > > > > >
> > > > > > > > In madvise collapse context, we optionally want to be able to ignore
> > > > > > > > advice from MADV_NOHUGEPAGE-marked regions.
> > > > > > >
> > > > > > > Could you please elaborate why this usecase is valid? Typically
> > > > > > > MADV_NOHUGEPAGE is set when the users really don't want to have THP
> > > > > > > for this area. So it doesn't make too much sense to ignore it IMHO.
> > > > > > >
> > > > > >
> > > > > > Hey Yang, thanks for taking time to review and comment.
> > > > > >
> > > > > > Semantically, the way I see it, is that MADV_NOHUGEPAGE is a way for
> > > > > > the user to say "I don't want hugepages here", so that the kernel
> > > > > > knows not to do so when faulting memory, and khugepaged can stay away.
> > > > > > However, in MADV_COLLAPSE, the user is explicitly requesting this be
> > > > > > backed by hugepages - so presumably that is exactly what they want.
> > > > > >
> > > > > > IOW, if the user didn't want this memory to be backed by hugepages,
> > > > > > they wouldn't be MADV_COLLAPSE'ing it. If there was a range of memory
> > > > > > the user wanted collapsed, but that had some sub-areas marked
> > > > > > MADV_NOHUGEPAGE, they could always issue multiple MADV_COLLAPSE
> > > > > > operations around the excluded regions.
> > > > > >
> > > > > > In terms of use cases, I don't have a concrete example, but a user
> > > > > > could hypothetically choose to exclude regions from management from
> > > > > > khugepaged, but still be able to collapse the memory themselves,
> > > > > > when/if they deem appropriate.
> > > > >
> > > > > I see. It seems you thought MADV_COLLAPSE actually unsets
> > > > > VM_NOHUGEPAGE, and is kind of equal to MADV_HUGEPAGE + doing collapse
> > > > > right away, right? To some degree, it makes some sense.
> > > >
> > > > Currently, MADV_COLLAPSE doesn't alter the vma flags at all - it just
> > > > ignores VM_NOHUGEPAGE, and so it's not really the same as
> > > > MADV_HUGEPAGE + MADV_COLLAPSE (which would set VM_HUGEPAGE in addition
> > > > to clearing VM_NOHUGEPAGE). If my use case has any merit (and I'm not
> > > > sure it does) then we don't want to be altering the vma flags since we
> > > > don't want to touch khugepaged behavior.
> > > >
> > > > > If this is the
> > > > > behavior you'd like to achieve, I'd suggest making it more explicit,
> > > > > for example, setting VM_HUGEPAGE for the MADV_COLLAPSE area rather
> > > > > than ignore or change vm flags silently. When using madvise mode, but
> > > > > not having VM_HUGEPAGE set, the vma check should fail in the current
> > > > > code (I didn't look hard if you already covered this or not).
> > > > >
> > > >
> > > > You're correct, this will fail, since it's following the same
> > > > semantics as the fault path. I see what you're saying though; that
> > > > perhaps this is inconsistent with my above reasoning that "the user
> > > > asked to collapse this memory, and so we should do it". If so, then
> > > > perhaps MADV_COLLAPSE just ignores madise mode and VM_[NO]HUGEPAGE
> > > > entirely for the purposes of eligibility, and only uses it for the
> > > > purposes of determining gfp flags for compaction/reclaim. Pushing that
> > > > further, compaction/reclaim could entirely be specified by the user
> > > > using a process_madvise(2) flag (later in the series, we do something
> > > > like this).
> > >
> > > Anyway I think we could have two options for MADV_COLLAPSE:
> > >
> > > 1. Just treat it as a hint (nice to have, best effort). It should obey
> > > all the settings. Skip VM_NOHUGEPAGE vmas or vmas without VM_HUGEPAGE
> > > if madvise mode, etc.
> > >
> > > 2. Much stronger. It equals MADV_HUGEPAGE + synchronous collapse. It
> > > should set vma flags properly as I suggested.
> > >
> > > Either is fine to me. But I don't prefer something in between personally.
> > >
> >
> > Makes sense to be consistent. Of these, #1 seems the most
> > straightforward to use. Doing an MADV_COLLAPSE on a VM_NOHUGEPAGE vma
> > seems like a corner case. The more likely scenario is MADV_COLLAPSE on
> > an unflagged (neither VM_HUGEPAGE or VM_NOHUGEPAGE) vma - in which
> > case it's less intrusive to not additionally set VM_HUGEPAGE (though
> > the user can always do so if they wish). It's a little more consistent
> > with "always" mode, where MADV_HUGEPAGE isn't necessary for
> > eligibility. It'll also reduce some code complexity.
> >
> > I'll float one last option your way, however:
> >
> > 3. The collapsed region is always eligible, regardless of vma flags or
> > thp settings (except "never"?). For process_madvise(2), a flag will
> > explicitly specify defrag semantics.
>
> This is what I meant for #2 IIUC. Defrag could follow the system's
> defrag setting rather than the khugepaged's.
>
> But it may break s390 as David pointed out.
>
> >
> > This separates "async-hint" vs "sync-explicit" madvise requests.
> > MADV_[NO]HUGEPAGE are hints, and together with thp settings, advise
> > the kernel how to treat memory in the future. The kernel uses
> > VM_[NO]HUGEPAGE to aid with this. MADV_COLLAPSE, as an explicit
> > request, is free to define its own defrag semantics.
> >
> > This would allow flexibility to separately define async vs sync thp
> > policies. For example, highly tuned userspace applications that are
> > sensitive to unexpected latency might want to manage their hugepages
> > utilization themselves, and ask khugepaged to stay away. There is no
> > way in "always" mode to do this without setting VM_NOHUGEPAGE.
>
> I don't quite get why you set THP to always but don't want to
> khugepaged do its job. It may be slow, I think this is why you
> introduce MADV_COLLAPSE, right? But it doesn't mean khugepaged can't
> scan the same area, it just doesn't do any real work and waste some
> cpu cycles. But I guess MADV_COLLAPSE doesn't prevent the PMD/THP from
> being split, right? So khugepaged still plays a role to re-collapse
> the area without calling MADV_COLLAPSE over again and again.
>

Ya, I agree that the common case is that, if you are MADV_COLLAPSE'ing
memory, chances are you just want that memory backed by hugepages -
and so if that area were to be split, presumably we'd want khugepaged
to come and recollapse when possible.

I think the (possibly contrived) use case I was thinking about was a
program that (a) didn't have the ability to change thp settings
("always"), and (b) wanted to manage its own hugepages. If a concrete
use case like this did arise, I think David H.'s suggestion of using
prctl(2) would work.

> >
> > > >
> > > >
> > > > > >
> > > > > > > >
> > > > > > > > Add a vm_flags_ignore argument to hugepage_vma_revalidate_pmd_count()
> > > > > > > > which can be used to ignore vm flags used when considering thp
> > > > > > > > eligibility.
> > > > > > > >
> > > > > > > > Signed-off-by: Zach O'Keefe <zokeefe@google.com>
> > > > > > > > ---
> > > > > > > >  mm/khugepaged.c | 18 ++++++++++++------
> > > > > > > >  1 file changed, 12 insertions(+), 6 deletions(-)
> > > > > > > >
> > > > > > > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > > > > > > index 1d20be47bcea..ecbd3fc41c80 100644
> > > > > > > > --- a/mm/khugepaged.c
> > > > > > > > +++ b/mm/khugepaged.c
> > > > > > > > @@ -964,10 +964,14 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
> > > > > > > >  #endif
> > > > > > > >
> > > > > > > >  /*
> > > > > > > > - * Revalidate a vma's eligibility to collapse nr hugepages.
> > > > > > > > + * Revalidate a vma's eligibility to collapse nr hugepages. vm_flags_ignore
> > > > > > > > + * can be used to ignore certain vma_flags that would otherwise be checked -
> > > > > > > > + * the principal example being VM_NOHUGEPAGE which is ignored in madvise
> > > > > > > > + * collapse context.
> > > > > > > >   */
> > > > > > > >  static int hugepage_vma_revalidate_pmd_count(struct mm_struct *mm,
> > > > > > > >                                              unsigned long address, int nr,
> > > > > > > > +                                            unsigned long vm_flags_ignore,
> > > > > > > >                                              struct vm_area_struct **vmap)
> > > > > > > >  {
> > > > > > > >         struct vm_area_struct *vma;
> > > > > > > > @@ -986,7 +990,7 @@ static int hugepage_vma_revalidate_pmd_count(struct mm_struct *mm,
> > > > > > > >         hend = vma->vm_end & HPAGE_PMD_MASK;
> > > > > > > >         if (address < hstart || (address + nr * HPAGE_PMD_SIZE) > hend)
> > > > > > > >                 return SCAN_ADDRESS_RANGE;
> > > > > > > > -       if (!hugepage_vma_check(vma, vma->vm_flags))
> > > > > > > > +       if (!hugepage_vma_check(vma, vma->vm_flags & ~vm_flags_ignore))
> > > > > > > >                 return SCAN_VMA_CHECK;
> > > > > > > >         /* Anon VMA expected */
> > > > > > > >         if (!vma->anon_vma || vma->vm_ops)
> > > > > > > > @@ -1000,9 +1004,11 @@ static int hugepage_vma_revalidate_pmd_count(struct mm_struct *mm,
> > > > > > > >   */
> > > > > > > >
> > > > > > > >  static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
> > > > > > > > +                                  unsigned long vm_flags_ignore,
> > > > > > > >                                    struct vm_area_struct **vmap)
> > > > > > > >  {
> > > > > > > > -       return hugepage_vma_revalidate_pmd_count(mm, address, 1, vmap);
> > > > > > > > +       return hugepage_vma_revalidate_pmd_count(mm, address, 1,
> > > > > > > > +                       vm_flags_ignore, vmap);
> > > > > > > >  }
> > > > > > > >
> > > > > > > >  /*
> > > > > > > > @@ -1043,7 +1049,7 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
> > > > > > > >                 /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
> > > > > > > >                 if (ret & VM_FAULT_RETRY) {
> > > > > > > >                         mmap_read_lock(mm);
> > > > > > > > -                       if (hugepage_vma_revalidate(mm, haddr, &vma)) {
> > > > > > > > +                       if (hugepage_vma_revalidate(mm, haddr, VM_NONE, &vma)) {
> > > > > > > >                                 /* vma is no longer available, don't continue to swapin */
> > > > > > > >                                 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
> > > > > > > >                                 return false;
> > > > > > > > @@ -1200,7 +1206,7 @@ static void collapse_huge_page(struct mm_struct *mm,
> > > > > > > >         count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
> > > > > > > >
> > > > > > > >         mmap_read_lock(mm);
> > > > > > > > -       result = hugepage_vma_revalidate(mm, address, &vma);
> > > > > > > > +       result = hugepage_vma_revalidate(mm, address, VM_NONE, &vma);
> > > > > > > >         if (result) {
> > > > > > > >                 mmap_read_unlock(mm);
> > > > > > > >                 goto out_nolock;
> > > > > > > > @@ -1232,7 +1238,7 @@ static void collapse_huge_page(struct mm_struct *mm,
> > > > > > > >          */
> > > > > > > >         mmap_write_lock(mm);
> > > > > > > >
> > > > > > > > -       result = hugepage_vma_revalidate(mm, address, &vma);
> > > > > > > > +       result = hugepage_vma_revalidate(mm, address, VM_NONE, &vma);
> > > > > > > >         if (result)
> > > > > > > >                 goto out_up_write;
> > > > > > > >         /* check if the pmd is still valid */
> > > > > > > > --
> > > > > > > > 2.35.1.616.g0bdcbb4464-goog
> > > > > > > >
David Rientjes March 10, 2022, 6:54 p.m. UTC | #12
On Thu, 10 Mar 2022, David Hildenbrand wrote:

> On 08.03.22 22:34, Zach O'Keefe wrote:
> > In madvise collapse context, we optionally want to be able to ignore
> > advice from MADV_NOHUGEPAGE-marked regions.
> > 
> > Add a vm_flags_ignore argument to hugepage_vma_revalidate_pmd_count()
> > which can be used to ignore vm flags used when considering thp
> > eligibility.
> 
> arch/s390/mm/gmap.c:thp_split_mm() sets VM_NOHUGEPAGE to make sure there
> are *really* no thp. Being able to bypass that would break KVM horribly.
> 
> Ignoring MADV_NOHUGEPAGE/VM_NOHUGEPAGE feels like the wrong way to go.
> 

Agreed, we'll have to remove this possibility.

> What about a prctl instead, to disable any khugepagd activity and just
> let that process control it manually?
> 

No objection to the prctl, although it's unfortunate that the existing 
PR_SET_THP_DISABLE simply disables thp for the process entirely for any 
non-zero value and that this wasn't implemented as a bitmask to specify 
future behavior where this new behavior could be defined :/

I'll note, however, that we'd have no immediate use case ourselves for the 
prctl, although others may.  Our approach will likely be to disable 
khugepaged entirely in favor of outsourcing hugepage policy decisions to 
userspace based on a number of different signals.  (In fact, also doing 
thp enabled = madvise system wide)
Zach O'Keefe March 10, 2022, 6:58 p.m. UTC | #13
On Thu, Mar 10, 2022 at 10:46 AM David Rientjes <rientjes@google.com> wrote:
>
> On Thu, 10 Mar 2022, Yang Shi wrote:
>
> > > This separates "async-hint" vs "sync-explicit" madvise requests.
> > > MADV_[NO]HUGEPAGE are hints, and together with thp settings, advise
> > > the kernel how to treat memory in the future. The kernel uses
> > > VM_[NO]HUGEPAGE to aid with this. MADV_COLLAPSE, as an explicit
> > > request, is free to define its own defrag semantics.
> > >
> > > This would allow flexibility to separately define async vs sync thp
> > > policies. For example, highly tuned userspace applications that are
> > > sensitive to unexpected latency might want to manage their hugepages
> > > utilization themselves, and ask khugepaged to stay away. There is no
> > > way in "always" mode to do this without setting VM_NOHUGEPAGE.
> >
> > I don't quite get why you set THP to always but don't want to
> > khugepaged do its job. It may be slow, I think this is why you
> > introduce MADV_COLLAPSE, right? But it doesn't mean khugepaged can't
> > scan the same area, it just doesn't do any real work and waste some
> > cpu cycles. But I guess MADV_COLLAPSE doesn't prevent the PMD/THP from
> > being split, right? So khugepaged still plays a role to re-collapse
> > the area without calling MADV_COLLAPSE over again and again.
> >
>
> My only real concern for MADV_COLLAPSE was when the span being collapsed
> includes a mixture of both VM_HUGEPAGE and VM_NOHUGEPAGE.  Does this
> collapse over the eligible memory or does it fail entirely?
>
> I'd think it was the former, that we should respect VM_NOHUGEPAGE and only
> collapse eligible memory when doing MADV_COLLAPSE but now userspace
> struggles to know whether it was a partial collapse because of
> ineligiblity or because we just couldn't allocate a hugepage.
>
> It has the information to figure this out on its own, so given the use of
> VM_NOHUGEPAGE for non-MADV_NOHUGEPAGE purposes, I think it makes sense to
> simply ignore these vmas as part of the collapse request.

Ignoring these vmas SGTM. Thanks All.
Yang Shi March 10, 2022, 7:54 p.m. UTC | #14
On Thu, Mar 10, 2022 at 10:46 AM David Rientjes <rientjes@google.com> wrote:
>
> On Thu, 10 Mar 2022, Yang Shi wrote:
>
> > > This separates "async-hint" vs "sync-explicit" madvise requests.
> > > MADV_[NO]HUGEPAGE are hints, and together with thp settings, advise
> > > the kernel how to treat memory in the future. The kernel uses
> > > VM_[NO]HUGEPAGE to aid with this. MADV_COLLAPSE, as an explicit
> > > request, is free to define its own defrag semantics.
> > >
> > > This would allow flexibility to separately define async vs sync thp
> > > policies. For example, highly tuned userspace applications that are
> > > sensitive to unexpected latency might want to manage their hugepages
> > > utilization themselves, and ask khugepaged to stay away. There is no
> > > way in "always" mode to do this without setting VM_NOHUGEPAGE.
> >
> > I don't quite get why you set THP to always but don't want to
> > khugepaged do its job. It may be slow, I think this is why you
> > introduce MADV_COLLAPSE, right? But it doesn't mean khugepaged can't
> > scan the same area, it just doesn't do any real work and waste some
> > cpu cycles. But I guess MADV_COLLAPSE doesn't prevent the PMD/THP from
> > being split, right? So khugepaged still plays a role to re-collapse
> > the area without calling MADV_COLLAPSE over again and again.
> >
>
> My only real concern for MADV_COLLAPSE was when the span being collapsed
> includes a mixture of both VM_HUGEPAGE and VM_NOHUGEPAGE.  Does this
> collapse over the eligible memory or does it fail entirely?
>
> I'd think it was the former, that we should respect VM_NOHUGEPAGE and only
> collapse eligible memory when doing MADV_COLLAPSE but now userspace
> struggles to know whether it was a partial collapse because of
> ineligiblity or because we just couldn't allocate a hugepage.

Yes, I agree we should just try to collapse eligible vmas.

Since we are using madvise, we'd better follow its convention. We
could return different values for different failures, for example:
1. All vmas are collapsed successfully, return 0 (success)
2. Run into ineligible vma, return -EINVAL
3. Can't allocate hugepage, return -ENOMEM

Or just simply return 0 for success or a single error code for all
failure cases.

>
> It has the information to figure this out on its own, so given the use of
> VM_NOHUGEPAGE for non-MADV_NOHUGEPAGE purposes, I think it makes sense to
> simply ignore these vmas as part of the collapse request.
Zach O'Keefe March 10, 2022, 8:24 p.m. UTC | #15
On Thu, Mar 10, 2022 at 11:54 AM Yang Shi <shy828301@gmail.com> wrote:
>
> On Thu, Mar 10, 2022 at 10:46 AM David Rientjes <rientjes@google.com> wrote:
> >
> > On Thu, 10 Mar 2022, Yang Shi wrote:
> >
> > > > This separates "async-hint" vs "sync-explicit" madvise requests.
> > > > MADV_[NO]HUGEPAGE are hints, and together with thp settings, advise
> > > > the kernel how to treat memory in the future. The kernel uses
> > > > VM_[NO]HUGEPAGE to aid with this. MADV_COLLAPSE, as an explicit
> > > > request, is free to define its own defrag semantics.
> > > >
> > > > This would allow flexibility to separately define async vs sync thp
> > > > policies. For example, highly tuned userspace applications that are
> > > > sensitive to unexpected latency might want to manage their hugepages
> > > > utilization themselves, and ask khugepaged to stay away. There is no
> > > > way in "always" mode to do this without setting VM_NOHUGEPAGE.
> > >
> > > I don't quite get why you set THP to always but don't want to
> > > khugepaged do its job. It may be slow, I think this is why you
> > > introduce MADV_COLLAPSE, right? But it doesn't mean khugepaged can't
> > > scan the same area, it just doesn't do any real work and waste some
> > > cpu cycles. But I guess MADV_COLLAPSE doesn't prevent the PMD/THP from
> > > being split, right? So khugepaged still plays a role to re-collapse
> > > the area without calling MADV_COLLAPSE over again and again.
> > >
> >
> > My only real concern for MADV_COLLAPSE was when the span being collapsed
> > includes a mixture of both VM_HUGEPAGE and VM_NOHUGEPAGE.  Does this
> > collapse over the eligible memory or does it fail entirely?
> >
> > I'd think it was the former, that we should respect VM_NOHUGEPAGE and only
> > collapse eligible memory when doing MADV_COLLAPSE but now userspace
> > struggles to know whether it was a partial collapse because of
> > ineligiblity or because we just couldn't allocate a hugepage.
>
> Yes, I agree we should just try to collapse eligible vmas.
>
> Since we are using madvise, we'd better follow its convention. We
> could return different values for different failures, for example:
> 1. All vmas are collapsed successfully, return 0 (success)
> 2. Run into ineligible vma, return -EINVAL
> 3. Can't allocate hugepage, return -ENOMEM
>
> Or just simply return 0 for success or a single error code for all
> failure cases.
>

Different codes have a benefit (assuming -EINVAL takes precedence over
-EAGAIN (AFAIK madvise convention for mem not available)): A lazy user
wouldn't need to read smaps if -EAGAIN; they could just reissue the
syscall again over the same range at a later time.

> >
> > It has the information to figure this out on its own, so given the use of
> > VM_NOHUGEPAGE for non-MADV_NOHUGEPAGE purposes, I think it makes sense to
> > simply ignore these vmas as part of the collapse request.
Michal Hocko March 21, 2022, 2:27 p.m. UTC | #16
[Dropped Richard Henderson from the CC list as the delivery fails for
 him]

On Thu 10-03-22 10:54:03, David Rientjes wrote:
> On Thu, 10 Mar 2022, David Hildenbrand wrote:
> 
> > On 08.03.22 22:34, Zach O'Keefe wrote:
> > > In madvise collapse context, we optionally want to be able to ignore
> > > advice from MADV_NOHUGEPAGE-marked regions.
> > > 
> > > Add a vm_flags_ignore argument to hugepage_vma_revalidate_pmd_count()
> > > which can be used to ignore vm flags used when considering thp
> > > eligibility.
> > 
> > arch/s390/mm/gmap.c:thp_split_mm() sets VM_NOHUGEPAGE to make sure there
> > are *really* no thp. Being able to bypass that would break KVM horribly.
> > 
> > Ignoring MADV_NOHUGEPAGE/VM_NOHUGEPAGE feels like the wrong way to go.
> > 
> 
> Agreed, we'll have to remove this possibility.

yeah, this sounds like a bug to me.

> > What about a prctl instead, to disable any khugepagd activity and just
> > let that process control it manually?
> > 
> 
> No objection to the prctl, although it's unfortunate that the existing 
> PR_SET_THP_DISABLE simply disables thp for the process entirely for any 
> non-zero value and that this wasn't implemented as a bitmask to specify 
> future behavior where this new behavior could be defined :/

I do not think PR_SET_THP_DISABLE is any different from VM_NOHUGEPAGE.
The process (owner) has opted out from THPs for different reasons.
Those might be unknown to whoever calls the madvise call (including
itself).
diff mbox series

Patch

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 1d20be47bcea..ecbd3fc41c80 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -964,10 +964,14 @@  khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
 #endif
 
 /*
- * Revalidate a vma's eligibility to collapse nr hugepages.
+ * Revalidate a vma's eligibility to collapse nr hugepages. vm_flags_ignore
+ * can be used to ignore certain vma_flags that would otherwise be checked -
+ * the principal example being VM_NOHUGEPAGE which is ignored in madvise
+ * collapse context.
  */
 static int hugepage_vma_revalidate_pmd_count(struct mm_struct *mm,
 					     unsigned long address, int nr,
+					     unsigned long vm_flags_ignore,
 					     struct vm_area_struct **vmap)
 {
 	struct vm_area_struct *vma;
@@ -986,7 +990,7 @@  static int hugepage_vma_revalidate_pmd_count(struct mm_struct *mm,
 	hend = vma->vm_end & HPAGE_PMD_MASK;
 	if (address < hstart || (address + nr * HPAGE_PMD_SIZE) > hend)
 		return SCAN_ADDRESS_RANGE;
-	if (!hugepage_vma_check(vma, vma->vm_flags))
+	if (!hugepage_vma_check(vma, vma->vm_flags & ~vm_flags_ignore))
 		return SCAN_VMA_CHECK;
 	/* Anon VMA expected */
 	if (!vma->anon_vma || vma->vm_ops)
@@ -1000,9 +1004,11 @@  static int hugepage_vma_revalidate_pmd_count(struct mm_struct *mm,
  */
 
 static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
+				   unsigned long vm_flags_ignore,
 				   struct vm_area_struct **vmap)
 {
-	return hugepage_vma_revalidate_pmd_count(mm, address, 1, vmap);
+	return hugepage_vma_revalidate_pmd_count(mm, address, 1,
+			vm_flags_ignore, vmap);
 }
 
 /*
@@ -1043,7 +1049,7 @@  static bool __collapse_huge_page_swapin(struct mm_struct *mm,
 		/* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
 		if (ret & VM_FAULT_RETRY) {
 			mmap_read_lock(mm);
-			if (hugepage_vma_revalidate(mm, haddr, &vma)) {
+			if (hugepage_vma_revalidate(mm, haddr, VM_NONE, &vma)) {
 				/* vma is no longer available, don't continue to swapin */
 				trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
 				return false;
@@ -1200,7 +1206,7 @@  static void collapse_huge_page(struct mm_struct *mm,
 	count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
 
 	mmap_read_lock(mm);
-	result = hugepage_vma_revalidate(mm, address, &vma);
+	result = hugepage_vma_revalidate(mm, address, VM_NONE, &vma);
 	if (result) {
 		mmap_read_unlock(mm);
 		goto out_nolock;
@@ -1232,7 +1238,7 @@  static void collapse_huge_page(struct mm_struct *mm,
 	 */
 	mmap_write_lock(mm);
 
-	result = hugepage_vma_revalidate(mm, address, &vma);
+	result = hugepage_vma_revalidate(mm, address, VM_NONE, &vma);
 	if (result)
 		goto out_up_write;
 	/* check if the pmd is still valid */