
[v4,4/6] mm: Introduce a pageflag for partially mapped folios

Message ID 20240819023145.2415299-5-usamaarif642@gmail.com (mailing list archive)
State New
Series: mm: split underused THPs

Commit Message

Usama Arif Aug. 19, 2024, 2:30 a.m. UTC
Currently folio->_deferred_list is used to keep track of
partially_mapped folios that are going to be split under memory
pressure. In the next patch, all THPs that are faulted in and collapsed
by khugepaged are also going to be tracked using _deferred_list.

This patch introduces a pageflag to distinguish partially mapped
folios from other folios on the deferred_list at split time in
deferred_split_scan. It is needed because __folio_remove_rmap decrements
_mapcount, _large_mapcount and _entire_mapcount, so those counts can no
longer be used to tell partially mapped folios apart from others in
deferred_split_scan.

Even though this introduces an extra flag to track whether the folio is
partially mapped, there is no functional change intended with this
patch. The flag is not useful in this patch itself; it will become
useful in the next patch, when _deferred_list also holds
non-partially-mapped folios.

Signed-off-by: Usama Arif <usamaarif642@gmail.com>
---
 include/linux/huge_mm.h    |  4 ++--
 include/linux/page-flags.h | 11 +++++++++++
 mm/huge_memory.c           | 23 ++++++++++++++++-------
 mm/internal.h              |  4 +++-
 mm/memcontrol.c            |  3 ++-
 mm/migrate.c               |  3 ++-
 mm/page_alloc.c            |  5 +++--
 mm/rmap.c                  |  5 +++--
 mm/vmscan.c                |  3 ++-
 9 files changed, 44 insertions(+), 17 deletions(-)
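
As a quick illustration of the idea (a userspace toy, not kernel code; names such as toy_folio, deferred_add and scan_deferred_list are invented): by the time the deferred list is scanned the mapcounts have already been decremented, so once the list also holds fully mapped THPs, only a flag recorded at queueing time can say which entries are partially mapped.

#include <stdbool.h>
#include <stdio.h>

struct toy_folio {
        unsigned long flags;            /* bit 0 stands in for PG_partially_mapped */
        struct toy_folio *next;         /* stands in for folio->_deferred_list */
};

#define PARTIALLY_MAPPED (1UL << 0)

static void deferred_add(struct toy_folio **queue, struct toy_folio *folio,
                         bool partially_mapped)
{
        /* In the kernel this would happen under ds_queue->split_queue_lock. */
        if (partially_mapped)
                folio->flags |= PARTIALLY_MAPPED;
        folio->next = *queue;
        *queue = folio;
}

static void scan_deferred_list(struct toy_folio *queue)
{
        /* At scan time the flag, not a mapcount, tells the entries apart. */
        for (struct toy_folio *folio = queue; folio; folio = folio->next) {
                if (folio->flags & PARTIALLY_MAPPED)
                        printf("folio %p: partially mapped, try to split\n",
                               (void *)folio);
                else
                        printf("folio %p: fully mapped, handled by other policy\n",
                               (void *)folio);
        }
}

int main(void)
{
        struct toy_folio a = { 0 }, b = { 0 };
        struct toy_folio *queue = NULL;

        deferred_add(&queue, &a, true);         /* partially unmapped, e.g. after munmap() */
        deferred_add(&queue, &b, false);        /* fully mapped THP, tracked by the next patch */
        scan_deferred_list(queue);
        return 0;
}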

Comments

Barry Song Aug. 19, 2024, 8:29 a.m. UTC | #1
Hi Usama,

I feel it is much better now! thanks!

On Mon, Aug 19, 2024 at 2:31 PM Usama Arif <usamaarif642@gmail.com> wrote:
>
> Currently folio->_deferred_list is used to keep track of
> partially_mapped folios that are going to be split under memory
> pressure. In the next patch, all THPs that are faulted in and collapsed
> by khugepaged are also going to be tracked using _deferred_list.
>
> This patch introduces a pageflag to be able to distinguish between
> partially mapped folios and others in the deferred_list at split time in
> deferred_split_scan. Its needed as __folio_remove_rmap decrements
> _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
> possible to distinguish between partially mapped folios and others in
> deferred_split_scan.
>
> Eventhough it introduces an extra flag to track if the folio is
> partially mapped, there is no functional change intended with this
> patch and the flag is not useful in this patch itself, it will
> become useful in the next patch when _deferred_list has non partially
> mapped folios.
>
> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> ---
>  include/linux/huge_mm.h    |  4 ++--
>  include/linux/page-flags.h | 11 +++++++++++
>  mm/huge_memory.c           | 23 ++++++++++++++++-------
>  mm/internal.h              |  4 +++-
>  mm/memcontrol.c            |  3 ++-
>  mm/migrate.c               |  3 ++-
>  mm/page_alloc.c            |  5 +++--
>  mm/rmap.c                  |  5 +++--
>  mm/vmscan.c                |  3 ++-
>  9 files changed, 44 insertions(+), 17 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 4c32058cacfe..969f11f360d2 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
>  {
>         return split_huge_page_to_list_to_order(page, NULL, 0);
>  }
> -void deferred_split_folio(struct folio *folio);
> +void deferred_split_folio(struct folio *folio, bool partially_mapped);
>
>  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>                 unsigned long address, bool freeze, struct folio *folio);
> @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
>  {
>         return 0;
>  }
> -static inline void deferred_split_folio(struct folio *folio) {}
> +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
>  #define split_huge_pmd(__vma, __pmd, __address)        \
>         do { } while (0)
>
> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index a0a29bd092f8..c3bb0e0da581 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -182,6 +182,7 @@ enum pageflags {
>         /* At least one page in this folio has the hwpoison flag set */
>         PG_has_hwpoisoned = PG_active,
>         PG_large_rmappable = PG_workingset, /* anon or file-backed */
> +       PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */
>  };
>
>  #define PAGEFLAGS_MASK         ((1UL << NR_PAGEFLAGS) - 1)
> @@ -861,8 +862,18 @@ static inline void ClearPageCompound(struct page *page)
>         ClearPageHead(page);
>  }
>  FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
> +FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> +/*
> + * PG_partially_mapped is protected by deferred_split split_queue_lock,
> + * so its safe to use non-atomic set/clear.
> + */
> +__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> +__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
>  #else
>  FOLIO_FLAG_FALSE(large_rmappable)
> +FOLIO_TEST_FLAG_FALSE(partially_mapped)
> +__FOLIO_SET_FLAG_NOOP(partially_mapped)
> +__FOLIO_CLEAR_FLAG_NOOP(partially_mapped)
>  #endif
>
>  #define PG_head_mask ((1UL << PG_head))
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 2d77b5d2291e..70ee49dfeaad 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3398,6 +3398,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>                          * page_deferred_list.
>                          */
>                         list_del_init(&folio->_deferred_list);
> +                       __folio_clear_partially_mapped(folio);
>                 }
>                 spin_unlock(&ds_queue->split_queue_lock);
>                 if (mapping) {
> @@ -3454,11 +3455,13 @@ void __folio_undo_large_rmappable(struct folio *folio)
>         if (!list_empty(&folio->_deferred_list)) {
>                 ds_queue->split_queue_len--;
>                 list_del_init(&folio->_deferred_list);
> +               __folio_clear_partially_mapped(folio);

is it possible to make things clearer by

 if (folio_clear_partially_mapped)
    __folio_clear_partially_mapped(folio);

While writing without conditions isn't necessarily wrong, adding a condition
will improve the readability of the code and the clarity of my mTHP
counters series. It would also help decrease SMP cache synchronization if we
can avoid the unnecessary write.
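
A minimal userspace sketch of the pattern being suggested (invented helper names, not the kernel flag macros):

#include <stdbool.h>
#include <stdio.h>

struct toy_folio { unsigned long flags; };
#define PARTIALLY_MAPPED (1UL << 0)

static bool toy_test_partially_mapped(const struct toy_folio *folio)
{
        return folio->flags & PARTIALLY_MAPPED;
}

static void toy_clear_partially_mapped(struct toy_folio *folio)
{
        folio->flags &= ~PARTIALLY_MAPPED;      /* issues a store, dirtying the line */
}

int main(void)
{
        struct toy_folio folio = { 0 };

        /* Conditional form: most folios never had the bit set, so this is
         * usually a read-only access and no store is issued at all. */
        if (toy_test_partially_mapped(&folio))
                toy_clear_partially_mapped(&folio);

        printf("flags: %lu\n", folio.flags);
        return 0;
}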

>         }
>         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>  }
>
> -void deferred_split_folio(struct folio *folio)
> +/* partially_mapped=false won't clear PG_partially_mapped folio flag */
> +void deferred_split_folio(struct folio *folio, bool partially_mapped)
>  {
>         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
>  #ifdef CONFIG_MEMCG
> @@ -3486,14 +3489,19 @@ void deferred_split_folio(struct folio *folio)
>         if (folio_test_swapcache(folio))
>                 return;
>
> -       if (!list_empty(&folio->_deferred_list))
> -               return;
> -
>         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> +       if (partially_mapped) {
> +               if (!folio_test_partially_mapped(folio)) {
> +                       __folio_set_partially_mapped(folio);
> +                       if (folio_test_pmd_mappable(folio))
> +                               count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> +                       count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> +               }
> +       } else {
> +               /* partially mapped folios cannot become non-partially mapped */
> +               VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
> +       }
>         if (list_empty(&folio->_deferred_list)) {
> -               if (folio_test_pmd_mappable(folio))
> -                       count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> -               count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
>                 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
>                 ds_queue->split_queue_len++;
>  #ifdef CONFIG_MEMCG
> @@ -3542,6 +3550,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>                 } else {
>                         /* We lost race with folio_put() */
>                         list_del_init(&folio->_deferred_list);
> +                       __folio_clear_partially_mapped(folio);

as above? Do we also need if(test) for split_huge_page_to_list_to_order()?

>                         ds_queue->split_queue_len--;
>                 }
>                 if (!--sc->nr_to_scan)
> diff --git a/mm/internal.h b/mm/internal.h
> index 52f7fc4e8ac3..27cbb5365841 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -662,8 +662,10 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
>         atomic_set(&folio->_entire_mapcount, -1);
>         atomic_set(&folio->_nr_pages_mapped, 0);
>         atomic_set(&folio->_pincount, 0);
> -       if (order > 1)
> +       if (order > 1) {
>                 INIT_LIST_HEAD(&folio->_deferred_list);
> +               __folio_clear_partially_mapped(folio);

if partially_mapped is true for a new folio, does it mean we already have
a bug somewhere?

How is it possible for a new folio to be partially mapped?

> +       }
>  }
>
>  static inline void prep_compound_tail(struct page *head, int tail_idx)
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e1ffd2950393..0fd95daecf9a 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4669,7 +4669,8 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
>         VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
>         VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
>                         !folio_test_hugetlb(folio) &&
> -                       !list_empty(&folio->_deferred_list), folio);
> +                       !list_empty(&folio->_deferred_list) &&
> +                       folio_test_partially_mapped(folio), folio);
>
>         /*
>          * Nobody should be changing or seriously looking at
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 2d2e65d69427..ef4a732f22b1 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -1735,7 +1735,8 @@ static int migrate_pages_batch(struct list_head *from,
>                          * use _deferred_list.
>                          */
>                         if (nr_pages > 2 &&
> -                          !list_empty(&folio->_deferred_list)) {
> +                          !list_empty(&folio->_deferred_list) &&
> +                          folio_test_partially_mapped(folio)) {
>                                 if (!try_split_folio(folio, split_folios, mode)) {
>                                         nr_failed++;
>                                         stats->nr_thp_failed += is_thp;
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 408ef3d25cf5..a145c550dd2a 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -957,8 +957,9 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
>                 break;
>         case 2:
>                 /* the second tail page: deferred_list overlaps ->mapping */
> -               if (unlikely(!list_empty(&folio->_deferred_list))) {
> -                       bad_page(page, "on deferred list");
> +               if (unlikely(!list_empty(&folio->_deferred_list) &&
> +                   folio_test_partially_mapped(folio))) {
> +                       bad_page(page, "partially mapped folio on deferred list");
>                         goto out;
>                 }
>                 break;
> diff --git a/mm/rmap.c b/mm/rmap.c
> index a6b9cd0b2b18..4c330635aa4e 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1578,8 +1578,9 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
>          * Check partially_mapped first to ensure it is a large folio.
>          */
>         if (partially_mapped && folio_test_anon(folio) &&
> -           list_empty(&folio->_deferred_list))
> -               deferred_split_folio(folio);
> +           !folio_test_partially_mapped(folio))
> +               deferred_split_folio(folio, true);
> +
>         __folio_mod_stat(folio, -nr, -nr_pmdmapped);
>
>         /*
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 25e43bb3b574..25f4e8403f41 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1233,7 +1233,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
>                                          * Split partially mapped folios right away.
>                                          * We can free the unmapped pages without IO.
>                                          */
> -                                       if (data_race(!list_empty(&folio->_deferred_list)) &&
> +                                       if (data_race(!list_empty(&folio->_deferred_list) &&
> +                                           folio_test_partially_mapped(folio)) &&
>                                             split_folio_to_list(folio, folio_list))
>                                                 goto activate_locked;
>                                 }
> --
> 2.43.5
>

Thanks
Barry
Barry Song Aug. 19, 2024, 8:30 a.m. UTC | #2
On Mon, Aug 19, 2024 at 8:29 PM Barry Song <baohua@kernel.org> wrote:
>
> Hi Usama,
>
> I feel it is much better now! thanks!
>
> On Mon, Aug 19, 2024 at 2:31 PM Usama Arif <usamaarif642@gmail.com> wrote:
> >
> > Currently folio->_deferred_list is used to keep track of
> > partially_mapped folios that are going to be split under memory
> > pressure. In the next patch, all THPs that are faulted in and collapsed
> > by khugepaged are also going to be tracked using _deferred_list.
> >
> > This patch introduces a pageflag to be able to distinguish between
> > partially mapped folios and others in the deferred_list at split time in
> > deferred_split_scan. Its needed as __folio_remove_rmap decrements
> > _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
> > possible to distinguish between partially mapped folios and others in
> > deferred_split_scan.
> >
> > Eventhough it introduces an extra flag to track if the folio is
> > partially mapped, there is no functional change intended with this
> > patch and the flag is not useful in this patch itself, it will
> > become useful in the next patch when _deferred_list has non partially
> > mapped folios.
> >
> > Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> > ---
> >  include/linux/huge_mm.h    |  4 ++--
> >  include/linux/page-flags.h | 11 +++++++++++
> >  mm/huge_memory.c           | 23 ++++++++++++++++-------
> >  mm/internal.h              |  4 +++-
> >  mm/memcontrol.c            |  3 ++-
> >  mm/migrate.c               |  3 ++-
> >  mm/page_alloc.c            |  5 +++--
> >  mm/rmap.c                  |  5 +++--
> >  mm/vmscan.c                |  3 ++-
> >  9 files changed, 44 insertions(+), 17 deletions(-)
> >
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index 4c32058cacfe..969f11f360d2 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
> >  {
> >         return split_huge_page_to_list_to_order(page, NULL, 0);
> >  }
> > -void deferred_split_folio(struct folio *folio);
> > +void deferred_split_folio(struct folio *folio, bool partially_mapped);
> >
> >  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> >                 unsigned long address, bool freeze, struct folio *folio);
> > @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
> >  {
> >         return 0;
> >  }
> > -static inline void deferred_split_folio(struct folio *folio) {}
> > +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
> >  #define split_huge_pmd(__vma, __pmd, __address)        \
> >         do { } while (0)
> >
> > diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> > index a0a29bd092f8..c3bb0e0da581 100644
> > --- a/include/linux/page-flags.h
> > +++ b/include/linux/page-flags.h
> > @@ -182,6 +182,7 @@ enum pageflags {
> >         /* At least one page in this folio has the hwpoison flag set */
> >         PG_has_hwpoisoned = PG_active,
> >         PG_large_rmappable = PG_workingset, /* anon or file-backed */
> > +       PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */
> >  };
> >
> >  #define PAGEFLAGS_MASK         ((1UL << NR_PAGEFLAGS) - 1)
> > @@ -861,8 +862,18 @@ static inline void ClearPageCompound(struct page *page)
> >         ClearPageHead(page);
> >  }
> >  FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
> > +FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> > +/*
> > + * PG_partially_mapped is protected by deferred_split split_queue_lock,
> > + * so its safe to use non-atomic set/clear.
> > + */
> > +__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> > +__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> >  #else
> >  FOLIO_FLAG_FALSE(large_rmappable)
> > +FOLIO_TEST_FLAG_FALSE(partially_mapped)
> > +__FOLIO_SET_FLAG_NOOP(partially_mapped)
> > +__FOLIO_CLEAR_FLAG_NOOP(partially_mapped)
> >  #endif
> >
> >  #define PG_head_mask ((1UL << PG_head))
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 2d77b5d2291e..70ee49dfeaad 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -3398,6 +3398,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> >                          * page_deferred_list.
> >                          */
> >                         list_del_init(&folio->_deferred_list);
> > +                       __folio_clear_partially_mapped(folio);
> >                 }
> >                 spin_unlock(&ds_queue->split_queue_lock);
> >                 if (mapping) {
> > @@ -3454,11 +3455,13 @@ void __folio_undo_large_rmappable(struct folio *folio)
> >         if (!list_empty(&folio->_deferred_list)) {
> >                 ds_queue->split_queue_len--;
> >                 list_del_init(&folio->_deferred_list);
> > +               __folio_clear_partially_mapped(folio);
>
> is it possible to make things clearer by
>
>  if (folio_clear_partially_mapped)
>     __folio_clear_partially_mapped(folio);

Sorry for the typo, I meant:
if (folio_test_partially_mapped(folio))
    __folio_clear_partially_mapped(folio);

>
> While writing without conditions isn't necessarily wrong, adding a condition
> will improve the readability of the code and enhance the clarity of my mTHP
> counters series. also help decrease smp cache sync if we can avoid
> unnecessary writing?
>
> >         }
> >         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> >  }
> >
> > -void deferred_split_folio(struct folio *folio)
> > +/* partially_mapped=false won't clear PG_partially_mapped folio flag */
> > +void deferred_split_folio(struct folio *folio, bool partially_mapped)
> >  {
> >         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
> >  #ifdef CONFIG_MEMCG
> > @@ -3486,14 +3489,19 @@ void deferred_split_folio(struct folio *folio)
> >         if (folio_test_swapcache(folio))
> >                 return;
> >
> > -       if (!list_empty(&folio->_deferred_list))
> > -               return;
> > -
> >         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> > +       if (partially_mapped) {
> > +               if (!folio_test_partially_mapped(folio)) {
> > +                       __folio_set_partially_mapped(folio);
> > +                       if (folio_test_pmd_mappable(folio))
> > +                               count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> > +                       count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> > +               }
> > +       } else {
> > +               /* partially mapped folios cannot become non-partially mapped */
> > +               VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
> > +       }
> >         if (list_empty(&folio->_deferred_list)) {
> > -               if (folio_test_pmd_mappable(folio))
> > -                       count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> > -               count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> >                 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
> >                 ds_queue->split_queue_len++;
> >  #ifdef CONFIG_MEMCG
> > @@ -3542,6 +3550,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
> >                 } else {
> >                         /* We lost race with folio_put() */
> >                         list_del_init(&folio->_deferred_list);
> > +                       __folio_clear_partially_mapped(folio);
>
> as above? Do we also need if(test) for split_huge_page_to_list_to_order()?
>
> >                         ds_queue->split_queue_len--;
> >                 }
> >                 if (!--sc->nr_to_scan)
> > diff --git a/mm/internal.h b/mm/internal.h
> > index 52f7fc4e8ac3..27cbb5365841 100644
> > --- a/mm/internal.h
> > +++ b/mm/internal.h
> > @@ -662,8 +662,10 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
> >         atomic_set(&folio->_entire_mapcount, -1);
> >         atomic_set(&folio->_nr_pages_mapped, 0);
> >         atomic_set(&folio->_pincount, 0);
> > -       if (order > 1)
> > +       if (order > 1) {
> >                 INIT_LIST_HEAD(&folio->_deferred_list);
> > +               __folio_clear_partially_mapped(folio);
>
> if partially_mapped is true for a new folio, does it mean we already have
> a bug somewhere?
>
> How is it possible for a new folio to be partially mapped?
>
> > +       }
> >  }
> >
> >  static inline void prep_compound_tail(struct page *head, int tail_idx)
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index e1ffd2950393..0fd95daecf9a 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -4669,7 +4669,8 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
> >         VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
> >         VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
> >                         !folio_test_hugetlb(folio) &&
> > -                       !list_empty(&folio->_deferred_list), folio);
> > +                       !list_empty(&folio->_deferred_list) &&
> > +                       folio_test_partially_mapped(folio), folio);
> >
> >         /*
> >          * Nobody should be changing or seriously looking at
> > diff --git a/mm/migrate.c b/mm/migrate.c
> > index 2d2e65d69427..ef4a732f22b1 100644
> > --- a/mm/migrate.c
> > +++ b/mm/migrate.c
> > @@ -1735,7 +1735,8 @@ static int migrate_pages_batch(struct list_head *from,
> >                          * use _deferred_list.
> >                          */
> >                         if (nr_pages > 2 &&
> > -                          !list_empty(&folio->_deferred_list)) {
> > +                          !list_empty(&folio->_deferred_list) &&
> > +                          folio_test_partially_mapped(folio)) {
> >                                 if (!try_split_folio(folio, split_folios, mode)) {
> >                                         nr_failed++;
> >                                         stats->nr_thp_failed += is_thp;
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 408ef3d25cf5..a145c550dd2a 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -957,8 +957,9 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
> >                 break;
> >         case 2:
> >                 /* the second tail page: deferred_list overlaps ->mapping */
> > -               if (unlikely(!list_empty(&folio->_deferred_list))) {
> > -                       bad_page(page, "on deferred list");
> > +               if (unlikely(!list_empty(&folio->_deferred_list) &&
> > +                   folio_test_partially_mapped(folio))) {
> > +                       bad_page(page, "partially mapped folio on deferred list");
> >                         goto out;
> >                 }
> >                 break;
> > diff --git a/mm/rmap.c b/mm/rmap.c
> > index a6b9cd0b2b18..4c330635aa4e 100644
> > --- a/mm/rmap.c
> > +++ b/mm/rmap.c
> > @@ -1578,8 +1578,9 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
> >          * Check partially_mapped first to ensure it is a large folio.
> >          */
> >         if (partially_mapped && folio_test_anon(folio) &&
> > -           list_empty(&folio->_deferred_list))
> > -               deferred_split_folio(folio);
> > +           !folio_test_partially_mapped(folio))
> > +               deferred_split_folio(folio, true);
> > +
> >         __folio_mod_stat(folio, -nr, -nr_pmdmapped);
> >
> >         /*
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 25e43bb3b574..25f4e8403f41 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -1233,7 +1233,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
> >                                          * Split partially mapped folios right away.
> >                                          * We can free the unmapped pages without IO.
> >                                          */
> > -                                       if (data_race(!list_empty(&folio->_deferred_list)) &&
> > +                                       if (data_race(!list_empty(&folio->_deferred_list) &&
> > +                                           folio_test_partially_mapped(folio)) &&
> >                                             split_folio_to_list(folio, folio_list))
> >                                                 goto activate_locked;
> >                                 }
> > --
> > 2.43.5
> >
>
> Thanks
> Barry
Usama Arif Aug. 19, 2024, 2:17 p.m. UTC | #3
On 19/08/2024 09:29, Barry Song wrote:
> Hi Usama,
> 
> I feel it is much better now! thanks!
> 
> On Mon, Aug 19, 2024 at 2:31 PM Usama Arif <usamaarif642@gmail.com> wrote:
>>
>> Currently folio->_deferred_list is used to keep track of
>> partially_mapped folios that are going to be split under memory
>> pressure. In the next patch, all THPs that are faulted in and collapsed
>> by khugepaged are also going to be tracked using _deferred_list.
>>
>> This patch introduces a pageflag to be able to distinguish between
>> partially mapped folios and others in the deferred_list at split time in
>> deferred_split_scan. Its needed as __folio_remove_rmap decrements
>> _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
>> possible to distinguish between partially mapped folios and others in
>> deferred_split_scan.
>>
>> Eventhough it introduces an extra flag to track if the folio is
>> partially mapped, there is no functional change intended with this
>> patch and the flag is not useful in this patch itself, it will
>> become useful in the next patch when _deferred_list has non partially
>> mapped folios.
>>
>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
>> ---
>>  include/linux/huge_mm.h    |  4 ++--
>>  include/linux/page-flags.h | 11 +++++++++++
>>  mm/huge_memory.c           | 23 ++++++++++++++++-------
>>  mm/internal.h              |  4 +++-
>>  mm/memcontrol.c            |  3 ++-
>>  mm/migrate.c               |  3 ++-
>>  mm/page_alloc.c            |  5 +++--
>>  mm/rmap.c                  |  5 +++--
>>  mm/vmscan.c                |  3 ++-
>>  9 files changed, 44 insertions(+), 17 deletions(-)
>>
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index 4c32058cacfe..969f11f360d2 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
>>  {
>>         return split_huge_page_to_list_to_order(page, NULL, 0);
>>  }
>> -void deferred_split_folio(struct folio *folio);
>> +void deferred_split_folio(struct folio *folio, bool partially_mapped);
>>
>>  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>>                 unsigned long address, bool freeze, struct folio *folio);
>> @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
>>  {
>>         return 0;
>>  }
>> -static inline void deferred_split_folio(struct folio *folio) {}
>> +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
>>  #define split_huge_pmd(__vma, __pmd, __address)        \
>>         do { } while (0)
>>
>> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
>> index a0a29bd092f8..c3bb0e0da581 100644
>> --- a/include/linux/page-flags.h
>> +++ b/include/linux/page-flags.h
>> @@ -182,6 +182,7 @@ enum pageflags {
>>         /* At least one page in this folio has the hwpoison flag set */
>>         PG_has_hwpoisoned = PG_active,
>>         PG_large_rmappable = PG_workingset, /* anon or file-backed */
>> +       PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */
>>  };
>>
>>  #define PAGEFLAGS_MASK         ((1UL << NR_PAGEFLAGS) - 1)
>> @@ -861,8 +862,18 @@ static inline void ClearPageCompound(struct page *page)
>>         ClearPageHead(page);
>>  }
>>  FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
>> +FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
>> +/*
>> + * PG_partially_mapped is protected by deferred_split split_queue_lock,
>> + * so its safe to use non-atomic set/clear.
>> + */
>> +__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
>> +__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
>>  #else
>>  FOLIO_FLAG_FALSE(large_rmappable)
>> +FOLIO_TEST_FLAG_FALSE(partially_mapped)
>> +__FOLIO_SET_FLAG_NOOP(partially_mapped)
>> +__FOLIO_CLEAR_FLAG_NOOP(partially_mapped)
>>  #endif
>>
>>  #define PG_head_mask ((1UL << PG_head))
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 2d77b5d2291e..70ee49dfeaad 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -3398,6 +3398,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>                          * page_deferred_list.
>>                          */
>>                         list_del_init(&folio->_deferred_list);
>> +                       __folio_clear_partially_mapped(folio);
>>                 }
>>                 spin_unlock(&ds_queue->split_queue_lock);
>>                 if (mapping) {
>> @@ -3454,11 +3455,13 @@ void __folio_undo_large_rmappable(struct folio *folio)
>>         if (!list_empty(&folio->_deferred_list)) {
>>                 ds_queue->split_queue_len--;
>>                 list_del_init(&folio->_deferred_list);
>> +               __folio_clear_partially_mapped(folio);
> 
> is it possible to make things clearer by
> 
>  if (folio_clear_partially_mapped)
>     __folio_clear_partially_mapped(folio);
> 
> While writing without conditions isn't necessarily wrong, adding a condition
> will improve the readability of the code and enhance the clarity of my mTHP
> counters series. also help decrease smp cache sync if we can avoid
> unnecessary writing?
> 

Do you mean if (folio_test_partially_mapped(folio))?

I don't like this idea; I think it makes the readability worse. If I were looking at if (test) -> clear for the first time, I would be confused about why it's being tested if it's going to be cleared at the end anyway.


>>         }
>>         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>>  }
>>
>> -void deferred_split_folio(struct folio *folio)
>> +/* partially_mapped=false won't clear PG_partially_mapped folio flag */
>> +void deferred_split_folio(struct folio *folio, bool partially_mapped)
>>  {
>>         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
>>  #ifdef CONFIG_MEMCG
>> @@ -3486,14 +3489,19 @@ void deferred_split_folio(struct folio *folio)
>>         if (folio_test_swapcache(folio))
>>                 return;
>>
>> -       if (!list_empty(&folio->_deferred_list))
>> -               return;
>> -
>>         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>> +       if (partially_mapped) {
>> +               if (!folio_test_partially_mapped(folio)) {
>> +                       __folio_set_partially_mapped(folio);
>> +                       if (folio_test_pmd_mappable(folio))
>> +                               count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>> +                       count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
>> +               }
>> +       } else {
>> +               /* partially mapped folios cannot become non-partially mapped */
>> +               VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
>> +       }
>>         if (list_empty(&folio->_deferred_list)) {
>> -               if (folio_test_pmd_mappable(folio))
>> -                       count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>> -               count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
>>                 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
>>                 ds_queue->split_queue_len++;
>>  #ifdef CONFIG_MEMCG
>> @@ -3542,6 +3550,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>>                 } else {
>>                         /* We lost race with folio_put() */
>>                         list_del_init(&folio->_deferred_list);
>> +                       __folio_clear_partially_mapped(folio);
> 
> as above? Do we also need if(test) for split_huge_page_to_list_to_order()?
> 
>>                         ds_queue->split_queue_len--;
>>                 }
>>                 if (!--sc->nr_to_scan)
>> diff --git a/mm/internal.h b/mm/internal.h
>> index 52f7fc4e8ac3..27cbb5365841 100644
>> --- a/mm/internal.h
>> +++ b/mm/internal.h
>> @@ -662,8 +662,10 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
>>         atomic_set(&folio->_entire_mapcount, -1);
>>         atomic_set(&folio->_nr_pages_mapped, 0);
>>         atomic_set(&folio->_pincount, 0);
>> -       if (order > 1)
>> +       if (order > 1) {
>>                 INIT_LIST_HEAD(&folio->_deferred_list);
>> +               __folio_clear_partially_mapped(folio);
> 
> if partially_mapped is true for a new folio, does it mean we already have
> a bug somewhere?
> 
> How is it possible for a new folio to be partially mapped?
> 

It's not; I did it because I wanted to make it explicit that the folio is being initialized, similar to how INIT_LIST_HEAD(&folio->_deferred_list) is already done here.

There is no functional issue in removing it, because I believe the flag is initialized to false from the start.
>> +       }
>>  }
>>
>>  static inline void prep_compound_tail(struct page *head, int tail_idx)
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index e1ffd2950393..0fd95daecf9a 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -4669,7 +4669,8 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
>>         VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
>>         VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
>>                         !folio_test_hugetlb(folio) &&
>> -                       !list_empty(&folio->_deferred_list), folio);
>> +                       !list_empty(&folio->_deferred_list) &&
>> +                       folio_test_partially_mapped(folio), folio);
>>
>>         /*
>>          * Nobody should be changing or seriously looking at
>> diff --git a/mm/migrate.c b/mm/migrate.c
>> index 2d2e65d69427..ef4a732f22b1 100644
>> --- a/mm/migrate.c
>> +++ b/mm/migrate.c
>> @@ -1735,7 +1735,8 @@ static int migrate_pages_batch(struct list_head *from,
>>                          * use _deferred_list.
>>                          */
>>                         if (nr_pages > 2 &&
>> -                          !list_empty(&folio->_deferred_list)) {
>> +                          !list_empty(&folio->_deferred_list) &&
>> +                          folio_test_partially_mapped(folio)) {
>>                                 if (!try_split_folio(folio, split_folios, mode)) {
>>                                         nr_failed++;
>>                                         stats->nr_thp_failed += is_thp;
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 408ef3d25cf5..a145c550dd2a 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -957,8 +957,9 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
>>                 break;
>>         case 2:
>>                 /* the second tail page: deferred_list overlaps ->mapping */
>> -               if (unlikely(!list_empty(&folio->_deferred_list))) {
>> -                       bad_page(page, "on deferred list");
>> +               if (unlikely(!list_empty(&folio->_deferred_list) &&
>> +                   folio_test_partially_mapped(folio))) {
>> +                       bad_page(page, "partially mapped folio on deferred list");
>>                         goto out;
>>                 }
>>                 break;
>> diff --git a/mm/rmap.c b/mm/rmap.c
>> index a6b9cd0b2b18..4c330635aa4e 100644
>> --- a/mm/rmap.c
>> +++ b/mm/rmap.c
>> @@ -1578,8 +1578,9 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
>>          * Check partially_mapped first to ensure it is a large folio.
>>          */
>>         if (partially_mapped && folio_test_anon(folio) &&
>> -           list_empty(&folio->_deferred_list))
>> -               deferred_split_folio(folio);
>> +           !folio_test_partially_mapped(folio))
>> +               deferred_split_folio(folio, true);
>> +
>>         __folio_mod_stat(folio, -nr, -nr_pmdmapped);
>>
>>         /*
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index 25e43bb3b574..25f4e8403f41 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -1233,7 +1233,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
>>                                          * Split partially mapped folios right away.
>>                                          * We can free the unmapped pages without IO.
>>                                          */
>> -                                       if (data_race(!list_empty(&folio->_deferred_list)) &&
>> +                                       if (data_race(!list_empty(&folio->_deferred_list) &&
>> +                                           folio_test_partially_mapped(folio)) &&
>>                                             split_folio_to_list(folio, folio_list))
>>                                                 goto activate_locked;
>>                                 }
>> --
>> 2.43.5
>>
> 
> Thanks
> Barry
Barry Song Aug. 19, 2024, 7 p.m. UTC | #4
On Tue, Aug 20, 2024 at 2:17 AM Usama Arif <usamaarif642@gmail.com> wrote:
>
>
>
> On 19/08/2024 09:29, Barry Song wrote:
> > Hi Usama,
> >
> > I feel it is much better now! thanks!
> >
> > On Mon, Aug 19, 2024 at 2:31 PM Usama Arif <usamaarif642@gmail.com> wrote:
> >>
> >> Currently folio->_deferred_list is used to keep track of
> >> partially_mapped folios that are going to be split under memory
> >> pressure. In the next patch, all THPs that are faulted in and collapsed
> >> by khugepaged are also going to be tracked using _deferred_list.
> >>
> >> This patch introduces a pageflag to be able to distinguish between
> >> partially mapped folios and others in the deferred_list at split time in
> >> deferred_split_scan. Its needed as __folio_remove_rmap decrements
> >> _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
> >> possible to distinguish between partially mapped folios and others in
> >> deferred_split_scan.
> >>
> >> Eventhough it introduces an extra flag to track if the folio is
> >> partially mapped, there is no functional change intended with this
> >> patch and the flag is not useful in this patch itself, it will
> >> become useful in the next patch when _deferred_list has non partially
> >> mapped folios.
> >>
> >> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> >> ---
> >>  include/linux/huge_mm.h    |  4 ++--
> >>  include/linux/page-flags.h | 11 +++++++++++
> >>  mm/huge_memory.c           | 23 ++++++++++++++++-------
> >>  mm/internal.h              |  4 +++-
> >>  mm/memcontrol.c            |  3 ++-
> >>  mm/migrate.c               |  3 ++-
> >>  mm/page_alloc.c            |  5 +++--
> >>  mm/rmap.c                  |  5 +++--
> >>  mm/vmscan.c                |  3 ++-
> >>  9 files changed, 44 insertions(+), 17 deletions(-)
> >>
> >> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >> index 4c32058cacfe..969f11f360d2 100644
> >> --- a/include/linux/huge_mm.h
> >> +++ b/include/linux/huge_mm.h
> >> @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
> >>  {
> >>         return split_huge_page_to_list_to_order(page, NULL, 0);
> >>  }
> >> -void deferred_split_folio(struct folio *folio);
> >> +void deferred_split_folio(struct folio *folio, bool partially_mapped);
> >>
> >>  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> >>                 unsigned long address, bool freeze, struct folio *folio);
> >> @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
> >>  {
> >>         return 0;
> >>  }
> >> -static inline void deferred_split_folio(struct folio *folio) {}
> >> +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
> >>  #define split_huge_pmd(__vma, __pmd, __address)        \
> >>         do { } while (0)
> >>
> >> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> >> index a0a29bd092f8..c3bb0e0da581 100644
> >> --- a/include/linux/page-flags.h
> >> +++ b/include/linux/page-flags.h
> >> @@ -182,6 +182,7 @@ enum pageflags {
> >>         /* At least one page in this folio has the hwpoison flag set */
> >>         PG_has_hwpoisoned = PG_active,
> >>         PG_large_rmappable = PG_workingset, /* anon or file-backed */
> >> +       PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */
> >>  };
> >>
> >>  #define PAGEFLAGS_MASK         ((1UL << NR_PAGEFLAGS) - 1)
> >> @@ -861,8 +862,18 @@ static inline void ClearPageCompound(struct page *page)
> >>         ClearPageHead(page);
> >>  }
> >>  FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
> >> +FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> >> +/*
> >> + * PG_partially_mapped is protected by deferred_split split_queue_lock,
> >> + * so its safe to use non-atomic set/clear.
> >> + */
> >> +__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> >> +__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> >>  #else
> >>  FOLIO_FLAG_FALSE(large_rmappable)
> >> +FOLIO_TEST_FLAG_FALSE(partially_mapped)
> >> +__FOLIO_SET_FLAG_NOOP(partially_mapped)
> >> +__FOLIO_CLEAR_FLAG_NOOP(partially_mapped)
> >>  #endif
> >>
> >>  #define PG_head_mask ((1UL << PG_head))
> >> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >> index 2d77b5d2291e..70ee49dfeaad 100644
> >> --- a/mm/huge_memory.c
> >> +++ b/mm/huge_memory.c
> >> @@ -3398,6 +3398,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> >>                          * page_deferred_list.
> >>                          */
> >>                         list_del_init(&folio->_deferred_list);
> >> +                       __folio_clear_partially_mapped(folio);
> >>                 }
> >>                 spin_unlock(&ds_queue->split_queue_lock);
> >>                 if (mapping) {
> >> @@ -3454,11 +3455,13 @@ void __folio_undo_large_rmappable(struct folio *folio)
> >>         if (!list_empty(&folio->_deferred_list)) {
> >>                 ds_queue->split_queue_len--;
> >>                 list_del_init(&folio->_deferred_list);
> >> +               __folio_clear_partially_mapped(folio);
> >
> > is it possible to make things clearer by
> >
> >  if (folio_clear_partially_mapped)
> >     __folio_clear_partially_mapped(folio);
> >
> > While writing without conditions isn't necessarily wrong, adding a condition
> > will improve the readability of the code and enhance the clarity of my mTHP
> > counters series. also help decrease smp cache sync if we can avoid
> > unnecessary writing?
> >
>
> Do you mean if(folio_test_partially_mapped(folio))?
>
> I don't like this idea. I think it makes the readability worse? If I was looking at if (test) -> clear for the first time, I would become confused why its being tested if its going to be clear at the end anyways?

In the pmd-order case, the majority of folios are not partially mapped.
Unconditional writes will trigger cache synchronization across all
CPUs (related to the MESI protocol), making them more costly. By
using conditional writes, such as "if(test) write," we can avoid
most unnecessary writes, which is much more efficient. Additionally,
we only need to manage nr_split_deferred when the condition
is met. We are carefully evaluating all scenarios to determine
if modifications to the partially_mapped flag are necessary.

>
>
> >>         }
> >>         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> >>  }
> >>
> >> -void deferred_split_folio(struct folio *folio)
> >> +/* partially_mapped=false won't clear PG_partially_mapped folio flag */
> >> +void deferred_split_folio(struct folio *folio, bool partially_mapped)
> >>  {
> >>         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
> >>  #ifdef CONFIG_MEMCG
> >> @@ -3486,14 +3489,19 @@ void deferred_split_folio(struct folio *folio)
> >>         if (folio_test_swapcache(folio))
> >>                 return;
> >>
> >> -       if (!list_empty(&folio->_deferred_list))
> >> -               return;
> >> -
> >>         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> >> +       if (partially_mapped) {
> >> +               if (!folio_test_partially_mapped(folio)) {
> >> +                       __folio_set_partially_mapped(folio);
> >> +                       if (folio_test_pmd_mappable(folio))
> >> +                               count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> >> +                       count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> >> +               }
> >> +       } else {
> >> +               /* partially mapped folios cannot become non-partially mapped */
> >> +               VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
> >> +       }
> >>         if (list_empty(&folio->_deferred_list)) {
> >> -               if (folio_test_pmd_mappable(folio))
> >> -                       count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> >> -               count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> >>                 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
> >>                 ds_queue->split_queue_len++;
> >>  #ifdef CONFIG_MEMCG
> >> @@ -3542,6 +3550,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
> >>                 } else {
> >>                         /* We lost race with folio_put() */
> >>                         list_del_init(&folio->_deferred_list);
> >> +                       __folio_clear_partially_mapped(folio);
> >
> > as above? Do we also need if(test) for split_huge_page_to_list_to_order()?
> >
> >>                         ds_queue->split_queue_len--;
> >>                 }
> >>                 if (!--sc->nr_to_scan)
> >> diff --git a/mm/internal.h b/mm/internal.h
> >> index 52f7fc4e8ac3..27cbb5365841 100644
> >> --- a/mm/internal.h
> >> +++ b/mm/internal.h
> >> @@ -662,8 +662,10 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
> >>         atomic_set(&folio->_entire_mapcount, -1);
> >>         atomic_set(&folio->_nr_pages_mapped, 0);
> >>         atomic_set(&folio->_pincount, 0);
> >> -       if (order > 1)
> >> +       if (order > 1) {
> >>                 INIT_LIST_HEAD(&folio->_deferred_list);
> >> +               __folio_clear_partially_mapped(folio);
> >
> > if partially_mapped is true for a new folio, does it mean we already have
> > a bug somewhere?
> >
> > How is it possible for a new folio to be partially mapped?
> >
>
> Its not, I did it because I wanted to make it explicit that the folio is being initialized, similar to how before this INIT_LIST_HEAD(&folio->_deferred_list) is done here.
>
> There is no functional issue in removing it here, because I believe the flag is initialized to false from start.
> >> +       }
> >>  }
> >>
> >>  static inline void prep_compound_tail(struct page *head, int tail_idx)
> >> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> >> index e1ffd2950393..0fd95daecf9a 100644
> >> --- a/mm/memcontrol.c
> >> +++ b/mm/memcontrol.c
> >> @@ -4669,7 +4669,8 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
> >>         VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
> >>         VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
> >>                         !folio_test_hugetlb(folio) &&
> >> -                       !list_empty(&folio->_deferred_list), folio);
> >> +                       !list_empty(&folio->_deferred_list) &&
> >> +                       folio_test_partially_mapped(folio), folio);
> >>
> >>         /*
> >>          * Nobody should be changing or seriously looking at
> >> diff --git a/mm/migrate.c b/mm/migrate.c
> >> index 2d2e65d69427..ef4a732f22b1 100644
> >> --- a/mm/migrate.c
> >> +++ b/mm/migrate.c
> >> @@ -1735,7 +1735,8 @@ static int migrate_pages_batch(struct list_head *from,
> >>                          * use _deferred_list.
> >>                          */
> >>                         if (nr_pages > 2 &&
> >> -                          !list_empty(&folio->_deferred_list)) {
> >> +                          !list_empty(&folio->_deferred_list) &&
> >> +                          folio_test_partially_mapped(folio)) {
> >>                                 if (!try_split_folio(folio, split_folios, mode)) {
> >>                                         nr_failed++;
> >>                                         stats->nr_thp_failed += is_thp;
> >> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> >> index 408ef3d25cf5..a145c550dd2a 100644
> >> --- a/mm/page_alloc.c
> >> +++ b/mm/page_alloc.c
> >> @@ -957,8 +957,9 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
> >>                 break;
> >>         case 2:
> >>                 /* the second tail page: deferred_list overlaps ->mapping */
> >> -               if (unlikely(!list_empty(&folio->_deferred_list))) {
> >> -                       bad_page(page, "on deferred list");
> >> +               if (unlikely(!list_empty(&folio->_deferred_list) &&
> >> +                   folio_test_partially_mapped(folio))) {
> >> +                       bad_page(page, "partially mapped folio on deferred list");
> >>                         goto out;
> >>                 }
> >>                 break;
> >> diff --git a/mm/rmap.c b/mm/rmap.c
> >> index a6b9cd0b2b18..4c330635aa4e 100644
> >> --- a/mm/rmap.c
> >> +++ b/mm/rmap.c
> >> @@ -1578,8 +1578,9 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
> >>          * Check partially_mapped first to ensure it is a large folio.
> >>          */
> >>         if (partially_mapped && folio_test_anon(folio) &&
> >> -           list_empty(&folio->_deferred_list))
> >> -               deferred_split_folio(folio);
> >> +           !folio_test_partially_mapped(folio))
> >> +               deferred_split_folio(folio, true);
> >> +
> >>         __folio_mod_stat(folio, -nr, -nr_pmdmapped);
> >>
> >>         /*
> >> diff --git a/mm/vmscan.c b/mm/vmscan.c
> >> index 25e43bb3b574..25f4e8403f41 100644
> >> --- a/mm/vmscan.c
> >> +++ b/mm/vmscan.c
> >> @@ -1233,7 +1233,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
> >>                                          * Split partially mapped folios right away.
> >>                                          * We can free the unmapped pages without IO.
> >>                                          */
> >> -                                       if (data_race(!list_empty(&folio->_deferred_list)) &&
> >> +                                       if (data_race(!list_empty(&folio->_deferred_list) &&
> >> +                                           folio_test_partially_mapped(folio)) &&
> >>                                             split_folio_to_list(folio, folio_list))
> >>                                                 goto activate_locked;
> >>                                 }
> >> --
> >> 2.43.5
> >>
> >
> > Thanks
> > Barry
Usama Arif Aug. 19, 2024, 8:16 p.m. UTC | #5
On 19/08/2024 20:00, Barry Song wrote:
> On Tue, Aug 20, 2024 at 2:17 AM Usama Arif <usamaarif642@gmail.com> wrote:
>>
>>
>>
>> On 19/08/2024 09:29, Barry Song wrote:
>>> Hi Usama,
>>>
>>> I feel it is much better now! thanks!
>>>
>>> On Mon, Aug 19, 2024 at 2:31 PM Usama Arif <usamaarif642@gmail.com> wrote:
>>>>
>>>> Currently folio->_deferred_list is used to keep track of
>>>> partially_mapped folios that are going to be split under memory
>>>> pressure. In the next patch, all THPs that are faulted in and collapsed
>>>> by khugepaged are also going to be tracked using _deferred_list.
>>>>
>>>> This patch introduces a pageflag to be able to distinguish between
>>>> partially mapped folios and others in the deferred_list at split time in
>>>> deferred_split_scan. Its needed as __folio_remove_rmap decrements
>>>> _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
>>>> possible to distinguish between partially mapped folios and others in
>>>> deferred_split_scan.
>>>>
>>>> Eventhough it introduces an extra flag to track if the folio is
>>>> partially mapped, there is no functional change intended with this
>>>> patch and the flag is not useful in this patch itself, it will
>>>> become useful in the next patch when _deferred_list has non partially
>>>> mapped folios.
>>>>
>>>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
>>>> ---
>>>>  include/linux/huge_mm.h    |  4 ++--
>>>>  include/linux/page-flags.h | 11 +++++++++++
>>>>  mm/huge_memory.c           | 23 ++++++++++++++++-------
>>>>  mm/internal.h              |  4 +++-
>>>>  mm/memcontrol.c            |  3 ++-
>>>>  mm/migrate.c               |  3 ++-
>>>>  mm/page_alloc.c            |  5 +++--
>>>>  mm/rmap.c                  |  5 +++--
>>>>  mm/vmscan.c                |  3 ++-
>>>>  9 files changed, 44 insertions(+), 17 deletions(-)
>>>>
>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>> index 4c32058cacfe..969f11f360d2 100644
>>>> --- a/include/linux/huge_mm.h
>>>> +++ b/include/linux/huge_mm.h
>>>> @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
>>>>  {
>>>>         return split_huge_page_to_list_to_order(page, NULL, 0);
>>>>  }
>>>> -void deferred_split_folio(struct folio *folio);
>>>> +void deferred_split_folio(struct folio *folio, bool partially_mapped);
>>>>
>>>>  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>>>>                 unsigned long address, bool freeze, struct folio *folio);
>>>> @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
>>>>  {
>>>>         return 0;
>>>>  }
>>>> -static inline void deferred_split_folio(struct folio *folio) {}
>>>> +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
>>>>  #define split_huge_pmd(__vma, __pmd, __address)        \
>>>>         do { } while (0)
>>>>
>>>> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
>>>> index a0a29bd092f8..c3bb0e0da581 100644
>>>> --- a/include/linux/page-flags.h
>>>> +++ b/include/linux/page-flags.h
>>>> @@ -182,6 +182,7 @@ enum pageflags {
>>>>         /* At least one page in this folio has the hwpoison flag set */
>>>>         PG_has_hwpoisoned = PG_active,
>>>>         PG_large_rmappable = PG_workingset, /* anon or file-backed */
>>>> +       PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */
>>>>  };
>>>>
>>>>  #define PAGEFLAGS_MASK         ((1UL << NR_PAGEFLAGS) - 1)
>>>> @@ -861,8 +862,18 @@ static inline void ClearPageCompound(struct page *page)
>>>>         ClearPageHead(page);
>>>>  }
>>>>  FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
>>>> +FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
>>>> +/*
>>>> + * PG_partially_mapped is protected by deferred_split split_queue_lock,
>>>> + * so its safe to use non-atomic set/clear.
>>>> + */
>>>> +__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
>>>> +__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
>>>>  #else
>>>>  FOLIO_FLAG_FALSE(large_rmappable)
>>>> +FOLIO_TEST_FLAG_FALSE(partially_mapped)
>>>> +__FOLIO_SET_FLAG_NOOP(partially_mapped)
>>>> +__FOLIO_CLEAR_FLAG_NOOP(partially_mapped)
>>>>  #endif
>>>>
>>>>  #define PG_head_mask ((1UL << PG_head))
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index 2d77b5d2291e..70ee49dfeaad 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -3398,6 +3398,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>>>                          * page_deferred_list.
>>>>                          */
>>>>                         list_del_init(&folio->_deferred_list);
>>>> +                       __folio_clear_partially_mapped(folio);
>>>>                 }
>>>>                 spin_unlock(&ds_queue->split_queue_lock);
>>>>                 if (mapping) {
>>>> @@ -3454,11 +3455,13 @@ void __folio_undo_large_rmappable(struct folio *folio)
>>>>         if (!list_empty(&folio->_deferred_list)) {
>>>>                 ds_queue->split_queue_len--;
>>>>                 list_del_init(&folio->_deferred_list);
>>>> +               __folio_clear_partially_mapped(folio);
>>>
>>> is it possible to make things clearer by
>>>
>>>  if (folio_clear_partially_mapped)
>>>     __folio_clear_partially_mapped(folio);
>>>
>>> While writing without conditions isn't necessarily wrong, adding a condition
>>> will improve the readability of the code and enhance the clarity of my mTHP
>>> counters series. also help decrease smp cache sync if we can avoid
>>> unnecessary writing?
>>>
>>
>> Do you mean if(folio_test_partially_mapped(folio))?
>>
>> I don't like this idea. I think it makes the readability worse? If I was looking at if (test) -> clear for the first time, I would become confused why its being tested if its going to be clear at the end anyways?
> 
> In the pmd-order case, the majority of folios are not partially mapped.
> Unconditional writes will trigger cache synchronization across all
> CPUs (related to the MESI protocol), making them more costly. By
> using conditional writes, such as "if(test) write," we can avoid
> most unnecessary writes, which is much more efficient. Additionally,
> we only need to manage nr_split_deferred when the condition
> is met. We are carefully evaluating all scenarios to determine
> if modifications to the partially_mapped flag are necessary.
> 


Hmm okay, as you said it's needed for nr_split_deferred anyway. Is something like the below OK to fold in?

commit 4ae9e2067346effd902b342296987b97dee29018 (HEAD)
Author: Usama Arif <usamaarif642@gmail.com>
Date:   Mon Aug 19 21:07:16 2024 +0100

    mm: Introduce a pageflag for partially mapped folios fix
    
    Test partially_mapped flag before clearing it. This should
    avoid unnecessary writes and will be needed in the nr_split_deferred
    series.
    
    Signed-off-by: Usama Arif <usamaarif642@gmail.com>

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5d67d3b3c1b2..ccde60aaaa0f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3479,7 +3479,8 @@ void __folio_undo_large_rmappable(struct folio *folio)
        if (!list_empty(&folio->_deferred_list)) {
                ds_queue->split_queue_len--;
                list_del_init(&folio->_deferred_list);
-               __folio_clear_partially_mapped(folio);
+               if (folio_test_partially_mapped(folio))
+                       __folio_clear_partially_mapped(folio);
        }
        spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
 }
@@ -3610,7 +3611,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
                } else {
                        /* We lost race with folio_put() */
                        list_del_init(&folio->_deferred_list);
-                       __folio_clear_partially_mapped(folio);
+                       if (folio_test_partially_mapped(folio))
+                               __folio_clear_partially_mapped(folio);
                        ds_queue->split_queue_len--;
                }
                if (!--sc->nr_to_scan)
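
(As an aside, a minimal generic sketch of the test-before-write pattern being
discussed, to illustrate why it is cheaper when the bit is rarely set. This is
not kernel code and the helper name is made up:)

/*
 * 'flags' lives on a cache line that many CPUs read. An unconditional
 * clear always dirties the line and forces the other CPUs to re-fetch
 * it; testing first keeps the common case (bit not set) read-only.
 */
static inline void clear_bit_if_set(unsigned long *flags, unsigned long bit)
{
	if (*flags & bit)	/* read-only in the common case */
		*flags &= ~bit;	/* write only when the bit is actually set */
}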
Barry Song Aug. 19, 2024, 9:34 p.m. UTC | #6
On Tue, Aug 20, 2024 at 8:16 AM Usama Arif <usamaarif642@gmail.com> wrote:
>
>
>
> On 19/08/2024 20:00, Barry Song wrote:
> > On Tue, Aug 20, 2024 at 2:17 AM Usama Arif <usamaarif642@gmail.com> wrote:
> >>
> >>
> >>
> >> On 19/08/2024 09:29, Barry Song wrote:
> >>> Hi Usama,
> >>>
> >>> I feel it is much better now! thanks!
> >>>
> >>> On Mon, Aug 19, 2024 at 2:31 PM Usama Arif <usamaarif642@gmail.com> wrote:
> >>>>
> >>>> Currently folio->_deferred_list is used to keep track of
> >>>> partially_mapped folios that are going to be split under memory
> >>>> pressure. In the next patch, all THPs that are faulted in and collapsed
> >>>> by khugepaged are also going to be tracked using _deferred_list.
> >>>>
> >>>> This patch introduces a pageflag to be able to distinguish between
> >>>> partially mapped folios and others in the deferred_list at split time in
> >>>> deferred_split_scan. Its needed as __folio_remove_rmap decrements
> >>>> _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
> >>>> possible to distinguish between partially mapped folios and others in
> >>>> deferred_split_scan.
> >>>>
> >>>> Eventhough it introduces an extra flag to track if the folio is
> >>>> partially mapped, there is no functional change intended with this
> >>>> patch and the flag is not useful in this patch itself, it will
> >>>> become useful in the next patch when _deferred_list has non partially
> >>>> mapped folios.
> >>>>
> >>>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> >>>> ---
> >>>>  include/linux/huge_mm.h    |  4 ++--
> >>>>  include/linux/page-flags.h | 11 +++++++++++
> >>>>  mm/huge_memory.c           | 23 ++++++++++++++++-------
> >>>>  mm/internal.h              |  4 +++-
> >>>>  mm/memcontrol.c            |  3 ++-
> >>>>  mm/migrate.c               |  3 ++-
> >>>>  mm/page_alloc.c            |  5 +++--
> >>>>  mm/rmap.c                  |  5 +++--
> >>>>  mm/vmscan.c                |  3 ++-
> >>>>  9 files changed, 44 insertions(+), 17 deletions(-)
> >>>>
> >>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >>>> index 4c32058cacfe..969f11f360d2 100644
> >>>> --- a/include/linux/huge_mm.h
> >>>> +++ b/include/linux/huge_mm.h
> >>>> @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
> >>>>  {
> >>>>         return split_huge_page_to_list_to_order(page, NULL, 0);
> >>>>  }
> >>>> -void deferred_split_folio(struct folio *folio);
> >>>> +void deferred_split_folio(struct folio *folio, bool partially_mapped);
> >>>>
> >>>>  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> >>>>                 unsigned long address, bool freeze, struct folio *folio);
> >>>> @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
> >>>>  {
> >>>>         return 0;
> >>>>  }
> >>>> -static inline void deferred_split_folio(struct folio *folio) {}
> >>>> +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
> >>>>  #define split_huge_pmd(__vma, __pmd, __address)        \
> >>>>         do { } while (0)
> >>>>
> >>>> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> >>>> index a0a29bd092f8..c3bb0e0da581 100644
> >>>> --- a/include/linux/page-flags.h
> >>>> +++ b/include/linux/page-flags.h
> >>>> @@ -182,6 +182,7 @@ enum pageflags {
> >>>>         /* At least one page in this folio has the hwpoison flag set */
> >>>>         PG_has_hwpoisoned = PG_active,
> >>>>         PG_large_rmappable = PG_workingset, /* anon or file-backed */
> >>>> +       PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */
> >>>>  };
> >>>>
> >>>>  #define PAGEFLAGS_MASK         ((1UL << NR_PAGEFLAGS) - 1)
> >>>> @@ -861,8 +862,18 @@ static inline void ClearPageCompound(struct page *page)
> >>>>         ClearPageHead(page);
> >>>>  }
> >>>>  FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
> >>>> +FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> >>>> +/*
> >>>> + * PG_partially_mapped is protected by deferred_split split_queue_lock,
> >>>> + * so its safe to use non-atomic set/clear.
> >>>> + */
> >>>> +__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> >>>> +__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> >>>>  #else
> >>>>  FOLIO_FLAG_FALSE(large_rmappable)
> >>>> +FOLIO_TEST_FLAG_FALSE(partially_mapped)
> >>>> +__FOLIO_SET_FLAG_NOOP(partially_mapped)
> >>>> +__FOLIO_CLEAR_FLAG_NOOP(partially_mapped)
> >>>>  #endif
> >>>>
> >>>>  #define PG_head_mask ((1UL << PG_head))
> >>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >>>> index 2d77b5d2291e..70ee49dfeaad 100644
> >>>> --- a/mm/huge_memory.c
> >>>> +++ b/mm/huge_memory.c
> >>>> @@ -3398,6 +3398,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> >>>>                          * page_deferred_list.
> >>>>                          */
> >>>>                         list_del_init(&folio->_deferred_list);
> >>>> +                       __folio_clear_partially_mapped(folio);
> >>>>                 }
> >>>>                 spin_unlock(&ds_queue->split_queue_lock);
> >>>>                 if (mapping) {
> >>>> @@ -3454,11 +3455,13 @@ void __folio_undo_large_rmappable(struct folio *folio)
> >>>>         if (!list_empty(&folio->_deferred_list)) {
> >>>>                 ds_queue->split_queue_len--;
> >>>>                 list_del_init(&folio->_deferred_list);
> >>>> +               __folio_clear_partially_mapped(folio);
> >>>
> >>> is it possible to make things clearer by
> >>>
> >>>  if (folio_clear_partially_mapped)
> >>>     __folio_clear_partially_mapped(folio);
> >>>
> >>> While writing without conditions isn't necessarily wrong, adding a condition
> >>> will improve the readability of the code and enhance the clarity of my mTHP
> >>> counters series. also help decrease smp cache sync if we can avoid
> >>> unnecessary writing?
> >>>
> >>
> >> Do you mean if(folio_test_partially_mapped(folio))?
> >>
> >> I don't like this idea. I think it makes the readability worse? If I was looking at if (test) -> clear for the first time, I would become confused why its being tested if its going to be clear at the end anyways?
> >
> > In the pmd-order case, the majority of folios are not partially mapped.
> > Unconditional writes will trigger cache synchronization across all
> > CPUs (related to the MESI protocol), making them more costly. By
> > using conditional writes, such as "if(test) write," we can avoid
> > most unnecessary writes, which is much more efficient. Additionally,
> > we only need to manage nr_split_deferred when the condition
> > is met. We are carefully evaluating all scenarios to determine
> > if modifications to the partially_mapped flag are necessary.
> >
>
>
> Hmm okay, as you said its needed for nr_split_deferred anyways. Something like below is ok to fold in?
>
> commit 4ae9e2067346effd902b342296987b97dee29018 (HEAD)
> Author: Usama Arif <usamaarif642@gmail.com>
> Date:   Mon Aug 19 21:07:16 2024 +0100
>
>     mm: Introduce a pageflag for partially mapped folios fix
>
>     Test partially_mapped flag before clearing it. This should
>     avoid unnecessary writes and will be needed in the nr_split_deferred
>     series.
>
>     Signed-off-by: Usama Arif <usamaarif642@gmail.com>
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 5d67d3b3c1b2..ccde60aaaa0f 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3479,7 +3479,8 @@ void __folio_undo_large_rmappable(struct folio *folio)
>         if (!list_empty(&folio->_deferred_list)) {
>                 ds_queue->split_queue_len--;
>                 list_del_init(&folio->_deferred_list);
> -               __folio_clear_partially_mapped(folio);
> +               if (folio_test_partially_mapped(folio))
> +                       __folio_clear_partially_mapped(folio);
>         }
>         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>  }
> @@ -3610,7 +3611,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>                 } else {
>                         /* We lost race with folio_put() */
>                         list_del_init(&folio->_deferred_list);
> -                       __folio_clear_partially_mapped(folio);
> +                       if (folio_test_partially_mapped(folio))
> +                               __folio_clear_partially_mapped(folio);
>                         ds_queue->split_queue_len--;
>                 }
>                 if (!--sc->nr_to_scan)
>

Do we also need if (folio_test_partially_mapped(folio)) in
split_huge_page_to_list_to_order()?

I recall that in Yu Zhao's TAO, there’s a chance of splitting (shattering)
non-partially-mapped folios. To be future-proof, we might want to handle
both cases equally.

By the way, we might not need to clear the flag for a new folio. This differs
from initialising the _deferred_list, which is necessary. If a new folio has the
partially_mapped flag set, it indicates that we failed to clear it when freeing
the folio to the buddy system, which is a bug we need to fix in the free path.
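
(As an illustration only, the kind of free-path assertion being suggested could
look roughly like the sketch below; the helper and its placement are
hypothetical and not part of this series:)

/*
 * Hypothetical debug check on the folio free path: a large folio must not
 * reach the buddy allocator with PG_partially_mapped still set.
 */
static inline void check_partially_mapped_on_free(struct folio *folio)
{
	VM_WARN_ON_ONCE_FOLIO(folio_test_partially_mapped(folio), folio);
}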

Thanks
Barry
Barry Song Aug. 19, 2024, 9:55 p.m. UTC | #7
On Tue, Aug 20, 2024 at 9:34 AM Barry Song <baohua@kernel.org> wrote:
>
> On Tue, Aug 20, 2024 at 8:16 AM Usama Arif <usamaarif642@gmail.com> wrote:
> >
> >
> >
> > On 19/08/2024 20:00, Barry Song wrote:
> > > On Tue, Aug 20, 2024 at 2:17 AM Usama Arif <usamaarif642@gmail.com> wrote:
> > >>
> > >>
> > >>
> > >> On 19/08/2024 09:29, Barry Song wrote:
> > >>> Hi Usama,
> > >>>
> > >>> I feel it is much better now! thanks!
> > >>>
> > >>> On Mon, Aug 19, 2024 at 2:31 PM Usama Arif <usamaarif642@gmail.com> wrote:
> > >>>>
> > >>>> Currently folio->_deferred_list is used to keep track of
> > >>>> partially_mapped folios that are going to be split under memory
> > >>>> pressure. In the next patch, all THPs that are faulted in and collapsed
> > >>>> by khugepaged are also going to be tracked using _deferred_list.
> > >>>>
> > >>>> This patch introduces a pageflag to be able to distinguish between
> > >>>> partially mapped folios and others in the deferred_list at split time in
> > >>>> deferred_split_scan. Its needed as __folio_remove_rmap decrements
> > >>>> _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
> > >>>> possible to distinguish between partially mapped folios and others in
> > >>>> deferred_split_scan.
> > >>>>
> > >>>> Eventhough it introduces an extra flag to track if the folio is
> > >>>> partially mapped, there is no functional change intended with this
> > >>>> patch and the flag is not useful in this patch itself, it will
> > >>>> become useful in the next patch when _deferred_list has non partially
> > >>>> mapped folios.
> > >>>>
> > >>>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> > >>>> ---
> > >>>>  include/linux/huge_mm.h    |  4 ++--
> > >>>>  include/linux/page-flags.h | 11 +++++++++++
> > >>>>  mm/huge_memory.c           | 23 ++++++++++++++++-------
> > >>>>  mm/internal.h              |  4 +++-
> > >>>>  mm/memcontrol.c            |  3 ++-
> > >>>>  mm/migrate.c               |  3 ++-
> > >>>>  mm/page_alloc.c            |  5 +++--
> > >>>>  mm/rmap.c                  |  5 +++--
> > >>>>  mm/vmscan.c                |  3 ++-
> > >>>>  9 files changed, 44 insertions(+), 17 deletions(-)
> > >>>>
> > >>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > >>>> index 4c32058cacfe..969f11f360d2 100644
> > >>>> --- a/include/linux/huge_mm.h
> > >>>> +++ b/include/linux/huge_mm.h
> > >>>> @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
> > >>>>  {
> > >>>>         return split_huge_page_to_list_to_order(page, NULL, 0);
> > >>>>  }
> > >>>> -void deferred_split_folio(struct folio *folio);
> > >>>> +void deferred_split_folio(struct folio *folio, bool partially_mapped);
> > >>>>
> > >>>>  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> > >>>>                 unsigned long address, bool freeze, struct folio *folio);
> > >>>> @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
> > >>>>  {
> > >>>>         return 0;
> > >>>>  }
> > >>>> -static inline void deferred_split_folio(struct folio *folio) {}
> > >>>> +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
> > >>>>  #define split_huge_pmd(__vma, __pmd, __address)        \
> > >>>>         do { } while (0)
> > >>>>
> > >>>> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> > >>>> index a0a29bd092f8..c3bb0e0da581 100644
> > >>>> --- a/include/linux/page-flags.h
> > >>>> +++ b/include/linux/page-flags.h
> > >>>> @@ -182,6 +182,7 @@ enum pageflags {
> > >>>>         /* At least one page in this folio has the hwpoison flag set */
> > >>>>         PG_has_hwpoisoned = PG_active,
> > >>>>         PG_large_rmappable = PG_workingset, /* anon or file-backed */
> > >>>> +       PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */
> > >>>>  };
> > >>>>
> > >>>>  #define PAGEFLAGS_MASK         ((1UL << NR_PAGEFLAGS) - 1)
> > >>>> @@ -861,8 +862,18 @@ static inline void ClearPageCompound(struct page *page)
> > >>>>         ClearPageHead(page);
> > >>>>  }
> > >>>>  FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
> > >>>> +FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> > >>>> +/*
> > >>>> + * PG_partially_mapped is protected by deferred_split split_queue_lock,
> > >>>> + * so its safe to use non-atomic set/clear.
> > >>>> + */
> > >>>> +__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> > >>>> +__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> > >>>>  #else
> > >>>>  FOLIO_FLAG_FALSE(large_rmappable)
> > >>>> +FOLIO_TEST_FLAG_FALSE(partially_mapped)
> > >>>> +__FOLIO_SET_FLAG_NOOP(partially_mapped)
> > >>>> +__FOLIO_CLEAR_FLAG_NOOP(partially_mapped)
> > >>>>  #endif
> > >>>>
> > >>>>  #define PG_head_mask ((1UL << PG_head))
> > >>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > >>>> index 2d77b5d2291e..70ee49dfeaad 100644
> > >>>> --- a/mm/huge_memory.c
> > >>>> +++ b/mm/huge_memory.c
> > >>>> @@ -3398,6 +3398,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> > >>>>                          * page_deferred_list.
> > >>>>                          */
> > >>>>                         list_del_init(&folio->_deferred_list);
> > >>>> +                       __folio_clear_partially_mapped(folio);
> > >>>>                 }
> > >>>>                 spin_unlock(&ds_queue->split_queue_lock);
> > >>>>                 if (mapping) {
> > >>>> @@ -3454,11 +3455,13 @@ void __folio_undo_large_rmappable(struct folio *folio)
> > >>>>         if (!list_empty(&folio->_deferred_list)) {
> > >>>>                 ds_queue->split_queue_len--;
> > >>>>                 list_del_init(&folio->_deferred_list);
> > >>>> +               __folio_clear_partially_mapped(folio);
> > >>>
> > >>> is it possible to make things clearer by
> > >>>
> > >>>  if (folio_clear_partially_mapped)
> > >>>     __folio_clear_partially_mapped(folio);
> > >>>
> > >>> While writing without conditions isn't necessarily wrong, adding a condition
> > >>> will improve the readability of the code and enhance the clarity of my mTHP
> > >>> counters series. also help decrease smp cache sync if we can avoid
> > >>> unnecessary writing?
> > >>>
> > >>
> > >> Do you mean if(folio_test_partially_mapped(folio))?
> > >>
> > >> I don't like this idea. I think it makes the readability worse? If I was looking at if (test) -> clear for the first time, I would become confused why its being tested if its going to be clear at the end anyways?
> > >
> > > In the pmd-order case, the majority of folios are not partially mapped.
> > > Unconditional writes will trigger cache synchronization across all
> > > CPUs (related to the MESI protocol), making them more costly. By
> > > using conditional writes, such as "if(test) write," we can avoid
> > > most unnecessary writes, which is much more efficient. Additionally,
> > > we only need to manage nr_split_deferred when the condition
> > > is met. We are carefully evaluating all scenarios to determine
> > > if modifications to the partially_mapped flag are necessary.
> > >
> >
> >
> > Hmm okay, as you said its needed for nr_split_deferred anyways. Something like below is ok to fold in?
> >
> > commit 4ae9e2067346effd902b342296987b97dee29018 (HEAD)
> > Author: Usama Arif <usamaarif642@gmail.com>
> > Date:   Mon Aug 19 21:07:16 2024 +0100
> >
> >     mm: Introduce a pageflag for partially mapped folios fix
> >
> >     Test partially_mapped flag before clearing it. This should
> >     avoid unnecessary writes and will be needed in the nr_split_deferred
> >     series.
> >
> >     Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> >
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 5d67d3b3c1b2..ccde60aaaa0f 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -3479,7 +3479,8 @@ void __folio_undo_large_rmappable(struct folio *folio)
> >         if (!list_empty(&folio->_deferred_list)) {
> >                 ds_queue->split_queue_len--;
> >                 list_del_init(&folio->_deferred_list);
> > -               __folio_clear_partially_mapped(folio);
> > +               if (folio_test_partially_mapped(folio))
> > +                       __folio_clear_partially_mapped(folio);
> >         }
> >         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> >  }
> > @@ -3610,7 +3611,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
> >                 } else {
> >                         /* We lost race with folio_put() */
> >                         list_del_init(&folio->_deferred_list);
> > -                       __folio_clear_partially_mapped(folio);
> > +                       if (folio_test_partially_mapped(folio))
> > +                               __folio_clear_partially_mapped(folio);
> >                         ds_queue->split_queue_len--;
> >                 }
> >                 if (!--sc->nr_to_scan)
> >
>
> Do we also need if (folio_test_partially_mapped(folio)) in
> split_huge_page_to_list_to_order()?
>
> I recall that in Yu Zhao's TAO, there’s a chance of splitting (shattering)
> non-partially-mapped folios. To be future-proof, we might want to handle
> both cases equally.

I recall we also have a real case which can split an entirely_mapped
folio:

mm: huge_memory: enable debugfs to split huge pages to any order
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fc4d182316bd5309b4066fd9ef21529ea397a7d4

>
> By the way, we might not need to clear the flag for a new folio. This differs
> from the init_list, which is necessary. If a new folio has the partially_mapped
> flag, it indicates that we failed to clear it when freeing the folio to
> the buddy system, which is a bug we need to fix in the free path.
>
> Thanks
> Barry
Usama Arif Aug. 20, 2024, 7:35 p.m. UTC | #8
On 19/08/2024 22:55, Barry Song wrote:
> On Tue, Aug 20, 2024 at 9:34 AM Barry Song <baohua@kernel.org> wrote:
>>
>> On Tue, Aug 20, 2024 at 8:16 AM Usama Arif <usamaarif642@gmail.com> wrote:
>>>
>>>
>>>
>>> On 19/08/2024 20:00, Barry Song wrote:
>>>> On Tue, Aug 20, 2024 at 2:17 AM Usama Arif <usamaarif642@gmail.com> wrote:
>>>>>
>>>>>
>>>>>
>>>>> On 19/08/2024 09:29, Barry Song wrote:
>>>>>> Hi Usama,
>>>>>>
>>>>>> I feel it is much better now! thanks!
>>>>>>
>>>>>> On Mon, Aug 19, 2024 at 2:31 PM Usama Arif <usamaarif642@gmail.com> wrote:
>>>>>>>
>>>>>>> Currently folio->_deferred_list is used to keep track of
>>>>>>> partially_mapped folios that are going to be split under memory
>>>>>>> pressure. In the next patch, all THPs that are faulted in and collapsed
>>>>>>> by khugepaged are also going to be tracked using _deferred_list.
>>>>>>>
>>>>>>> This patch introduces a pageflag to be able to distinguish between
>>>>>>> partially mapped folios and others in the deferred_list at split time in
>>>>>>> deferred_split_scan. Its needed as __folio_remove_rmap decrements
>>>>>>> _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
>>>>>>> possible to distinguish between partially mapped folios and others in
>>>>>>> deferred_split_scan.
>>>>>>>
>>>>>>> Eventhough it introduces an extra flag to track if the folio is
>>>>>>> partially mapped, there is no functional change intended with this
>>>>>>> patch and the flag is not useful in this patch itself, it will
>>>>>>> become useful in the next patch when _deferred_list has non partially
>>>>>>> mapped folios.
>>>>>>>
>>>>>>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
>>>>>>> ---
>>>>>>>  include/linux/huge_mm.h    |  4 ++--
>>>>>>>  include/linux/page-flags.h | 11 +++++++++++
>>>>>>>  mm/huge_memory.c           | 23 ++++++++++++++++-------
>>>>>>>  mm/internal.h              |  4 +++-
>>>>>>>  mm/memcontrol.c            |  3 ++-
>>>>>>>  mm/migrate.c               |  3 ++-
>>>>>>>  mm/page_alloc.c            |  5 +++--
>>>>>>>  mm/rmap.c                  |  5 +++--
>>>>>>>  mm/vmscan.c                |  3 ++-
>>>>>>>  9 files changed, 44 insertions(+), 17 deletions(-)
>>>>>>>
>>>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>>>>> index 4c32058cacfe..969f11f360d2 100644
>>>>>>> --- a/include/linux/huge_mm.h
>>>>>>> +++ b/include/linux/huge_mm.h
>>>>>>> @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
>>>>>>>  {
>>>>>>>         return split_huge_page_to_list_to_order(page, NULL, 0);
>>>>>>>  }
>>>>>>> -void deferred_split_folio(struct folio *folio);
>>>>>>> +void deferred_split_folio(struct folio *folio, bool partially_mapped);
>>>>>>>
>>>>>>>  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>>>>>>>                 unsigned long address, bool freeze, struct folio *folio);
>>>>>>> @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
>>>>>>>  {
>>>>>>>         return 0;
>>>>>>>  }
>>>>>>> -static inline void deferred_split_folio(struct folio *folio) {}
>>>>>>> +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
>>>>>>>  #define split_huge_pmd(__vma, __pmd, __address)        \
>>>>>>>         do { } while (0)
>>>>>>>
>>>>>>> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
>>>>>>> index a0a29bd092f8..c3bb0e0da581 100644
>>>>>>> --- a/include/linux/page-flags.h
>>>>>>> +++ b/include/linux/page-flags.h
>>>>>>> @@ -182,6 +182,7 @@ enum pageflags {
>>>>>>>         /* At least one page in this folio has the hwpoison flag set */
>>>>>>>         PG_has_hwpoisoned = PG_active,
>>>>>>>         PG_large_rmappable = PG_workingset, /* anon or file-backed */
>>>>>>> +       PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */
>>>>>>>  };
>>>>>>>
>>>>>>>  #define PAGEFLAGS_MASK         ((1UL << NR_PAGEFLAGS) - 1)
>>>>>>> @@ -861,8 +862,18 @@ static inline void ClearPageCompound(struct page *page)
>>>>>>>         ClearPageHead(page);
>>>>>>>  }
>>>>>>>  FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
>>>>>>> +FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
>>>>>>> +/*
>>>>>>> + * PG_partially_mapped is protected by deferred_split split_queue_lock,
>>>>>>> + * so its safe to use non-atomic set/clear.
>>>>>>> + */
>>>>>>> +__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
>>>>>>> +__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
>>>>>>>  #else
>>>>>>>  FOLIO_FLAG_FALSE(large_rmappable)
>>>>>>> +FOLIO_TEST_FLAG_FALSE(partially_mapped)
>>>>>>> +__FOLIO_SET_FLAG_NOOP(partially_mapped)
>>>>>>> +__FOLIO_CLEAR_FLAG_NOOP(partially_mapped)
>>>>>>>  #endif
>>>>>>>
>>>>>>>  #define PG_head_mask ((1UL << PG_head))
>>>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>>>>> index 2d77b5d2291e..70ee49dfeaad 100644
>>>>>>> --- a/mm/huge_memory.c
>>>>>>> +++ b/mm/huge_memory.c
>>>>>>> @@ -3398,6 +3398,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>>>>>>                          * page_deferred_list.
>>>>>>>                          */
>>>>>>>                         list_del_init(&folio->_deferred_list);
>>>>>>> +                       __folio_clear_partially_mapped(folio);
>>>>>>>                 }
>>>>>>>                 spin_unlock(&ds_queue->split_queue_lock);
>>>>>>>                 if (mapping) {
>>>>>>> @@ -3454,11 +3455,13 @@ void __folio_undo_large_rmappable(struct folio *folio)
>>>>>>>         if (!list_empty(&folio->_deferred_list)) {
>>>>>>>                 ds_queue->split_queue_len--;
>>>>>>>                 list_del_init(&folio->_deferred_list);
>>>>>>> +               __folio_clear_partially_mapped(folio);
>>>>>>
>>>>>> is it possible to make things clearer by
>>>>>>
>>>>>>  if (folio_clear_partially_mapped)
>>>>>>     __folio_clear_partially_mapped(folio);
>>>>>>
>>>>>> While writing without conditions isn't necessarily wrong, adding a condition
>>>>>> will improve the readability of the code and enhance the clarity of my mTHP
>>>>>> counters series. also help decrease smp cache sync if we can avoid
>>>>>> unnecessary writing?
>>>>>>
>>>>>
>>>>> Do you mean if(folio_test_partially_mapped(folio))?
>>>>>
>>>>> I don't like this idea. I think it makes the readability worse? If I was looking at if (test) -> clear for the first time, I would become confused why its being tested if its going to be clear at the end anyways?
>>>>
>>>> In the pmd-order case, the majority of folios are not partially mapped.
>>>> Unconditional writes will trigger cache synchronization across all
>>>> CPUs (related to the MESI protocol), making them more costly. By
>>>> using conditional writes, such as "if(test) write," we can avoid
>>>> most unnecessary writes, which is much more efficient. Additionally,
>>>> we only need to manage nr_split_deferred when the condition
>>>> is met. We are carefully evaluating all scenarios to determine
>>>> if modifications to the partially_mapped flag are necessary.
>>>>
>>>
>>>
>>> Hmm okay, as you said its needed for nr_split_deferred anyways. Something like below is ok to fold in?
>>>
>>> commit 4ae9e2067346effd902b342296987b97dee29018 (HEAD)
>>> Author: Usama Arif <usamaarif642@gmail.com>
>>> Date:   Mon Aug 19 21:07:16 2024 +0100
>>>
>>>     mm: Introduce a pageflag for partially mapped folios fix
>>>
>>>     Test partially_mapped flag before clearing it. This should
>>>     avoid unnecessary writes and will be needed in the nr_split_deferred
>>>     series.
>>>
>>>     Signed-off-by: Usama Arif <usamaarif642@gmail.com>
>>>
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index 5d67d3b3c1b2..ccde60aaaa0f 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -3479,7 +3479,8 @@ void __folio_undo_large_rmappable(struct folio *folio)
>>>         if (!list_empty(&folio->_deferred_list)) {
>>>                 ds_queue->split_queue_len--;
>>>                 list_del_init(&folio->_deferred_list);
>>> -               __folio_clear_partially_mapped(folio);
>>> +               if (folio_test_partially_mapped(folio))
>>> +                       __folio_clear_partially_mapped(folio);
>>>         }
>>>         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>>>  }
>>> @@ -3610,7 +3611,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>>>                 } else {
>>>                         /* We lost race with folio_put() */
>>>                         list_del_init(&folio->_deferred_list);
>>> -                       __folio_clear_partially_mapped(folio);
>>> +                       if (folio_test_partially_mapped(folio))
>>> +                               __folio_clear_partially_mapped(folio);
>>>                         ds_queue->split_queue_len--;
>>>                 }
>>>                 if (!--sc->nr_to_scan)
>>>
>>
>> Do we also need if (folio_test_partially_mapped(folio)) in
>> split_huge_page_to_list_to_order()?
>>
>> I recall that in Yu Zhao's TAO, there’s a chance of splitting (shattering)
>> non-partially-mapped folios. To be future-proof, we might want to handle
>> both cases equally.
> 
> we recall we also have a real case which can split entirely_mapped
> folio:
> 
> mm: huge_memory: enable debugfs to split huge pages to any order
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fc4d182316bd5309b4066fd9ef21529ea397a7d4
> 
>>
>> By the way, we might not need to clear the flag for a new folio. This differs
>> from the init_list, which is necessary. If a new folio has the partially_mapped
>> flag, it indicates that we failed to clear it when freeing the folio to
>> the buddy system, which is a bug we need to fix in the free path.
>>
>> Thanks
>> Barry

I believe the below fixlet should address all concerns:


From 95492a51b1929ea274b4e5b78fc74e7736645d58 Mon Sep 17 00:00:00 2001
From: Usama Arif <usamaarif642@gmail.com>
Date: Mon, 19 Aug 2024 21:07:16 +0100
Subject: [PATCH] mm: Introduce a pageflag for partially mapped folios fix

Test partially_mapped flag before clearing it. This should
avoid unnecessary writes and will be needed in the nr_split_deferred
series.
Also, there is no need to clear partially_mapped when prepping the compound
head, as it should already start off cleared.

Signed-off-by: Usama Arif <usamaarif642@gmail.com>
---
 include/linux/page-flags.h | 2 +-
 mm/huge_memory.c           | 9 ++++++---
 mm/internal.h              | 4 +---
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index c3bb0e0da581..f1602695daf2 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -1182,7 +1182,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
  */
 #define PAGE_FLAGS_SECOND						\
 	(0xffUL /* order */		| 1UL << PG_has_hwpoisoned |	\
-	 1UL << PG_large_rmappable)
+	 1UL << PG_large_rmappable	| 1UL << PG_partially_mapped)
 
 #define PAGE_FLAGS_PRIVATE				\
 	(1UL << PG_private | 1UL << PG_private_2)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5d67d3b3c1b2..402b9d933de0 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3422,7 +3422,8 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 			 * page_deferred_list.
 			 */
 			list_del_init(&folio->_deferred_list);
-			__folio_clear_partially_mapped(folio);
+			if (folio_test_partially_mapped(folio))
+				__folio_clear_partially_mapped(folio);
 		}
 		spin_unlock(&ds_queue->split_queue_lock);
 		if (mapping) {
@@ -3479,7 +3480,8 @@ void __folio_undo_large_rmappable(struct folio *folio)
 	if (!list_empty(&folio->_deferred_list)) {
 		ds_queue->split_queue_len--;
 		list_del_init(&folio->_deferred_list);
-		__folio_clear_partially_mapped(folio);
+		if (folio_test_partially_mapped(folio))
+			__folio_clear_partially_mapped(folio);
 	}
 	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
 }
@@ -3610,7 +3612,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
 		} else {
 			/* We lost race with folio_put() */
 			list_del_init(&folio->_deferred_list);
-			__folio_clear_partially_mapped(folio);
+			if (folio_test_partially_mapped(folio))
+				__folio_clear_partially_mapped(folio);
 			ds_queue->split_queue_len--;
 		}
 		if (!--sc->nr_to_scan)
diff --git a/mm/internal.h b/mm/internal.h
index 27cbb5365841..52f7fc4e8ac3 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -662,10 +662,8 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
 	atomic_set(&folio->_entire_mapcount, -1);
 	atomic_set(&folio->_nr_pages_mapped, 0);
 	atomic_set(&folio->_pincount, 0);
-	if (order > 1) {
+	if (order > 1)
 		INIT_LIST_HEAD(&folio->_deferred_list);
-		__folio_clear_partially_mapped(folio);
-	}
 }
 
 static inline void prep_compound_tail(struct page *head, int tail_idx)
Barry Song Aug. 20, 2024, 9:30 p.m. UTC | #9
On Wed, Aug 21, 2024 at 7:35 AM Usama Arif <usamaarif642@gmail.com> wrote:
>
>
>
> On 19/08/2024 22:55, Barry Song wrote:
> > On Tue, Aug 20, 2024 at 9:34 AM Barry Song <baohua@kernel.org> wrote:
> >>
> >> On Tue, Aug 20, 2024 at 8:16 AM Usama Arif <usamaarif642@gmail.com> wrote:
> >>>
> >>>
> >>>
> >>> On 19/08/2024 20:00, Barry Song wrote:
> >>>> On Tue, Aug 20, 2024 at 2:17 AM Usama Arif <usamaarif642@gmail.com> wrote:
> >>>>>
> >>>>>
> >>>>>
> >>>>> On 19/08/2024 09:29, Barry Song wrote:
> >>>>>> Hi Usama,
> >>>>>>
> >>>>>> I feel it is much better now! thanks!
> >>>>>>
> >>>>>> On Mon, Aug 19, 2024 at 2:31 PM Usama Arif <usamaarif642@gmail.com> wrote:
> >>>>>>>
> >>>>>>> Currently folio->_deferred_list is used to keep track of
> >>>>>>> partially_mapped folios that are going to be split under memory
> >>>>>>> pressure. In the next patch, all THPs that are faulted in and collapsed
> >>>>>>> by khugepaged are also going to be tracked using _deferred_list.
> >>>>>>>
> >>>>>>> This patch introduces a pageflag to be able to distinguish between
> >>>>>>> partially mapped folios and others in the deferred_list at split time in
> >>>>>>> deferred_split_scan. Its needed as __folio_remove_rmap decrements
> >>>>>>> _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
> >>>>>>> possible to distinguish between partially mapped folios and others in
> >>>>>>> deferred_split_scan.
> >>>>>>>
> >>>>>>> Eventhough it introduces an extra flag to track if the folio is
> >>>>>>> partially mapped, there is no functional change intended with this
> >>>>>>> patch and the flag is not useful in this patch itself, it will
> >>>>>>> become useful in the next patch when _deferred_list has non partially
> >>>>>>> mapped folios.
> >>>>>>>
> >>>>>>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> >>>>>>> ---
> >>>>>>>  include/linux/huge_mm.h    |  4 ++--
> >>>>>>>  include/linux/page-flags.h | 11 +++++++++++
> >>>>>>>  mm/huge_memory.c           | 23 ++++++++++++++++-------
> >>>>>>>  mm/internal.h              |  4 +++-
> >>>>>>>  mm/memcontrol.c            |  3 ++-
> >>>>>>>  mm/migrate.c               |  3 ++-
> >>>>>>>  mm/page_alloc.c            |  5 +++--
> >>>>>>>  mm/rmap.c                  |  5 +++--
> >>>>>>>  mm/vmscan.c                |  3 ++-
> >>>>>>>  9 files changed, 44 insertions(+), 17 deletions(-)
> >>>>>>>
> >>>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >>>>>>> index 4c32058cacfe..969f11f360d2 100644
> >>>>>>> --- a/include/linux/huge_mm.h
> >>>>>>> +++ b/include/linux/huge_mm.h
> >>>>>>> @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
> >>>>>>>  {
> >>>>>>>         return split_huge_page_to_list_to_order(page, NULL, 0);
> >>>>>>>  }
> >>>>>>> -void deferred_split_folio(struct folio *folio);
> >>>>>>> +void deferred_split_folio(struct folio *folio, bool partially_mapped);
> >>>>>>>
> >>>>>>>  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> >>>>>>>                 unsigned long address, bool freeze, struct folio *folio);
> >>>>>>> @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
> >>>>>>>  {
> >>>>>>>         return 0;
> >>>>>>>  }
> >>>>>>> -static inline void deferred_split_folio(struct folio *folio) {}
> >>>>>>> +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
> >>>>>>>  #define split_huge_pmd(__vma, __pmd, __address)        \
> >>>>>>>         do { } while (0)
> >>>>>>>
> >>>>>>> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> >>>>>>> index a0a29bd092f8..c3bb0e0da581 100644
> >>>>>>> --- a/include/linux/page-flags.h
> >>>>>>> +++ b/include/linux/page-flags.h
> >>>>>>> @@ -182,6 +182,7 @@ enum pageflags {
> >>>>>>>         /* At least one page in this folio has the hwpoison flag set */
> >>>>>>>         PG_has_hwpoisoned = PG_active,
> >>>>>>>         PG_large_rmappable = PG_workingset, /* anon or file-backed */
> >>>>>>> +       PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */
> >>>>>>>  };
> >>>>>>>
> >>>>>>>  #define PAGEFLAGS_MASK         ((1UL << NR_PAGEFLAGS) - 1)
> >>>>>>> @@ -861,8 +862,18 @@ static inline void ClearPageCompound(struct page *page)
> >>>>>>>         ClearPageHead(page);
> >>>>>>>  }
> >>>>>>>  FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
> >>>>>>> +FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> >>>>>>> +/*
> >>>>>>> + * PG_partially_mapped is protected by deferred_split split_queue_lock,
> >>>>>>> + * so its safe to use non-atomic set/clear.
> >>>>>>> + */
> >>>>>>> +__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> >>>>>>> +__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> >>>>>>>  #else
> >>>>>>>  FOLIO_FLAG_FALSE(large_rmappable)
> >>>>>>> +FOLIO_TEST_FLAG_FALSE(partially_mapped)
> >>>>>>> +__FOLIO_SET_FLAG_NOOP(partially_mapped)
> >>>>>>> +__FOLIO_CLEAR_FLAG_NOOP(partially_mapped)
> >>>>>>>  #endif
> >>>>>>>
> >>>>>>>  #define PG_head_mask ((1UL << PG_head))
> >>>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >>>>>>> index 2d77b5d2291e..70ee49dfeaad 100644
> >>>>>>> --- a/mm/huge_memory.c
> >>>>>>> +++ b/mm/huge_memory.c
> >>>>>>> @@ -3398,6 +3398,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> >>>>>>>                          * page_deferred_list.
> >>>>>>>                          */
> >>>>>>>                         list_del_init(&folio->_deferred_list);
> >>>>>>> +                       __folio_clear_partially_mapped(folio);
> >>>>>>>                 }
> >>>>>>>                 spin_unlock(&ds_queue->split_queue_lock);
> >>>>>>>                 if (mapping) {
> >>>>>>> @@ -3454,11 +3455,13 @@ void __folio_undo_large_rmappable(struct folio *folio)
> >>>>>>>         if (!list_empty(&folio->_deferred_list)) {
> >>>>>>>                 ds_queue->split_queue_len--;
> >>>>>>>                 list_del_init(&folio->_deferred_list);
> >>>>>>> +               __folio_clear_partially_mapped(folio);
> >>>>>>
> >>>>>> is it possible to make things clearer by
> >>>>>>
> >>>>>>  if (folio_clear_partially_mapped)
> >>>>>>     __folio_clear_partially_mapped(folio);
> >>>>>>
> >>>>>> While writing without conditions isn't necessarily wrong, adding a condition
> >>>>>> will improve the readability of the code and enhance the clarity of my mTHP
> >>>>>> counters series. also help decrease smp cache sync if we can avoid
> >>>>>> unnecessary writing?
> >>>>>>
> >>>>>
> >>>>> Do you mean if(folio_test_partially_mapped(folio))?
> >>>>>
> >>>>> I don't like this idea. I think it makes the readability worse? If I was looking at if (test) -> clear for the first time, I would become confused why its being tested if its going to be clear at the end anyways?
> >>>>
> >>>> In the pmd-order case, the majority of folios are not partially mapped.
> >>>> Unconditional writes will trigger cache synchronization across all
> >>>> CPUs (related to the MESI protocol), making them more costly. By
> >>>> using conditional writes, such as "if(test) write," we can avoid
> >>>> most unnecessary writes, which is much more efficient. Additionally,
> >>>> we only need to manage nr_split_deferred when the condition
> >>>> is met. We are carefully evaluating all scenarios to determine
> >>>> if modifications to the partially_mapped flag are necessary.
> >>>>
> >>>
> >>>
> >>> Hmm okay, as you said its needed for nr_split_deferred anyways. Something like below is ok to fold in?
> >>>
> >>> commit 4ae9e2067346effd902b342296987b97dee29018 (HEAD)
> >>> Author: Usama Arif <usamaarif642@gmail.com>
> >>> Date:   Mon Aug 19 21:07:16 2024 +0100
> >>>
> >>>     mm: Introduce a pageflag for partially mapped folios fix
> >>>
> >>>     Test partially_mapped flag before clearing it. This should
> >>>     avoid unnecessary writes and will be needed in the nr_split_deferred
> >>>     series.
> >>>
> >>>     Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> >>>
> >>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >>> index 5d67d3b3c1b2..ccde60aaaa0f 100644
> >>> --- a/mm/huge_memory.c
> >>> +++ b/mm/huge_memory.c
> >>> @@ -3479,7 +3479,8 @@ void __folio_undo_large_rmappable(struct folio *folio)
> >>>         if (!list_empty(&folio->_deferred_list)) {
> >>>                 ds_queue->split_queue_len--;
> >>>                 list_del_init(&folio->_deferred_list);
> >>> -               __folio_clear_partially_mapped(folio);
> >>> +               if (folio_test_partially_mapped(folio))
> >>> +                       __folio_clear_partially_mapped(folio);
> >>>         }
> >>>         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> >>>  }
> >>> @@ -3610,7 +3611,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
> >>>                 } else {
> >>>                         /* We lost race with folio_put() */
> >>>                         list_del_init(&folio->_deferred_list);
> >>> -                       __folio_clear_partially_mapped(folio);
> >>> +                       if (folio_test_partially_mapped(folio))
> >>> +                               __folio_clear_partially_mapped(folio);
> >>>                         ds_queue->split_queue_len--;
> >>>                 }
> >>>                 if (!--sc->nr_to_scan)
> >>>
> >>
> >> Do we also need if (folio_test_partially_mapped(folio)) in
> >> split_huge_page_to_list_to_order()?
> >>
> >> I recall that in Yu Zhao's TAO, there’s a chance of splitting (shattering)
> >> non-partially-mapped folios. To be future-proof, we might want to handle
> >> both cases equally.
> >
> > we recall we also have a real case which can split entirely_mapped
> > folio:
> >
> > mm: huge_memory: enable debugfs to split huge pages to any order
> > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fc4d182316bd5309b4066fd9ef21529ea397a7d4
> >
> >>
> >> By the way, we might not need to clear the flag for a new folio. This differs
> >> from the init_list, which is necessary. If a new folio has the partially_mapped
> >> flag, it indicates that we failed to clear it when freeing the folio to
> >> the buddy system, which is a bug we need to fix in the free path.
> >>
> >> Thanks
> >> Barry
>
> I believe the below fixlet should address all concerns:

Hi Usama,
Thanks! I can't judge whether we need this partially_mapped flag, but if we
do, the code looks correct to me. I'd like to leave this to David and other
experts to ack.

An alternative approach might be two lists: one for entirely_mapped folios,
the other for split_deferred ones. That also seems ugly?

On the other hand, when we want to extend your patchset to mTHP orders other
than PMD-order, will having a single deferred_list create huge lock contention
while adding or removing folios from it?

>
>
> From 95492a51b1929ea274b4e5b78fc74e7736645d58 Mon Sep 17 00:00:00 2001
> From: Usama Arif <usamaarif642@gmail.com>
> Date: Mon, 19 Aug 2024 21:07:16 +0100
> Subject: [PATCH] mm: Introduce a pageflag for partially mapped folios fix
>
> Test partially_mapped flag before clearing it. This should
> avoid unnecessary writes and will be needed in the nr_split_deferred
> series.
> Also no need to clear partially_mapped prepping compound head, as it
> should start with already being cleared.
>
> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> ---
>  include/linux/page-flags.h | 2 +-
>  mm/huge_memory.c           | 9 ++++++---
>  mm/internal.h              | 4 +---
>  3 files changed, 8 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index c3bb0e0da581..f1602695daf2 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -1182,7 +1182,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
>   */
>  #define PAGE_FLAGS_SECOND                                              \
>         (0xffUL /* order */             | 1UL << PG_has_hwpoisoned |    \
> -        1UL << PG_large_rmappable)
> +        1UL << PG_large_rmappable      | 1UL << PG_partially_mapped)
>
>  #define PAGE_FLAGS_PRIVATE                             \
>         (1UL << PG_private | 1UL << PG_private_2)
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 5d67d3b3c1b2..402b9d933de0 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3422,7 +3422,8 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>                          * page_deferred_list.
>                          */
>                         list_del_init(&folio->_deferred_list);
> -                       __folio_clear_partially_mapped(folio);
> +                       if (folio_test_partially_mapped(folio))
> +                               __folio_clear_partially_mapped(folio);
>                 }
>                 spin_unlock(&ds_queue->split_queue_lock);
>                 if (mapping) {
> @@ -3479,7 +3480,8 @@ void __folio_undo_large_rmappable(struct folio *folio)
>         if (!list_empty(&folio->_deferred_list)) {
>                 ds_queue->split_queue_len--;
>                 list_del_init(&folio->_deferred_list);
> -               __folio_clear_partially_mapped(folio);
> +               if (folio_test_partially_mapped(folio))
> +                       __folio_clear_partially_mapped(folio);
>         }
>         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>  }
> @@ -3610,7 +3612,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>                 } else {
>                         /* We lost race with folio_put() */
>                         list_del_init(&folio->_deferred_list);
> -                       __folio_clear_partially_mapped(folio);
> +                       if (folio_test_partially_mapped(folio))
> +                               __folio_clear_partially_mapped(folio);
>                         ds_queue->split_queue_len--;
>                 }
>                 if (!--sc->nr_to_scan)
> diff --git a/mm/internal.h b/mm/internal.h
> index 27cbb5365841..52f7fc4e8ac3 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -662,10 +662,8 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
>         atomic_set(&folio->_entire_mapcount, -1);
>         atomic_set(&folio->_nr_pages_mapped, 0);
>         atomic_set(&folio->_pincount, 0);
> -       if (order > 1) {
> +       if (order > 1)
>                 INIT_LIST_HEAD(&folio->_deferred_list);
> -               __folio_clear_partially_mapped(folio);
> -       }
>  }
>
>  static inline void prep_compound_tail(struct page *head, int tail_idx)
> --
> 2.43.5
>
>

Thanks
Barry
Usama Arif Aug. 21, 2024, 7:04 p.m. UTC | #10
On 20/08/2024 17:30, Barry Song wrote:

> Hi Usama,
> thanks! I can't judge if we need this partially_mapped flag. but if we
> need, the code
> looks correct to me. I'd like to leave this to David and other experts to ack.
> 

Thanks for the reviews!

> an alternative approach might be two lists? one for entirely_mapped,
> the other one
> for split_deferred. also seems ugly ?
> 

That was my very first prototype! I shifted to using a bool, which I sent in v1, and then a bit in _flags_1 as David suggested. I believe a bit in _flags_1 is the best way forward, as it leaves the most space in the folio for future work.

> On the other hand, when we want to extend your patchset to mTHP other than PMD-
> order, will the only deferred_list create huge lock contention while
> adding or removing
> folios from it?
> 

Yes, I would imagine so. The deferred_split_queue is per memcg/node, so that helps.
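
(For context, the queue lookup is roughly the sketch below, simplified from
get_deferred_split_queue() in mm/huge_memory.c and ignoring the !CONFIG_MEMCG
variant, so the lock is already scoped per memcg or per node:)

static struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
	struct mem_cgroup *memcg = folio_memcg(folio);
	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

	if (memcg)
		return &memcg->deferred_split_queue;
	else
		return &pgdat->deferred_split_queue;
}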

Also, this work is tied to khugepaged, so it would need some thought when doing it for mTHP.

I would imagine the underused shrinker would be less beneficial for mTHP than for 2M THP, but that probably needs experimentation.

Thanks
Barry Song Aug. 21, 2024, 9:25 p.m. UTC | #11
On Thu, Aug 22, 2024 at 3:04 AM Usama Arif <usamaarif642@gmail.com> wrote:
>
>
>
> On 20/08/2024 17:30, Barry Song wrote:
>
> > Hi Usama,
> > thanks! I can't judge if we need this partially_mapped flag. but if we
> > need, the code
> > looks correct to me. I'd like to leave this to David and other experts to ack.
> >
>
> Thanks for the reviews!
>
> > an alternative approach might be two lists? one for entirely_mapped,
> > the other one
> > for split_deferred. also seems ugly ?
> >
>
> That was my very first prototype! I shifted to using a bool which I sent in v1, and then a bit in _flags_1 as David suggested. I believe a bit in _flags_1 is the best way forward, as it leaves the most space in folio for future work.
>

Thanks for the explanation; I missed the earlier discussion.

> > On the other hand, when we want to extend your patchset to mTHP other than PMD-
> > order, will the only deferred_list create huge lock contention while
> > adding or removing
> > folios from it?
> >
>
> Yes, I would imagine so. the deferred_split_queue is per memcg/node, so that helps.
>

Right.

> Also, this work is tied to khugepaged. So would need some thought when doing it for mTHP.
>
> I would imagine doing underused shrinker for mTHP would be less beneficial compared to doing it for 2M THP. But probably needs experimentation.
>

Agreed. I suspect this will also be useful for larger mTHPs like 1M and 512KB.
While lock contention might be a concern, it shouldn't be a barrier to
proceeding with your patchset, which only supports PMD-order.

> Thanks
>

Thanks
Barry
Bang Li Sept. 1, 2024, 12:48 p.m. UTC | #12
Hi Usama,

On 2024/8/22 3:04, Usama Arif wrote:

>
> On 20/08/2024 17:30, Barry Song wrote:
>
>> Hi Usama,
>> thanks! I can't judge if we need this partially_mapped flag. but if we
>> need, the code
>> looks correct to me. I'd like to leave this to David and other experts to ack.
>>
> Thanks for the reviews!
>
>> an alternative approach might be two lists? one for entirely_mapped,
>> the other one
>> for split_deferred. also seems ugly ?
>>
> That was my very first prototype! I shifted to using a bool which I sent in v1, and then a bit in _flags_1 as David suggested. I believe a bit in _flags_1 is the best way forward, as it leaves the most space in folio for future work.
>
>> On the other hand, when we want to extend your patchset to mTHP other than PMD-
>> order, will the only deferred_list create huge lock contention while
>> adding or removing
>> folios from it?
>>
> Yes, I would imagine so. the deferred_split_queue is per memcg/node, so that helps.
>
> Also, this work is tied to khugepaged. So would need some thought when doing it for mTHP.
>
> I would imagine doing underused shrinker for mTHP would be less beneficial compared to doing it for 2M THP. But probably needs experimentation.
>
> Thanks

Below is the core code snippet to support "split underused mTHP". Can we extend the
khugepaged_max_ptes_none value to mTHP and keep its semantics unchanged? With a small
modification, only folios with more pages than khugepaged_max_ptes_none - 1 would be
added to the deferred_split list and be eligible for splitting. What do you think?

diff --git a/mm/memory.c b/mm/memory.c
index b95fce7d190f..ef503958d6a0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4789,6 +4789,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
         }

         folio_ref_add(folio, nr_pages - 1);
+       if (nr_pages > 1 && nr_pages > khugepaged_max_ptes_none - 1)
+               deferred_split_folio(folio, false);
         add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
         count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
         folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);

shmem THP has the same memory expansion problem when the shmem_enabled configuration is
set to always. In my opinion, it is necessary to support "split underused shmem THP" as
well, but I am not sure whether there are any gaps in the implementation.

Bang
Thanks
Usama Arif Sept. 2, 2024, 10:09 a.m. UTC | #13
On 01/09/2024 08:48, Bang Li wrote:
> hi, Usama
> 
> On 2024/8/22 3:04, Usama Arif wrote:
> 
>>
>> On 20/08/2024 17:30, Barry Song wrote:
>>
>>> Hi Usama,
>>> thanks! I can't judge if we need this partially_mapped flag. but if we
>>> need, the code
>>> looks correct to me. I'd like to leave this to David and other experts to ack.
>>>
>> Thanks for the reviews!
>>
>>> an alternative approach might be two lists? one for entirely_mapped,
>>> the other one
>>> for split_deferred. also seems ugly ?
>>>
>> That was my very first prototype! I shifted to using a bool which I sent in v1, and then a bit in _flags_1 as David suggested. I believe a bit in _flags_1 is the best way forward, as it leaves the most space in folio for future work.
>>
>>> On the other hand, when we want to extend your patchset to mTHP other than PMD-
>>> order, will the only deferred_list create huge lock contention while
>>> adding or removing
>>> folios from it?
>>>
>> Yes, I would imagine so. the deferred_split_queue is per memcg/node, so that helps.
>>
>> Also, this work is tied to khugepaged. So would need some thought when doing it for mTHP.
>>
>> I would imagine doing underused shrinker for mTHP would be less beneficial compared to doing it for 2M THP. But probably needs experimentation.
>>
>> Thanks
> 
> Below is the core code snippet to support "split underused mTHP". Can we extend the
> khugepaged_max_ptes_none value to mTHP and keep its semantics unchanged? With a small
> modification, only folios with more pages than khugepaged_max_ptes_none - 1 would be
> added to the deferred_split list and become eligible for splitting. What do you think?
> 

Hmm, so I believe it's not as simple as that.

First, mTHP support would need to be added to khugepaged. The entire khugepaged code would
need to be audited for it and tested extensively. If you just look at all the instances of
HPAGE_PMD_NR in khugepaged.c, you will see it would be a significant change and would need
to be a series of its own.

Also, the same max_ptes_none value can mean very different things for different mTHP sizes.
A max_ptes_none of 200 does not mean anything for 512K and below, but it corresponds to 78%
of a 1M mTHP and 39% of a 2M THP. We might want a different max_ptes_none for each mTHP size
if we were to do this.
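
To make the arithmetic concrete, here is a minimal userspace sketch (illustration only,
not kernel code; 4K base pages assumed) of what a single max_ptes_none value means
relative to different folio sizes:

/*
 * Illustration only (userspace, not kernel code): what a single
 * max_ptes_none value means relative to different folio sizes,
 * assuming 4K base pages.
 */
#include <stdio.h>

int main(void)
{
	unsigned int max_ptes_none = 200;
	struct {
		const char *name;
		unsigned int nr_pages;
	} sizes[] = {
		{ "512K", 128 },
		{ "1M",   256 },
		{ "2M",   512 },
	};

	for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		unsigned int nr = sizes[i].nr_pages;

		if (max_ptes_none >= nr)
			printf("%4s (%3u pages): threshold >= folio size, never applies\n",
			       sizes[i].name, nr);
		else
			printf("%4s (%3u pages): up to %u unmapped pages tolerated, i.e. %u%% of the folio\n",
			       sizes[i].name, nr, max_ptes_none,
			       100 * max_ptes_none / nr);
	}
	return 0;
}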

The benefit of splitting underused mTHPs of lower orders might not be worth the work
needed to split them. Someone would need to experiment with different workloads and
demonstrate an improvement for these lower orders.

So probably a lot more than the below diff is needed for mTHP support!
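
On the "different max_ptes_none for each mTHP size" point above, a purely hypothetical
sketch (not part of this series, and not an existing kernel helper; the name and placement
are made up) of scaling the PMD-order threshold down per order could look like:

/*
 * Hypothetical helper, for illustration only: scale the PMD-order
 * max_ptes_none threshold down to a smaller folio, so that "this
 * fraction of the folio may be unmapped" means the same thing at
 * every order.
 */
static unsigned int scaled_max_ptes_none(unsigned int max_ptes_none_pmd,
					 unsigned int folio_nr_pages,
					 unsigned int pmd_nr_pages)
{
	/* e.g. 200 of 512 pages (2M) -> 100 of 256 (1M) -> 50 of 128 (512K) */
	return max_ptes_none_pmd * folio_nr_pages / pmd_nr_pages;
}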


> diff --git a/mm/memory.c b/mm/memory.c
> index b95fce7d190f..ef503958d6a0 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4789,6 +4789,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>         }
> 
>         folio_ref_add(folio, nr_pages - 1);
> +       if (nr_pages > 1 && nr_pages > khugepaged_max_ptes_none - 1)
> +               deferred_split_folio(folio, false);
>         add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
>         count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
>         folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
> 
> shmem THP has the same memory expansion problem when the shmem_enabled configuration is
> set to always. In my opinion, it is necessary to support "split underused shmem THP",
> but I am not sure whether there are any gaps in the implementation.
> 
> Bang
> Thanks
>

Patch

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 4c32058cacfe..969f11f360d2 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -321,7 +321,7 @@  static inline int split_huge_page(struct page *page)
 {
 	return split_huge_page_to_list_to_order(page, NULL, 0);
 }
-void deferred_split_folio(struct folio *folio);
+void deferred_split_folio(struct folio *folio, bool partially_mapped);
 
 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long address, bool freeze, struct folio *folio);
@@ -495,7 +495,7 @@  static inline int split_huge_page(struct page *page)
 {
 	return 0;
 }
-static inline void deferred_split_folio(struct folio *folio) {}
+static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
 #define split_huge_pmd(__vma, __pmd, __address)	\
 	do { } while (0)
 
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index a0a29bd092f8..c3bb0e0da581 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -182,6 +182,7 @@  enum pageflags {
 	/* At least one page in this folio has the hwpoison flag set */
 	PG_has_hwpoisoned = PG_active,
 	PG_large_rmappable = PG_workingset, /* anon or file-backed */
+	PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */
 };
 
 #define PAGEFLAGS_MASK		((1UL << NR_PAGEFLAGS) - 1)
@@ -861,8 +862,18 @@  static inline void ClearPageCompound(struct page *page)
 	ClearPageHead(page);
 }
 FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
+FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
+/*
+ * PG_partially_mapped is protected by deferred_split split_queue_lock,
+ * so it's safe to use non-atomic set/clear.
+ */
+__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
+__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
 #else
 FOLIO_FLAG_FALSE(large_rmappable)
+FOLIO_TEST_FLAG_FALSE(partially_mapped)
+__FOLIO_SET_FLAG_NOOP(partially_mapped)
+__FOLIO_CLEAR_FLAG_NOOP(partially_mapped)
 #endif
 
 #define PG_head_mask ((1UL << PG_head))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2d77b5d2291e..70ee49dfeaad 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3398,6 +3398,7 @@  int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 			 * page_deferred_list.
 			 */
 			list_del_init(&folio->_deferred_list);
+			__folio_clear_partially_mapped(folio);
 		}
 		spin_unlock(&ds_queue->split_queue_lock);
 		if (mapping) {
@@ -3454,11 +3455,13 @@  void __folio_undo_large_rmappable(struct folio *folio)
 	if (!list_empty(&folio->_deferred_list)) {
 		ds_queue->split_queue_len--;
 		list_del_init(&folio->_deferred_list);
+		__folio_clear_partially_mapped(folio);
 	}
 	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
 }
 
-void deferred_split_folio(struct folio *folio)
+/* partially_mapped=false won't clear PG_partially_mapped folio flag */
+void deferred_split_folio(struct folio *folio, bool partially_mapped)
 {
 	struct deferred_split *ds_queue = get_deferred_split_queue(folio);
 #ifdef CONFIG_MEMCG
@@ -3486,14 +3489,19 @@  void deferred_split_folio(struct folio *folio)
 	if (folio_test_swapcache(folio))
 		return;
 
-	if (!list_empty(&folio->_deferred_list))
-		return;
-
 	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+	if (partially_mapped) {
+		if (!folio_test_partially_mapped(folio)) {
+			__folio_set_partially_mapped(folio);
+			if (folio_test_pmd_mappable(folio))
+				count_vm_event(THP_DEFERRED_SPLIT_PAGE);
+			count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
+		}
+	} else {
+		/* partially mapped folios cannot become non-partially mapped */
+		VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
+	}
 	if (list_empty(&folio->_deferred_list)) {
-		if (folio_test_pmd_mappable(folio))
-			count_vm_event(THP_DEFERRED_SPLIT_PAGE);
-		count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
 		list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
 		ds_queue->split_queue_len++;
 #ifdef CONFIG_MEMCG
@@ -3542,6 +3550,7 @@  static unsigned long deferred_split_scan(struct shrinker *shrink,
 		} else {
 			/* We lost race with folio_put() */
 			list_del_init(&folio->_deferred_list);
+			__folio_clear_partially_mapped(folio);
 			ds_queue->split_queue_len--;
 		}
 		if (!--sc->nr_to_scan)
diff --git a/mm/internal.h b/mm/internal.h
index 52f7fc4e8ac3..27cbb5365841 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -662,8 +662,10 @@  static inline void prep_compound_head(struct page *page, unsigned int order)
 	atomic_set(&folio->_entire_mapcount, -1);
 	atomic_set(&folio->_nr_pages_mapped, 0);
 	atomic_set(&folio->_pincount, 0);
-	if (order > 1)
+	if (order > 1) {
 		INIT_LIST_HEAD(&folio->_deferred_list);
+		__folio_clear_partially_mapped(folio);
+	}
 }
 
 static inline void prep_compound_tail(struct page *head, int tail_idx)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e1ffd2950393..0fd95daecf9a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4669,7 +4669,8 @@  static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
 	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
 	VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
 			!folio_test_hugetlb(folio) &&
-			!list_empty(&folio->_deferred_list), folio);
+			!list_empty(&folio->_deferred_list) &&
+			folio_test_partially_mapped(folio), folio);
 
 	/*
 	 * Nobody should be changing or seriously looking at
diff --git a/mm/migrate.c b/mm/migrate.c
index 2d2e65d69427..ef4a732f22b1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1735,7 +1735,8 @@  static int migrate_pages_batch(struct list_head *from,
 			 * use _deferred_list.
 			 */
 			if (nr_pages > 2 &&
-			   !list_empty(&folio->_deferred_list)) {
+			   !list_empty(&folio->_deferred_list) &&
+			   folio_test_partially_mapped(folio)) {
 				if (!try_split_folio(folio, split_folios, mode)) {
 					nr_failed++;
 					stats->nr_thp_failed += is_thp;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 408ef3d25cf5..a145c550dd2a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -957,8 +957,9 @@  static int free_tail_page_prepare(struct page *head_page, struct page *page)
 		break;
 	case 2:
 		/* the second tail page: deferred_list overlaps ->mapping */
-		if (unlikely(!list_empty(&folio->_deferred_list))) {
-			bad_page(page, "on deferred list");
+		if (unlikely(!list_empty(&folio->_deferred_list) &&
+		    folio_test_partially_mapped(folio))) {
+			bad_page(page, "partially mapped folio on deferred list");
 			goto out;
 		}
 		break;
diff --git a/mm/rmap.c b/mm/rmap.c
index a6b9cd0b2b18..4c330635aa4e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1578,8 +1578,9 @@  static __always_inline void __folio_remove_rmap(struct folio *folio,
 	 * Check partially_mapped first to ensure it is a large folio.
 	 */
 	if (partially_mapped && folio_test_anon(folio) &&
-	    list_empty(&folio->_deferred_list))
-		deferred_split_folio(folio);
+	    !folio_test_partially_mapped(folio))
+		deferred_split_folio(folio, true);
+
 	__folio_mod_stat(folio, -nr, -nr_pmdmapped);
 
 	/*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 25e43bb3b574..25f4e8403f41 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1233,7 +1233,8 @@  static unsigned int shrink_folio_list(struct list_head *folio_list,
 					 * Split partially mapped folios right away.
 					 * We can free the unmapped pages without IO.
 					 */
-					if (data_race(!list_empty(&folio->_deferred_list)) &&
+					if (data_race(!list_empty(&folio->_deferred_list) &&
+					    folio_test_partially_mapped(folio)) &&
 					    split_folio_to_list(folio, folio_list))
 						goto activate_locked;
 				}