diff mbox series

[v5,6/9] mm: swap: allow cache reclaim to skip slot cache

Message ID 20240730-swap-allocator-v5-6-cb9c148b9297@kernel.org (mailing list archive)
State New
Headers show
Series mm: swap: mTHP swap allocator base on swap cluster order | expand

Commit Message

Chris Li July 31, 2024, 6:49 a.m. UTC
From: Kairui Song <kasong@tencent.com>

Currently we free the reclaimed slots through slot cache even
if the slot is required to be empty immediately. As a result
the reclaim caller will see the slot still occupied even after a
successful reclaim, and need to keep reclaiming until slot cache
get flushed. This caused ineffective or over reclaim when SWAP is
under stress.

So introduce a new flag allowing the slot to be emptied bypassing
the slot cache.

Signed-off-by: Kairui Song <kasong@tencent.com>
---
 mm/swapfile.c | 152 +++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 109 insertions(+), 43 deletions(-)

Comments

Barry Song Aug. 3, 2024, 10:38 a.m. UTC | #1
On Wed, Jul 31, 2024 at 2:49 PM <chrisl@kernel.org> wrote:
>
> From: Kairui Song <kasong@tencent.com>
>
> Currently we free the reclaimed slots through slot cache even
> if the slot is required to be empty immediately. As a result
> the reclaim caller will see the slot still occupied even after a
> successful reclaim, and need to keep reclaiming until slot cache
> get flushed. This caused ineffective or over reclaim when SWAP is
> under stress.
>
> So introduce a new flag allowing the slot to be emptied bypassing
> the slot cache.
>
> Signed-off-by: Kairui Song <kasong@tencent.com>
> ---
>  mm/swapfile.c | 152 +++++++++++++++++++++++++++++++++++++++++-----------------
>  1 file changed, 109 insertions(+), 43 deletions(-)
>
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 9b63b2262cc2..4c0fc0409d3c 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -53,8 +53,15 @@
>  static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
>                                  unsigned char);
>  static void free_swap_count_continuations(struct swap_info_struct *);
> +static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry,
> +                                 unsigned int nr_pages);
>  static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
>                              unsigned int nr_entries);
> +static bool folio_swapcache_freeable(struct folio *folio);
> +static struct swap_cluster_info *lock_cluster_or_swap_info(
> +               struct swap_info_struct *si, unsigned long offset);
> +static void unlock_cluster_or_swap_info(struct swap_info_struct *si,
> +                                       struct swap_cluster_info *ci);
>
>  static DEFINE_SPINLOCK(swap_lock);
>  static unsigned int nr_swapfiles;
> @@ -129,8 +136,25 @@ static inline unsigned char swap_count(unsigned char ent)
>   * corresponding page
>   */
>  #define TTRS_UNMAPPED          0x2
> -/* Reclaim the swap entry if swap is getting full*/
> +/* Reclaim the swap entry if swap is getting full */
>  #define TTRS_FULL              0x4
> +/* Reclaim directly, bypass the slot cache and don't touch device lock */
> +#define TTRS_DIRECT            0x8
> +
> +static bool swap_is_has_cache(struct swap_info_struct *si,
> +                             unsigned long offset, int nr_pages)
> +{
> +       unsigned char *map = si->swap_map + offset;
> +       unsigned char *map_end = map + nr_pages;
> +
> +       do {
> +               VM_BUG_ON(!(*map & SWAP_HAS_CACHE));
> +               if (*map != SWAP_HAS_CACHE)
> +                       return false;
> +       } while (++map < map_end);
> +
> +       return true;
> +}
>
>  /*
>   * returns number of pages in the folio that backs the swap entry. If positive,
> @@ -141,12 +165,22 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
>                                  unsigned long offset, unsigned long flags)
>  {
>         swp_entry_t entry = swp_entry(si->type, offset);
> +       struct address_space *address_space = swap_address_space(entry);
> +       struct swap_cluster_info *ci;
>         struct folio *folio;
> -       int ret = 0;
> +       int ret, nr_pages;
> +       bool need_reclaim;
>
> -       folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
> +       folio = filemap_get_folio(address_space, swap_cache_index(entry));
>         if (IS_ERR(folio))
>                 return 0;
> +
> +       /* offset could point to the middle of a large folio */
> +       entry = folio->swap;
> +       offset = swp_offset(entry);
> +       nr_pages = folio_nr_pages(folio);
> +       ret = -nr_pages;
> +
>         /*
>          * When this function is called from scan_swap_map_slots() and it's
>          * called by vmscan.c at reclaiming folios. So we hold a folio lock
> @@ -154,14 +188,50 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
>          * case and you should use folio_free_swap() with explicit folio_lock()
>          * in usual operations.
>          */
> -       if (folio_trylock(folio)) {
> -               if ((flags & TTRS_ANYWAY) ||
> -                   ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
> -                   ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)))
> -                       ret = folio_free_swap(folio);
> -               folio_unlock(folio);
> +       if (!folio_trylock(folio))
> +               goto out;
> +
> +       need_reclaim = ((flags & TTRS_ANYWAY) ||
> +                       ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
> +                       ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)));
> +       if (!need_reclaim || !folio_swapcache_freeable(folio))
> +               goto out_unlock;
> +
> +       /*
> +        * It's safe to delete the folio from swap cache only if the folio's
> +        * swap_map is HAS_CACHE only, which means the slots have no page table
> +        * reference or pending writeback, and can't be allocated to others.
> +        */
> +       ci = lock_cluster_or_swap_info(si, offset);
> +       need_reclaim = swap_is_has_cache(si, offset, nr_pages);
> +       unlock_cluster_or_swap_info(si, ci);
> +       if (!need_reclaim)
> +               goto out_unlock;
> +
> +       if (!(flags & TTRS_DIRECT)) {
> +               /* Free through slot cache */
> +               delete_from_swap_cache(folio);
> +               folio_set_dirty(folio);
> +               ret = nr_pages;
> +               goto out_unlock;
>         }
> -       ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio);
> +
> +       xa_lock_irq(&address_space->i_pages);
> +       __delete_from_swap_cache(folio, entry, NULL);
> +       xa_unlock_irq(&address_space->i_pages);
> +       folio_ref_sub(folio, nr_pages);
> +       folio_set_dirty(folio);
> +
> +       spin_lock(&si->lock);
> +       /* Only sinple page folio can be backed by zswap */
> +       if (!nr_pages)
> +               zswap_invalidate(entry);

I am trying to figure out if I am mad :-)  Does nr_pages == 0 means single
page folio?

> +       swap_entry_range_free(si, entry, nr_pages);
> +       spin_unlock(&si->lock);
> +       ret = nr_pages;
> +out_unlock:
> +       folio_unlock(folio);
> +out:
>         folio_put(folio);
>         return ret;
>  }
> @@ -903,7 +973,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
>         if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
>                 int swap_was_freed;
>                 spin_unlock(&si->lock);
> -               swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
> +               swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT);
>                 spin_lock(&si->lock);
>                 /* entry was freed successfully, try to use this again */
>                 if (swap_was_freed > 0)
> @@ -1340,9 +1410,6 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
>         unsigned long offset = swp_offset(entry);
>         struct swap_cluster_info *ci;
>         struct swap_info_struct *si;
> -       unsigned char *map;
> -       unsigned int i, free_entries = 0;
> -       unsigned char val;
>         int size = 1 << swap_entry_order(folio_order(folio));
>
>         si = _swap_info_get(entry);
> @@ -1350,23 +1417,14 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
>                 return;
>
>         ci = lock_cluster_or_swap_info(si, offset);
> -       if (size > 1) {
> -               map = si->swap_map + offset;
> -               for (i = 0; i < size; i++) {
> -                       val = map[i];
> -                       VM_BUG_ON(!(val & SWAP_HAS_CACHE));
> -                       if (val == SWAP_HAS_CACHE)
> -                               free_entries++;
> -               }
> -               if (free_entries == size) {
> -                       unlock_cluster_or_swap_info(si, ci);
> -                       spin_lock(&si->lock);
> -                       swap_entry_range_free(si, entry, size);
> -                       spin_unlock(&si->lock);
> -                       return;
> -               }
> +       if (size > 1 && swap_is_has_cache(si, offset, size)) {
> +               unlock_cluster_or_swap_info(si, ci);
> +               spin_lock(&si->lock);
> +               swap_entry_range_free(si, entry, size);
> +               spin_unlock(&si->lock);
> +               return;
>         }
> -       for (i = 0; i < size; i++, entry.val++) {
> +       for (int i = 0; i < size; i++, entry.val++) {
>                 if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
>                         unlock_cluster_or_swap_info(si, ci);
>                         free_swap_slot(entry);
> @@ -1526,16 +1584,7 @@ static bool folio_swapped(struct folio *folio)
>         return swap_page_trans_huge_swapped(si, entry, folio_order(folio));
>  }
>
> -/**
> - * folio_free_swap() - Free the swap space used for this folio.
> - * @folio: The folio to remove.
> - *
> - * If swap is getting full, or if there are no more mappings of this folio,
> - * then call folio_free_swap to free its swap space.
> - *
> - * Return: true if we were able to release the swap space.
> - */
> -bool folio_free_swap(struct folio *folio)
> +static bool folio_swapcache_freeable(struct folio *folio)
>  {
>         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
>
> @@ -1543,8 +1592,6 @@ bool folio_free_swap(struct folio *folio)
>                 return false;
>         if (folio_test_writeback(folio))
>                 return false;
> -       if (folio_swapped(folio))
> -               return false;
>
>         /*
>          * Once hibernation has begun to create its image of memory,
> @@ -1564,6 +1611,25 @@ bool folio_free_swap(struct folio *folio)
>         if (pm_suspended_storage())
>                 return false;
>
> +       return true;
> +}
> +
> +/**
> + * folio_free_swap() - Free the swap space used for this folio.
> + * @folio: The folio to remove.
> + *
> + * If swap is getting full, or if there are no more mappings of this folio,
> + * then call folio_free_swap to free its swap space.
> + *
> + * Return: true if we were able to release the swap space.
> + */
> +bool folio_free_swap(struct folio *folio)
> +{
> +       if (!folio_swapcache_freeable(folio))
> +               return false;
> +       if (folio_swapped(folio))
> +               return false;
> +
>         delete_from_swap_cache(folio);
>         folio_set_dirty(folio);
>         return true;
> @@ -1640,7 +1706,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
>                          * to the next boundary.
>                          */
>                         nr = __try_to_reclaim_swap(si, offset,
> -                                             TTRS_UNMAPPED | TTRS_FULL);
> +                                                  TTRS_UNMAPPED | TTRS_FULL);
>                         if (nr == 0)
>                                 nr = 1;
>                         else if (nr < 0)
>
> --
> 2.46.0.rc1.232.g9752f9e123-goog
>
Kairui Song Aug. 3, 2024, 12:18 p.m. UTC | #2
On Sat, Aug 3, 2024 at 6:39 PM Barry Song <baohua@kernel.org> wrote:
>
> On Wed, Jul 31, 2024 at 2:49 PM <chrisl@kernel.org> wrote:
> >
> > From: Kairui Song <kasong@tencent.com>
> >
> > Currently we free the reclaimed slots through slot cache even
> > if the slot is required to be empty immediately. As a result
> > the reclaim caller will see the slot still occupied even after a
> > successful reclaim, and need to keep reclaiming until slot cache
> > get flushed. This caused ineffective or over reclaim when SWAP is
> > under stress.
> >
> > So introduce a new flag allowing the slot to be emptied bypassing
> > the slot cache.
> >
> > Signed-off-by: Kairui Song <kasong@tencent.com>
> > ---
> >  mm/swapfile.c | 152 +++++++++++++++++++++++++++++++++++++++++-----------------
> >  1 file changed, 109 insertions(+), 43 deletions(-)
> >
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index 9b63b2262cc2..4c0fc0409d3c 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -53,8 +53,15 @@
> >  static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
> >                                  unsigned char);
> >  static void free_swap_count_continuations(struct swap_info_struct *);
> > +static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry,
> > +                                 unsigned int nr_pages);
> >  static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
> >                              unsigned int nr_entries);
> > +static bool folio_swapcache_freeable(struct folio *folio);
> > +static struct swap_cluster_info *lock_cluster_or_swap_info(
> > +               struct swap_info_struct *si, unsigned long offset);
> > +static void unlock_cluster_or_swap_info(struct swap_info_struct *si,
> > +                                       struct swap_cluster_info *ci);
> >
> >  static DEFINE_SPINLOCK(swap_lock);
> >  static unsigned int nr_swapfiles;
> > @@ -129,8 +136,25 @@ static inline unsigned char swap_count(unsigned char ent)
> >   * corresponding page
> >   */
> >  #define TTRS_UNMAPPED          0x2
> > -/* Reclaim the swap entry if swap is getting full*/
> > +/* Reclaim the swap entry if swap is getting full */
> >  #define TTRS_FULL              0x4
> > +/* Reclaim directly, bypass the slot cache and don't touch device lock */
> > +#define TTRS_DIRECT            0x8
> > +
> > +static bool swap_is_has_cache(struct swap_info_struct *si,
> > +                             unsigned long offset, int nr_pages)
> > +{
> > +       unsigned char *map = si->swap_map + offset;
> > +       unsigned char *map_end = map + nr_pages;
> > +
> > +       do {
> > +               VM_BUG_ON(!(*map & SWAP_HAS_CACHE));
> > +               if (*map != SWAP_HAS_CACHE)
> > +                       return false;
> > +       } while (++map < map_end);
> > +
> > +       return true;
> > +}
> >
> >  /*
> >   * returns number of pages in the folio that backs the swap entry. If positive,
> > @@ -141,12 +165,22 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
> >                                  unsigned long offset, unsigned long flags)
> >  {
> >         swp_entry_t entry = swp_entry(si->type, offset);
> > +       struct address_space *address_space = swap_address_space(entry);
> > +       struct swap_cluster_info *ci;
> >         struct folio *folio;
> > -       int ret = 0;
> > +       int ret, nr_pages;
> > +       bool need_reclaim;
> >
> > -       folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
> > +       folio = filemap_get_folio(address_space, swap_cache_index(entry));
> >         if (IS_ERR(folio))
> >                 return 0;
> > +
> > +       /* offset could point to the middle of a large folio */
> > +       entry = folio->swap;
> > +       offset = swp_offset(entry);
> > +       nr_pages = folio_nr_pages(folio);
> > +       ret = -nr_pages;
> > +
> >         /*
> >          * When this function is called from scan_swap_map_slots() and it's
> >          * called by vmscan.c at reclaiming folios. So we hold a folio lock
> > @@ -154,14 +188,50 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
> >          * case and you should use folio_free_swap() with explicit folio_lock()
> >          * in usual operations.
> >          */
> > -       if (folio_trylock(folio)) {
> > -               if ((flags & TTRS_ANYWAY) ||
> > -                   ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
> > -                   ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)))
> > -                       ret = folio_free_swap(folio);
> > -               folio_unlock(folio);
> > +       if (!folio_trylock(folio))
> > +               goto out;
> > +
> > +       need_reclaim = ((flags & TTRS_ANYWAY) ||
> > +                       ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
> > +                       ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)));
> > +       if (!need_reclaim || !folio_swapcache_freeable(folio))
> > +               goto out_unlock;
> > +
> > +       /*
> > +        * It's safe to delete the folio from swap cache only if the folio's
> > +        * swap_map is HAS_CACHE only, which means the slots have no page table
> > +        * reference or pending writeback, and can't be allocated to others.
> > +        */
> > +       ci = lock_cluster_or_swap_info(si, offset);
> > +       need_reclaim = swap_is_has_cache(si, offset, nr_pages);
> > +       unlock_cluster_or_swap_info(si, ci);
> > +       if (!need_reclaim)
> > +               goto out_unlock;
> > +
> > +       if (!(flags & TTRS_DIRECT)) {
> > +               /* Free through slot cache */
> > +               delete_from_swap_cache(folio);
> > +               folio_set_dirty(folio);
> > +               ret = nr_pages;
> > +               goto out_unlock;
> >         }
> > -       ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio);
> > +
> > +       xa_lock_irq(&address_space->i_pages);
> > +       __delete_from_swap_cache(folio, entry, NULL);
> > +       xa_unlock_irq(&address_space->i_pages);
> > +       folio_ref_sub(folio, nr_pages);
> > +       folio_set_dirty(folio);
> > +
> > +       spin_lock(&si->lock);
> > +       /* Only sinple page folio can be backed by zswap */
> > +       if (!nr_pages)
> > +               zswap_invalidate(entry);
>
> I am trying to figure out if I am mad :-)  Does nr_pages == 0 means single
> page folio?
>

Hi Barry

I'm sorry, this should be nr_pages == 1, I messed up order and nr, as
zswap only works for single page folios.
Chris Li Aug. 4, 2024, 6:06 p.m. UTC | #3
On Sat, Aug 3, 2024 at 5:19 AM Kairui Song <ryncsn@gmail.com> wrote:
>
> On Sat, Aug 3, 2024 at 6:39 PM Barry Song <baohua@kernel.org> wrote:
> >
> > On Wed, Jul 31, 2024 at 2:49 PM <chrisl@kernel.org> wrote:
> > >
> > > From: Kairui Song <kasong@tencent.com>
> > >
> > > Currently we free the reclaimed slots through slot cache even
> > > if the slot is required to be empty immediately. As a result
> > > the reclaim caller will see the slot still occupied even after a
> > > successful reclaim, and need to keep reclaiming until slot cache
> > > get flushed. This caused ineffective or over reclaim when SWAP is
> > > under stress.
> > >
> > > So introduce a new flag allowing the slot to be emptied bypassing
> > > the slot cache.
> > >
> > > Signed-off-by: Kairui Song <kasong@tencent.com>
> > > ---
> > >  mm/swapfile.c | 152 +++++++++++++++++++++++++++++++++++++++++-----------------
> > >  1 file changed, 109 insertions(+), 43 deletions(-)
> > >
> > > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > > index 9b63b2262cc2..4c0fc0409d3c 100644
> > > --- a/mm/swapfile.c
> > > +++ b/mm/swapfile.c
> > > @@ -53,8 +53,15 @@
> > >  static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
> > >                                  unsigned char);
> > >  static void free_swap_count_continuations(struct swap_info_struct *);
> > > +static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry,
> > > +                                 unsigned int nr_pages);
> > >  static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
> > >                              unsigned int nr_entries);
> > > +static bool folio_swapcache_freeable(struct folio *folio);
> > > +static struct swap_cluster_info *lock_cluster_or_swap_info(
> > > +               struct swap_info_struct *si, unsigned long offset);
> > > +static void unlock_cluster_or_swap_info(struct swap_info_struct *si,
> > > +                                       struct swap_cluster_info *ci);
> > >
> > >  static DEFINE_SPINLOCK(swap_lock);
> > >  static unsigned int nr_swapfiles;
> > > @@ -129,8 +136,25 @@ static inline unsigned char swap_count(unsigned char ent)
> > >   * corresponding page
> > >   */
> > >  #define TTRS_UNMAPPED          0x2
> > > -/* Reclaim the swap entry if swap is getting full*/
> > > +/* Reclaim the swap entry if swap is getting full */
> > >  #define TTRS_FULL              0x4
> > > +/* Reclaim directly, bypass the slot cache and don't touch device lock */
> > > +#define TTRS_DIRECT            0x8
> > > +
> > > +static bool swap_is_has_cache(struct swap_info_struct *si,
> > > +                             unsigned long offset, int nr_pages)
> > > +{
> > > +       unsigned char *map = si->swap_map + offset;
> > > +       unsigned char *map_end = map + nr_pages;
> > > +
> > > +       do {
> > > +               VM_BUG_ON(!(*map & SWAP_HAS_CACHE));
> > > +               if (*map != SWAP_HAS_CACHE)
> > > +                       return false;
> > > +       } while (++map < map_end);
> > > +
> > > +       return true;
> > > +}
> > >
> > >  /*
> > >   * returns number of pages in the folio that backs the swap entry. If positive,
> > > @@ -141,12 +165,22 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
> > >                                  unsigned long offset, unsigned long flags)
> > >  {
> > >         swp_entry_t entry = swp_entry(si->type, offset);
> > > +       struct address_space *address_space = swap_address_space(entry);
> > > +       struct swap_cluster_info *ci;
> > >         struct folio *folio;
> > > -       int ret = 0;
> > > +       int ret, nr_pages;
> > > +       bool need_reclaim;
> > >
> > > -       folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
> > > +       folio = filemap_get_folio(address_space, swap_cache_index(entry));
> > >         if (IS_ERR(folio))
> > >                 return 0;
> > > +
> > > +       /* offset could point to the middle of a large folio */
> > > +       entry = folio->swap;
> > > +       offset = swp_offset(entry);
> > > +       nr_pages = folio_nr_pages(folio);
> > > +       ret = -nr_pages;
> > > +
> > >         /*
> > >          * When this function is called from scan_swap_map_slots() and it's
> > >          * called by vmscan.c at reclaiming folios. So we hold a folio lock
> > > @@ -154,14 +188,50 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
> > >          * case and you should use folio_free_swap() with explicit folio_lock()
> > >          * in usual operations.
> > >          */
> > > -       if (folio_trylock(folio)) {
> > > -               if ((flags & TTRS_ANYWAY) ||
> > > -                   ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
> > > -                   ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)))
> > > -                       ret = folio_free_swap(folio);
> > > -               folio_unlock(folio);
> > > +       if (!folio_trylock(folio))
> > > +               goto out;
> > > +
> > > +       need_reclaim = ((flags & TTRS_ANYWAY) ||
> > > +                       ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
> > > +                       ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)));
> > > +       if (!need_reclaim || !folio_swapcache_freeable(folio))
> > > +               goto out_unlock;
> > > +
> > > +       /*
> > > +        * It's safe to delete the folio from swap cache only if the folio's
> > > +        * swap_map is HAS_CACHE only, which means the slots have no page table
> > > +        * reference or pending writeback, and can't be allocated to others.
> > > +        */
> > > +       ci = lock_cluster_or_swap_info(si, offset);
> > > +       need_reclaim = swap_is_has_cache(si, offset, nr_pages);
> > > +       unlock_cluster_or_swap_info(si, ci);
> > > +       if (!need_reclaim)
> > > +               goto out_unlock;
> > > +
> > > +       if (!(flags & TTRS_DIRECT)) {
> > > +               /* Free through slot cache */
> > > +               delete_from_swap_cache(folio);
> > > +               folio_set_dirty(folio);
> > > +               ret = nr_pages;
> > > +               goto out_unlock;
> > >         }
> > > -       ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio);
> > > +
> > > +       xa_lock_irq(&address_space->i_pages);
> > > +       __delete_from_swap_cache(folio, entry, NULL);
> > > +       xa_unlock_irq(&address_space->i_pages);
> > > +       folio_ref_sub(folio, nr_pages);
> > > +       folio_set_dirty(folio);
> > > +
> > > +       spin_lock(&si->lock);
> > > +       /* Only sinple page folio can be backed by zswap */
> > > +       if (!nr_pages)
> > > +               zswap_invalidate(entry);
> >
> > I am trying to figure out if I am mad :-)  Does nr_pages == 0 means single
> > page folio?
> >
>
> Hi Barry
>
> I'm sorry, this should be nr_pages == 1, I messed up order and nr, as
> zswap only works for single page folios.
Ack. Should be  nr_pages == 1.

Barry, thanks for catching that.

Chris
Barry Song Aug. 5, 2024, 1:53 a.m. UTC | #4
On Mon, Aug 5, 2024 at 6:07 AM Chris Li <chrisl@kernel.org> wrote:

> > > > +
> > > > +       spin_lock(&si->lock);
> > > > +       /* Only sinple page folio can be backed by zswap */
> > > > +       if (!nr_pages)
> > > > +               zswap_invalidate(entry);
> > >
> > > I am trying to figure out if I am mad :-)  Does nr_pages == 0 means single
> > > page folio?
> > >
> >
> > Hi Barry
> >
> > I'm sorry, this should be nr_pages == 1, I messed up order and nr, as
> > zswap only works for single page folios.
> Ack. Should be  nr_pages == 1.
>

Yes. Andrew, can you please help squash the below fix,

small folios should have nr_pages == 1 but not nr_page == 0

diff --git a/mm/swapfile.c b/mm/swapfile.c
index ea023fc25d08..6de12d712c7e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -224,7 +224,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
 
 	spin_lock(&si->lock);
 	/* Only sinple page folio can be backed by zswap */
-	if (!nr_pages)
+	if (nr_pages == 1)
 		zswap_invalidate(entry);
 	swap_entry_range_free(si, entry, nr_pages);
 	spin_unlock(&si->lock);

> Barry, thanks for catching that.
>
> Chris

Thanks
Barry
diff mbox series

Patch

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 9b63b2262cc2..4c0fc0409d3c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -53,8 +53,15 @@ 
 static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
 				 unsigned char);
 static void free_swap_count_continuations(struct swap_info_struct *);
+static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry,
+				  unsigned int nr_pages);
 static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
 			     unsigned int nr_entries);
+static bool folio_swapcache_freeable(struct folio *folio);
+static struct swap_cluster_info *lock_cluster_or_swap_info(
+		struct swap_info_struct *si, unsigned long offset);
+static void unlock_cluster_or_swap_info(struct swap_info_struct *si,
+					struct swap_cluster_info *ci);
 
 static DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
@@ -129,8 +136,25 @@  static inline unsigned char swap_count(unsigned char ent)
  * corresponding page
  */
 #define TTRS_UNMAPPED		0x2
-/* Reclaim the swap entry if swap is getting full*/
+/* Reclaim the swap entry if swap is getting full */
 #define TTRS_FULL		0x4
+/* Reclaim directly, bypass the slot cache and don't touch device lock */
+#define TTRS_DIRECT		0x8
+
+static bool swap_is_has_cache(struct swap_info_struct *si,
+			      unsigned long offset, int nr_pages)
+{
+	unsigned char *map = si->swap_map + offset;
+	unsigned char *map_end = map + nr_pages;
+
+	do {
+		VM_BUG_ON(!(*map & SWAP_HAS_CACHE));
+		if (*map != SWAP_HAS_CACHE)
+			return false;
+	} while (++map < map_end);
+
+	return true;
+}
 
 /*
  * returns number of pages in the folio that backs the swap entry. If positive,
@@ -141,12 +165,22 @@  static int __try_to_reclaim_swap(struct swap_info_struct *si,
 				 unsigned long offset, unsigned long flags)
 {
 	swp_entry_t entry = swp_entry(si->type, offset);
+	struct address_space *address_space = swap_address_space(entry);
+	struct swap_cluster_info *ci;
 	struct folio *folio;
-	int ret = 0;
+	int ret, nr_pages;
+	bool need_reclaim;
 
-	folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
+	folio = filemap_get_folio(address_space, swap_cache_index(entry));
 	if (IS_ERR(folio))
 		return 0;
+
+	/* offset could point to the middle of a large folio */
+	entry = folio->swap;
+	offset = swp_offset(entry);
+	nr_pages = folio_nr_pages(folio);
+	ret = -nr_pages;
+
 	/*
 	 * When this function is called from scan_swap_map_slots() and it's
 	 * called by vmscan.c at reclaiming folios. So we hold a folio lock
@@ -154,14 +188,50 @@  static int __try_to_reclaim_swap(struct swap_info_struct *si,
 	 * case and you should use folio_free_swap() with explicit folio_lock()
 	 * in usual operations.
 	 */
-	if (folio_trylock(folio)) {
-		if ((flags & TTRS_ANYWAY) ||
-		    ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
-		    ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)))
-			ret = folio_free_swap(folio);
-		folio_unlock(folio);
+	if (!folio_trylock(folio))
+		goto out;
+
+	need_reclaim = ((flags & TTRS_ANYWAY) ||
+			((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
+			((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)));
+	if (!need_reclaim || !folio_swapcache_freeable(folio))
+		goto out_unlock;
+
+	/*
+	 * It's safe to delete the folio from swap cache only if the folio's
+	 * swap_map is HAS_CACHE only, which means the slots have no page table
+	 * reference or pending writeback, and can't be allocated to others.
+	 */
+	ci = lock_cluster_or_swap_info(si, offset);
+	need_reclaim = swap_is_has_cache(si, offset, nr_pages);
+	unlock_cluster_or_swap_info(si, ci);
+	if (!need_reclaim)
+		goto out_unlock;
+
+	if (!(flags & TTRS_DIRECT)) {
+		/* Free through slot cache */
+		delete_from_swap_cache(folio);
+		folio_set_dirty(folio);
+		ret = nr_pages;
+		goto out_unlock;
 	}
-	ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio);
+
+	xa_lock_irq(&address_space->i_pages);
+	__delete_from_swap_cache(folio, entry, NULL);
+	xa_unlock_irq(&address_space->i_pages);
+	folio_ref_sub(folio, nr_pages);
+	folio_set_dirty(folio);
+
+	spin_lock(&si->lock);
+	/* Only sinple page folio can be backed by zswap */
+	if (!nr_pages)
+		zswap_invalidate(entry);
+	swap_entry_range_free(si, entry, nr_pages);
+	spin_unlock(&si->lock);
+	ret = nr_pages;
+out_unlock:
+	folio_unlock(folio);
+out:
 	folio_put(folio);
 	return ret;
 }
@@ -903,7 +973,7 @@  static int scan_swap_map_slots(struct swap_info_struct *si,
 	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
 		int swap_was_freed;
 		spin_unlock(&si->lock);
-		swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
+		swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT);
 		spin_lock(&si->lock);
 		/* entry was freed successfully, try to use this again */
 		if (swap_was_freed > 0)
@@ -1340,9 +1410,6 @@  void put_swap_folio(struct folio *folio, swp_entry_t entry)
 	unsigned long offset = swp_offset(entry);
 	struct swap_cluster_info *ci;
 	struct swap_info_struct *si;
-	unsigned char *map;
-	unsigned int i, free_entries = 0;
-	unsigned char val;
 	int size = 1 << swap_entry_order(folio_order(folio));
 
 	si = _swap_info_get(entry);
@@ -1350,23 +1417,14 @@  void put_swap_folio(struct folio *folio, swp_entry_t entry)
 		return;
 
 	ci = lock_cluster_or_swap_info(si, offset);
-	if (size > 1) {
-		map = si->swap_map + offset;
-		for (i = 0; i < size; i++) {
-			val = map[i];
-			VM_BUG_ON(!(val & SWAP_HAS_CACHE));
-			if (val == SWAP_HAS_CACHE)
-				free_entries++;
-		}
-		if (free_entries == size) {
-			unlock_cluster_or_swap_info(si, ci);
-			spin_lock(&si->lock);
-			swap_entry_range_free(si, entry, size);
-			spin_unlock(&si->lock);
-			return;
-		}
+	if (size > 1 && swap_is_has_cache(si, offset, size)) {
+		unlock_cluster_or_swap_info(si, ci);
+		spin_lock(&si->lock);
+		swap_entry_range_free(si, entry, size);
+		spin_unlock(&si->lock);
+		return;
 	}
-	for (i = 0; i < size; i++, entry.val++) {
+	for (int i = 0; i < size; i++, entry.val++) {
 		if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
 			unlock_cluster_or_swap_info(si, ci);
 			free_swap_slot(entry);
@@ -1526,16 +1584,7 @@  static bool folio_swapped(struct folio *folio)
 	return swap_page_trans_huge_swapped(si, entry, folio_order(folio));
 }
 
-/**
- * folio_free_swap() - Free the swap space used for this folio.
- * @folio: The folio to remove.
- *
- * If swap is getting full, or if there are no more mappings of this folio,
- * then call folio_free_swap to free its swap space.
- *
- * Return: true if we were able to release the swap space.
- */
-bool folio_free_swap(struct folio *folio)
+static bool folio_swapcache_freeable(struct folio *folio)
 {
 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
 
@@ -1543,8 +1592,6 @@  bool folio_free_swap(struct folio *folio)
 		return false;
 	if (folio_test_writeback(folio))
 		return false;
-	if (folio_swapped(folio))
-		return false;
 
 	/*
 	 * Once hibernation has begun to create its image of memory,
@@ -1564,6 +1611,25 @@  bool folio_free_swap(struct folio *folio)
 	if (pm_suspended_storage())
 		return false;
 
+	return true;
+}
+
+/**
+ * folio_free_swap() - Free the swap space used for this folio.
+ * @folio: The folio to remove.
+ *
+ * If swap is getting full, or if there are no more mappings of this folio,
+ * then call folio_free_swap to free its swap space.
+ *
+ * Return: true if we were able to release the swap space.
+ */
+bool folio_free_swap(struct folio *folio)
+{
+	if (!folio_swapcache_freeable(folio))
+		return false;
+	if (folio_swapped(folio))
+		return false;
+
 	delete_from_swap_cache(folio);
 	folio_set_dirty(folio);
 	return true;
@@ -1640,7 +1706,7 @@  void free_swap_and_cache_nr(swp_entry_t entry, int nr)
 			 * to the next boundary.
 			 */
 			nr = __try_to_reclaim_swap(si, offset,
-					      TTRS_UNMAPPED | TTRS_FULL);
+						   TTRS_UNMAPPED | TTRS_FULL);
 			if (nr == 0)
 				nr = 1;
 			else if (nr < 0)