Message ID | 20240730-swap-allocator-v5-6-cb9c148b9297@kernel.org (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | mm: swap: mTHP swap allocator base on swap cluster order | expand |
On Wed, Jul 31, 2024 at 2:49 PM <chrisl@kernel.org> wrote: > > From: Kairui Song <kasong@tencent.com> > > Currently we free the reclaimed slots through slot cache even > if the slot is required to be empty immediately. As a result > the reclaim caller will see the slot still occupied even after a > successful reclaim, and need to keep reclaiming until slot cache > get flushed. This caused ineffective or over reclaim when SWAP is > under stress. > > So introduce a new flag allowing the slot to be emptied bypassing > the slot cache. > > Signed-off-by: Kairui Song <kasong@tencent.com> > --- > mm/swapfile.c | 152 +++++++++++++++++++++++++++++++++++++++++----------------- > 1 file changed, 109 insertions(+), 43 deletions(-) > > diff --git a/mm/swapfile.c b/mm/swapfile.c > index 9b63b2262cc2..4c0fc0409d3c 100644 > --- a/mm/swapfile.c > +++ b/mm/swapfile.c > @@ -53,8 +53,15 @@ > static bool swap_count_continued(struct swap_info_struct *, pgoff_t, > unsigned char); > static void free_swap_count_continuations(struct swap_info_struct *); > +static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, > + unsigned int nr_pages); > static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, > unsigned int nr_entries); > +static bool folio_swapcache_freeable(struct folio *folio); > +static struct swap_cluster_info *lock_cluster_or_swap_info( > + struct swap_info_struct *si, unsigned long offset); > +static void unlock_cluster_or_swap_info(struct swap_info_struct *si, > + struct swap_cluster_info *ci); > > static DEFINE_SPINLOCK(swap_lock); > static unsigned int nr_swapfiles; > @@ -129,8 +136,25 @@ static inline unsigned char swap_count(unsigned char ent) > * corresponding page > */ > #define TTRS_UNMAPPED 0x2 > -/* Reclaim the swap entry if swap is getting full*/ > +/* Reclaim the swap entry if swap is getting full */ > #define TTRS_FULL 0x4 > +/* Reclaim directly, bypass the slot cache and don't touch device lock */ > +#define TTRS_DIRECT 0x8 > + > +static bool swap_is_has_cache(struct swap_info_struct *si, > + unsigned long offset, int nr_pages) > +{ > + unsigned char *map = si->swap_map + offset; > + unsigned char *map_end = map + nr_pages; > + > + do { > + VM_BUG_ON(!(*map & SWAP_HAS_CACHE)); > + if (*map != SWAP_HAS_CACHE) > + return false; > + } while (++map < map_end); > + > + return true; > +} > > /* > * returns number of pages in the folio that backs the swap entry. If positive, > @@ -141,12 +165,22 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, > unsigned long offset, unsigned long flags) > { > swp_entry_t entry = swp_entry(si->type, offset); > + struct address_space *address_space = swap_address_space(entry); > + struct swap_cluster_info *ci; > struct folio *folio; > - int ret = 0; > + int ret, nr_pages; > + bool need_reclaim; > > - folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); > + folio = filemap_get_folio(address_space, swap_cache_index(entry)); > if (IS_ERR(folio)) > return 0; > + > + /* offset could point to the middle of a large folio */ > + entry = folio->swap; > + offset = swp_offset(entry); > + nr_pages = folio_nr_pages(folio); > + ret = -nr_pages; > + > /* > * When this function is called from scan_swap_map_slots() and it's > * called by vmscan.c at reclaiming folios. So we hold a folio lock > @@ -154,14 +188,50 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, > * case and you should use folio_free_swap() with explicit folio_lock() > * in usual operations. > */ > - if (folio_trylock(folio)) { > - if ((flags & TTRS_ANYWAY) || > - ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || > - ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))) > - ret = folio_free_swap(folio); > - folio_unlock(folio); > + if (!folio_trylock(folio)) > + goto out; > + > + need_reclaim = ((flags & TTRS_ANYWAY) || > + ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || > + ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))); > + if (!need_reclaim || !folio_swapcache_freeable(folio)) > + goto out_unlock; > + > + /* > + * It's safe to delete the folio from swap cache only if the folio's > + * swap_map is HAS_CACHE only, which means the slots have no page table > + * reference or pending writeback, and can't be allocated to others. > + */ > + ci = lock_cluster_or_swap_info(si, offset); > + need_reclaim = swap_is_has_cache(si, offset, nr_pages); > + unlock_cluster_or_swap_info(si, ci); > + if (!need_reclaim) > + goto out_unlock; > + > + if (!(flags & TTRS_DIRECT)) { > + /* Free through slot cache */ > + delete_from_swap_cache(folio); > + folio_set_dirty(folio); > + ret = nr_pages; > + goto out_unlock; > } > - ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio); > + > + xa_lock_irq(&address_space->i_pages); > + __delete_from_swap_cache(folio, entry, NULL); > + xa_unlock_irq(&address_space->i_pages); > + folio_ref_sub(folio, nr_pages); > + folio_set_dirty(folio); > + > + spin_lock(&si->lock); > + /* Only sinple page folio can be backed by zswap */ > + if (!nr_pages) > + zswap_invalidate(entry); I am trying to figure out if I am mad :-) Does nr_pages == 0 means single page folio? > + swap_entry_range_free(si, entry, nr_pages); > + spin_unlock(&si->lock); > + ret = nr_pages; > +out_unlock: > + folio_unlock(folio); > +out: > folio_put(folio); > return ret; > } > @@ -903,7 +973,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, > if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { > int swap_was_freed; > spin_unlock(&si->lock); > - swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); > + swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); > spin_lock(&si->lock); > /* entry was freed successfully, try to use this again */ > if (swap_was_freed > 0) > @@ -1340,9 +1410,6 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) > unsigned long offset = swp_offset(entry); > struct swap_cluster_info *ci; > struct swap_info_struct *si; > - unsigned char *map; > - unsigned int i, free_entries = 0; > - unsigned char val; > int size = 1 << swap_entry_order(folio_order(folio)); > > si = _swap_info_get(entry); > @@ -1350,23 +1417,14 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) > return; > > ci = lock_cluster_or_swap_info(si, offset); > - if (size > 1) { > - map = si->swap_map + offset; > - for (i = 0; i < size; i++) { > - val = map[i]; > - VM_BUG_ON(!(val & SWAP_HAS_CACHE)); > - if (val == SWAP_HAS_CACHE) > - free_entries++; > - } > - if (free_entries == size) { > - unlock_cluster_or_swap_info(si, ci); > - spin_lock(&si->lock); > - swap_entry_range_free(si, entry, size); > - spin_unlock(&si->lock); > - return; > - } > + if (size > 1 && swap_is_has_cache(si, offset, size)) { > + unlock_cluster_or_swap_info(si, ci); > + spin_lock(&si->lock); > + swap_entry_range_free(si, entry, size); > + spin_unlock(&si->lock); > + return; > } > - for (i = 0; i < size; i++, entry.val++) { > + for (int i = 0; i < size; i++, entry.val++) { > if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { > unlock_cluster_or_swap_info(si, ci); > free_swap_slot(entry); > @@ -1526,16 +1584,7 @@ static bool folio_swapped(struct folio *folio) > return swap_page_trans_huge_swapped(si, entry, folio_order(folio)); > } > > -/** > - * folio_free_swap() - Free the swap space used for this folio. > - * @folio: The folio to remove. > - * > - * If swap is getting full, or if there are no more mappings of this folio, > - * then call folio_free_swap to free its swap space. > - * > - * Return: true if we were able to release the swap space. > - */ > -bool folio_free_swap(struct folio *folio) > +static bool folio_swapcache_freeable(struct folio *folio) > { > VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); > > @@ -1543,8 +1592,6 @@ bool folio_free_swap(struct folio *folio) > return false; > if (folio_test_writeback(folio)) > return false; > - if (folio_swapped(folio)) > - return false; > > /* > * Once hibernation has begun to create its image of memory, > @@ -1564,6 +1611,25 @@ bool folio_free_swap(struct folio *folio) > if (pm_suspended_storage()) > return false; > > + return true; > +} > + > +/** > + * folio_free_swap() - Free the swap space used for this folio. > + * @folio: The folio to remove. > + * > + * If swap is getting full, or if there are no more mappings of this folio, > + * then call folio_free_swap to free its swap space. > + * > + * Return: true if we were able to release the swap space. > + */ > +bool folio_free_swap(struct folio *folio) > +{ > + if (!folio_swapcache_freeable(folio)) > + return false; > + if (folio_swapped(folio)) > + return false; > + > delete_from_swap_cache(folio); > folio_set_dirty(folio); > return true; > @@ -1640,7 +1706,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) > * to the next boundary. > */ > nr = __try_to_reclaim_swap(si, offset, > - TTRS_UNMAPPED | TTRS_FULL); > + TTRS_UNMAPPED | TTRS_FULL); > if (nr == 0) > nr = 1; > else if (nr < 0) > > -- > 2.46.0.rc1.232.g9752f9e123-goog >
On Sat, Aug 3, 2024 at 6:39 PM Barry Song <baohua@kernel.org> wrote: > > On Wed, Jul 31, 2024 at 2:49 PM <chrisl@kernel.org> wrote: > > > > From: Kairui Song <kasong@tencent.com> > > > > Currently we free the reclaimed slots through slot cache even > > if the slot is required to be empty immediately. As a result > > the reclaim caller will see the slot still occupied even after a > > successful reclaim, and need to keep reclaiming until slot cache > > get flushed. This caused ineffective or over reclaim when SWAP is > > under stress. > > > > So introduce a new flag allowing the slot to be emptied bypassing > > the slot cache. > > > > Signed-off-by: Kairui Song <kasong@tencent.com> > > --- > > mm/swapfile.c | 152 +++++++++++++++++++++++++++++++++++++++++----------------- > > 1 file changed, 109 insertions(+), 43 deletions(-) > > > > diff --git a/mm/swapfile.c b/mm/swapfile.c > > index 9b63b2262cc2..4c0fc0409d3c 100644 > > --- a/mm/swapfile.c > > +++ b/mm/swapfile.c > > @@ -53,8 +53,15 @@ > > static bool swap_count_continued(struct swap_info_struct *, pgoff_t, > > unsigned char); > > static void free_swap_count_continuations(struct swap_info_struct *); > > +static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, > > + unsigned int nr_pages); > > static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, > > unsigned int nr_entries); > > +static bool folio_swapcache_freeable(struct folio *folio); > > +static struct swap_cluster_info *lock_cluster_or_swap_info( > > + struct swap_info_struct *si, unsigned long offset); > > +static void unlock_cluster_or_swap_info(struct swap_info_struct *si, > > + struct swap_cluster_info *ci); > > > > static DEFINE_SPINLOCK(swap_lock); > > static unsigned int nr_swapfiles; > > @@ -129,8 +136,25 @@ static inline unsigned char swap_count(unsigned char ent) > > * corresponding page > > */ > > #define TTRS_UNMAPPED 0x2 > > -/* Reclaim the swap entry if swap is getting full*/ > > +/* Reclaim the swap entry if swap is getting full */ > > #define TTRS_FULL 0x4 > > +/* Reclaim directly, bypass the slot cache and don't touch device lock */ > > +#define TTRS_DIRECT 0x8 > > + > > +static bool swap_is_has_cache(struct swap_info_struct *si, > > + unsigned long offset, int nr_pages) > > +{ > > + unsigned char *map = si->swap_map + offset; > > + unsigned char *map_end = map + nr_pages; > > + > > + do { > > + VM_BUG_ON(!(*map & SWAP_HAS_CACHE)); > > + if (*map != SWAP_HAS_CACHE) > > + return false; > > + } while (++map < map_end); > > + > > + return true; > > +} > > > > /* > > * returns number of pages in the folio that backs the swap entry. If positive, > > @@ -141,12 +165,22 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, > > unsigned long offset, unsigned long flags) > > { > > swp_entry_t entry = swp_entry(si->type, offset); > > + struct address_space *address_space = swap_address_space(entry); > > + struct swap_cluster_info *ci; > > struct folio *folio; > > - int ret = 0; > > + int ret, nr_pages; > > + bool need_reclaim; > > > > - folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); > > + folio = filemap_get_folio(address_space, swap_cache_index(entry)); > > if (IS_ERR(folio)) > > return 0; > > + > > + /* offset could point to the middle of a large folio */ > > + entry = folio->swap; > > + offset = swp_offset(entry); > > + nr_pages = folio_nr_pages(folio); > > + ret = -nr_pages; > > + > > /* > > * When this function is called from scan_swap_map_slots() and it's > > * called by vmscan.c at reclaiming folios. So we hold a folio lock > > @@ -154,14 +188,50 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, > > * case and you should use folio_free_swap() with explicit folio_lock() > > * in usual operations. > > */ > > - if (folio_trylock(folio)) { > > - if ((flags & TTRS_ANYWAY) || > > - ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || > > - ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))) > > - ret = folio_free_swap(folio); > > - folio_unlock(folio); > > + if (!folio_trylock(folio)) > > + goto out; > > + > > + need_reclaim = ((flags & TTRS_ANYWAY) || > > + ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || > > + ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))); > > + if (!need_reclaim || !folio_swapcache_freeable(folio)) > > + goto out_unlock; > > + > > + /* > > + * It's safe to delete the folio from swap cache only if the folio's > > + * swap_map is HAS_CACHE only, which means the slots have no page table > > + * reference or pending writeback, and can't be allocated to others. > > + */ > > + ci = lock_cluster_or_swap_info(si, offset); > > + need_reclaim = swap_is_has_cache(si, offset, nr_pages); > > + unlock_cluster_or_swap_info(si, ci); > > + if (!need_reclaim) > > + goto out_unlock; > > + > > + if (!(flags & TTRS_DIRECT)) { > > + /* Free through slot cache */ > > + delete_from_swap_cache(folio); > > + folio_set_dirty(folio); > > + ret = nr_pages; > > + goto out_unlock; > > } > > - ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio); > > + > > + xa_lock_irq(&address_space->i_pages); > > + __delete_from_swap_cache(folio, entry, NULL); > > + xa_unlock_irq(&address_space->i_pages); > > + folio_ref_sub(folio, nr_pages); > > + folio_set_dirty(folio); > > + > > + spin_lock(&si->lock); > > + /* Only sinple page folio can be backed by zswap */ > > + if (!nr_pages) > > + zswap_invalidate(entry); > > I am trying to figure out if I am mad :-) Does nr_pages == 0 means single > page folio? > Hi Barry I'm sorry, this should be nr_pages == 1, I messed up order and nr, as zswap only works for single page folios.
On Sat, Aug 3, 2024 at 5:19 AM Kairui Song <ryncsn@gmail.com> wrote: > > On Sat, Aug 3, 2024 at 6:39 PM Barry Song <baohua@kernel.org> wrote: > > > > On Wed, Jul 31, 2024 at 2:49 PM <chrisl@kernel.org> wrote: > > > > > > From: Kairui Song <kasong@tencent.com> > > > > > > Currently we free the reclaimed slots through slot cache even > > > if the slot is required to be empty immediately. As a result > > > the reclaim caller will see the slot still occupied even after a > > > successful reclaim, and need to keep reclaiming until slot cache > > > get flushed. This caused ineffective or over reclaim when SWAP is > > > under stress. > > > > > > So introduce a new flag allowing the slot to be emptied bypassing > > > the slot cache. > > > > > > Signed-off-by: Kairui Song <kasong@tencent.com> > > > --- > > > mm/swapfile.c | 152 +++++++++++++++++++++++++++++++++++++++++----------------- > > > 1 file changed, 109 insertions(+), 43 deletions(-) > > > > > > diff --git a/mm/swapfile.c b/mm/swapfile.c > > > index 9b63b2262cc2..4c0fc0409d3c 100644 > > > --- a/mm/swapfile.c > > > +++ b/mm/swapfile.c > > > @@ -53,8 +53,15 @@ > > > static bool swap_count_continued(struct swap_info_struct *, pgoff_t, > > > unsigned char); > > > static void free_swap_count_continuations(struct swap_info_struct *); > > > +static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, > > > + unsigned int nr_pages); > > > static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, > > > unsigned int nr_entries); > > > +static bool folio_swapcache_freeable(struct folio *folio); > > > +static struct swap_cluster_info *lock_cluster_or_swap_info( > > > + struct swap_info_struct *si, unsigned long offset); > > > +static void unlock_cluster_or_swap_info(struct swap_info_struct *si, > > > + struct swap_cluster_info *ci); > > > > > > static DEFINE_SPINLOCK(swap_lock); > > > static unsigned int nr_swapfiles; > > > @@ -129,8 +136,25 @@ static inline unsigned char swap_count(unsigned char ent) > > > * corresponding page > > > */ > > > #define TTRS_UNMAPPED 0x2 > > > -/* Reclaim the swap entry if swap is getting full*/ > > > +/* Reclaim the swap entry if swap is getting full */ > > > #define TTRS_FULL 0x4 > > > +/* Reclaim directly, bypass the slot cache and don't touch device lock */ > > > +#define TTRS_DIRECT 0x8 > > > + > > > +static bool swap_is_has_cache(struct swap_info_struct *si, > > > + unsigned long offset, int nr_pages) > > > +{ > > > + unsigned char *map = si->swap_map + offset; > > > + unsigned char *map_end = map + nr_pages; > > > + > > > + do { > > > + VM_BUG_ON(!(*map & SWAP_HAS_CACHE)); > > > + if (*map != SWAP_HAS_CACHE) > > > + return false; > > > + } while (++map < map_end); > > > + > > > + return true; > > > +} > > > > > > /* > > > * returns number of pages in the folio that backs the swap entry. If positive, > > > @@ -141,12 +165,22 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, > > > unsigned long offset, unsigned long flags) > > > { > > > swp_entry_t entry = swp_entry(si->type, offset); > > > + struct address_space *address_space = swap_address_space(entry); > > > + struct swap_cluster_info *ci; > > > struct folio *folio; > > > - int ret = 0; > > > + int ret, nr_pages; > > > + bool need_reclaim; > > > > > > - folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); > > > + folio = filemap_get_folio(address_space, swap_cache_index(entry)); > > > if (IS_ERR(folio)) > > > return 0; > > > + > > > + /* offset could point to the middle of a large folio */ > > > + entry = folio->swap; > > > + offset = swp_offset(entry); > > > + nr_pages = folio_nr_pages(folio); > > > + ret = -nr_pages; > > > + > > > /* > > > * When this function is called from scan_swap_map_slots() and it's > > > * called by vmscan.c at reclaiming folios. So we hold a folio lock > > > @@ -154,14 +188,50 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, > > > * case and you should use folio_free_swap() with explicit folio_lock() > > > * in usual operations. > > > */ > > > - if (folio_trylock(folio)) { > > > - if ((flags & TTRS_ANYWAY) || > > > - ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || > > > - ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))) > > > - ret = folio_free_swap(folio); > > > - folio_unlock(folio); > > > + if (!folio_trylock(folio)) > > > + goto out; > > > + > > > + need_reclaim = ((flags & TTRS_ANYWAY) || > > > + ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || > > > + ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))); > > > + if (!need_reclaim || !folio_swapcache_freeable(folio)) > > > + goto out_unlock; > > > + > > > + /* > > > + * It's safe to delete the folio from swap cache only if the folio's > > > + * swap_map is HAS_CACHE only, which means the slots have no page table > > > + * reference or pending writeback, and can't be allocated to others. > > > + */ > > > + ci = lock_cluster_or_swap_info(si, offset); > > > + need_reclaim = swap_is_has_cache(si, offset, nr_pages); > > > + unlock_cluster_or_swap_info(si, ci); > > > + if (!need_reclaim) > > > + goto out_unlock; > > > + > > > + if (!(flags & TTRS_DIRECT)) { > > > + /* Free through slot cache */ > > > + delete_from_swap_cache(folio); > > > + folio_set_dirty(folio); > > > + ret = nr_pages; > > > + goto out_unlock; > > > } > > > - ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio); > > > + > > > + xa_lock_irq(&address_space->i_pages); > > > + __delete_from_swap_cache(folio, entry, NULL); > > > + xa_unlock_irq(&address_space->i_pages); > > > + folio_ref_sub(folio, nr_pages); > > > + folio_set_dirty(folio); > > > + > > > + spin_lock(&si->lock); > > > + /* Only sinple page folio can be backed by zswap */ > > > + if (!nr_pages) > > > + zswap_invalidate(entry); > > > > I am trying to figure out if I am mad :-) Does nr_pages == 0 means single > > page folio? > > > > Hi Barry > > I'm sorry, this should be nr_pages == 1, I messed up order and nr, as > zswap only works for single page folios. Ack. Should be nr_pages == 1. Barry, thanks for catching that. Chris
On Mon, Aug 5, 2024 at 6:07 AM Chris Li <chrisl@kernel.org> wrote: > > > > + > > > > + spin_lock(&si->lock); > > > > + /* Only sinple page folio can be backed by zswap */ > > > > + if (!nr_pages) > > > > + zswap_invalidate(entry); > > > > > > I am trying to figure out if I am mad :-) Does nr_pages == 0 means single > > > page folio? > > > > > > > Hi Barry > > > > I'm sorry, this should be nr_pages == 1, I messed up order and nr, as > > zswap only works for single page folios. > Ack. Should be nr_pages == 1. > Yes. Andrew, can you please help squash the below fix, small folios should have nr_pages == 1 but not nr_page == 0 diff --git a/mm/swapfile.c b/mm/swapfile.c index ea023fc25d08..6de12d712c7e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -224,7 +224,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, spin_lock(&si->lock); /* Only sinple page folio can be backed by zswap */ - if (!nr_pages) + if (nr_pages == 1) zswap_invalidate(entry); swap_entry_range_free(si, entry, nr_pages); spin_unlock(&si->lock); > Barry, thanks for catching that. > > Chris Thanks Barry
diff --git a/mm/swapfile.c b/mm/swapfile.c index 9b63b2262cc2..4c0fc0409d3c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -53,8 +53,15 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); +static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, + unsigned int nr_pages); static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, unsigned int nr_entries); +static bool folio_swapcache_freeable(struct folio *folio); +static struct swap_cluster_info *lock_cluster_or_swap_info( + struct swap_info_struct *si, unsigned long offset); +static void unlock_cluster_or_swap_info(struct swap_info_struct *si, + struct swap_cluster_info *ci); static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; @@ -129,8 +136,25 @@ static inline unsigned char swap_count(unsigned char ent) * corresponding page */ #define TTRS_UNMAPPED 0x2 -/* Reclaim the swap entry if swap is getting full*/ +/* Reclaim the swap entry if swap is getting full */ #define TTRS_FULL 0x4 +/* Reclaim directly, bypass the slot cache and don't touch device lock */ +#define TTRS_DIRECT 0x8 + +static bool swap_is_has_cache(struct swap_info_struct *si, + unsigned long offset, int nr_pages) +{ + unsigned char *map = si->swap_map + offset; + unsigned char *map_end = map + nr_pages; + + do { + VM_BUG_ON(!(*map & SWAP_HAS_CACHE)); + if (*map != SWAP_HAS_CACHE) + return false; + } while (++map < map_end); + + return true; +} /* * returns number of pages in the folio that backs the swap entry. If positive, @@ -141,12 +165,22 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset, unsigned long flags) { swp_entry_t entry = swp_entry(si->type, offset); + struct address_space *address_space = swap_address_space(entry); + struct swap_cluster_info *ci; struct folio *folio; - int ret = 0; + int ret, nr_pages; + bool need_reclaim; - folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); + folio = filemap_get_folio(address_space, swap_cache_index(entry)); if (IS_ERR(folio)) return 0; + + /* offset could point to the middle of a large folio */ + entry = folio->swap; + offset = swp_offset(entry); + nr_pages = folio_nr_pages(folio); + ret = -nr_pages; + /* * When this function is called from scan_swap_map_slots() and it's * called by vmscan.c at reclaiming folios. So we hold a folio lock @@ -154,14 +188,50 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, * case and you should use folio_free_swap() with explicit folio_lock() * in usual operations. */ - if (folio_trylock(folio)) { - if ((flags & TTRS_ANYWAY) || - ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || - ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))) - ret = folio_free_swap(folio); - folio_unlock(folio); + if (!folio_trylock(folio)) + goto out; + + need_reclaim = ((flags & TTRS_ANYWAY) || + ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || + ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))); + if (!need_reclaim || !folio_swapcache_freeable(folio)) + goto out_unlock; + + /* + * It's safe to delete the folio from swap cache only if the folio's + * swap_map is HAS_CACHE only, which means the slots have no page table + * reference or pending writeback, and can't be allocated to others. + */ + ci = lock_cluster_or_swap_info(si, offset); + need_reclaim = swap_is_has_cache(si, offset, nr_pages); + unlock_cluster_or_swap_info(si, ci); + if (!need_reclaim) + goto out_unlock; + + if (!(flags & TTRS_DIRECT)) { + /* Free through slot cache */ + delete_from_swap_cache(folio); + folio_set_dirty(folio); + ret = nr_pages; + goto out_unlock; } - ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio); + + xa_lock_irq(&address_space->i_pages); + __delete_from_swap_cache(folio, entry, NULL); + xa_unlock_irq(&address_space->i_pages); + folio_ref_sub(folio, nr_pages); + folio_set_dirty(folio); + + spin_lock(&si->lock); + /* Only sinple page folio can be backed by zswap */ + if (!nr_pages) + zswap_invalidate(entry); + swap_entry_range_free(si, entry, nr_pages); + spin_unlock(&si->lock); + ret = nr_pages; +out_unlock: + folio_unlock(folio); +out: folio_put(folio); return ret; } @@ -903,7 +973,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { int swap_was_freed; spin_unlock(&si->lock); - swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); + swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); spin_lock(&si->lock); /* entry was freed successfully, try to use this again */ if (swap_was_freed > 0) @@ -1340,9 +1410,6 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) unsigned long offset = swp_offset(entry); struct swap_cluster_info *ci; struct swap_info_struct *si; - unsigned char *map; - unsigned int i, free_entries = 0; - unsigned char val; int size = 1 << swap_entry_order(folio_order(folio)); si = _swap_info_get(entry); @@ -1350,23 +1417,14 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) return; ci = lock_cluster_or_swap_info(si, offset); - if (size > 1) { - map = si->swap_map + offset; - for (i = 0; i < size; i++) { - val = map[i]; - VM_BUG_ON(!(val & SWAP_HAS_CACHE)); - if (val == SWAP_HAS_CACHE) - free_entries++; - } - if (free_entries == size) { - unlock_cluster_or_swap_info(si, ci); - spin_lock(&si->lock); - swap_entry_range_free(si, entry, size); - spin_unlock(&si->lock); - return; - } + if (size > 1 && swap_is_has_cache(si, offset, size)) { + unlock_cluster_or_swap_info(si, ci); + spin_lock(&si->lock); + swap_entry_range_free(si, entry, size); + spin_unlock(&si->lock); + return; } - for (i = 0; i < size; i++, entry.val++) { + for (int i = 0; i < size; i++, entry.val++) { if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { unlock_cluster_or_swap_info(si, ci); free_swap_slot(entry); @@ -1526,16 +1584,7 @@ static bool folio_swapped(struct folio *folio) return swap_page_trans_huge_swapped(si, entry, folio_order(folio)); } -/** - * folio_free_swap() - Free the swap space used for this folio. - * @folio: The folio to remove. - * - * If swap is getting full, or if there are no more mappings of this folio, - * then call folio_free_swap to free its swap space. - * - * Return: true if we were able to release the swap space. - */ -bool folio_free_swap(struct folio *folio) +static bool folio_swapcache_freeable(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); @@ -1543,8 +1592,6 @@ bool folio_free_swap(struct folio *folio) return false; if (folio_test_writeback(folio)) return false; - if (folio_swapped(folio)) - return false; /* * Once hibernation has begun to create its image of memory, @@ -1564,6 +1611,25 @@ bool folio_free_swap(struct folio *folio) if (pm_suspended_storage()) return false; + return true; +} + +/** + * folio_free_swap() - Free the swap space used for this folio. + * @folio: The folio to remove. + * + * If swap is getting full, or if there are no more mappings of this folio, + * then call folio_free_swap to free its swap space. + * + * Return: true if we were able to release the swap space. + */ +bool folio_free_swap(struct folio *folio) +{ + if (!folio_swapcache_freeable(folio)) + return false; + if (folio_swapped(folio)) + return false; + delete_from_swap_cache(folio); folio_set_dirty(folio); return true; @@ -1640,7 +1706,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) * to the next boundary. */ nr = __try_to_reclaim_swap(si, offset, - TTRS_UNMAPPED | TTRS_FULL); + TTRS_UNMAPPED | TTRS_FULL); if (nr == 0) nr = 1; else if (nr < 0)