
[v5,5/9] mm: swap: skip slot cache on freeing for mTHP

Message ID: 20240730-swap-allocator-v5-5-cb9c148b9297@kernel.org (mailing list archive)
State: New
Series: mm: swap: mTHP swap allocator base on swap cluster order

Commit Message

Chris Li July 31, 2024, 6:49 a.m. UTC
From: Kairui Song <kasong@tencent.com>

Currently, when we are freeing mTHP folios from the swap cache, we free
them one by one and put each entry into the swap slot cache. The slot
cache is designed to reduce overhead by batching the freeing, but mTHP
swap entries are already contiguous, so they can be batch freed without
it; going through the slot cache saves little overhead, or can even add
overhead for larger mTHPs.

What's more, mTHP entries can stay in the swap cache for a while.
Contiguous swap entries are a rather rare resource, so releasing them
directly helps improve the mTHP allocation success rate when under
pressure.

Signed-off-by: Kairui Song <kasong@tencent.com>
---
 mm/swapfile.c | 59 ++++++++++++++++++++++++++---------------------------------
 1 file changed, 26 insertions(+), 33 deletions(-)
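
To illustrate the idea, here is a rough before/after sketch (simplified from
the description above, not the exact code in the diff below; the fallback for
entries that are still mapped is omitted, and free_swap_slot() stands in for
the existing slot-cache path):

        /* Before: each of the folio's nr entries is handed to the slot cache */
        for (i = 0; i < nr; i++)
                free_swap_slot(swp_entry(type, offset + i));    /* real freeing deferred, batched later */

        /* After: the nr entries are contiguous, so drop them directly as one range */
        spin_lock(&si->lock);
        swap_entry_range_free(si, entry, nr);   /* one cluster lock, one uncharge, one range free */
        spin_unlock(&si->lock);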

Comments

Barry Song Aug. 3, 2024, 9:11 a.m. UTC | #1
On Wed, Jul 31, 2024 at 6:49 PM <chrisl@kernel.org> wrote:
>
> From: Kairui Song <kasong@tencent.com>
>
> Currently, when we are freeing mTHP folios from the swap cache, we free
> them one by one and put each entry into the swap slot cache. The slot
> cache is designed to reduce overhead by batching the freeing, but mTHP
> swap entries are already contiguous, so they can be batch freed without
> it; going through the slot cache saves little overhead, or can even add
> overhead for larger mTHPs.
>
> What's more, mTHP entries can stay in the swap cache for a while.
> Contiguous swap entries are a rather rare resource, so releasing them
> directly helps improve the mTHP allocation success rate when under
> pressure.
>
> Signed-off-by: Kairui Song <kasong@tencent.com>

Acked-by: Barry Song <baohua@kernel.org>

I believe this is the right direction to take. Currently, entries are released
one by one, even when they are contiguous in the swap file (those nr_pages
entries are definitely in the same cluster and the same si), leading to numerous
lock and unlock operations. This approach provides batched support.
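
For illustration, the per-entry path looks roughly like this (a simplified
sketch, not the exact code), paying one cluster lock round-trip per entry even
though all nr_pages entries live in the same cluster:

        for (i = 0; i < nr_pages; i++) {
                ci = lock_cluster_or_swap_info(si, offset + i); /* same cluster every time */
                __swap_entry_free_locked(si, offset + i, 1);
                unlock_cluster_or_swap_info(si, ci);            /* nr_pages lock/unlock pairs */
        }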

free_swap_and_cache_nr() has the same issue, so I drafted a patch based on
your code. I wonder if you can also help test and review before I send it
officially:

From 4bed5c08bc0f7769ee2849812acdad70c4e32ead Mon Sep 17 00:00:00 2001
From: Barry Song <v-songbaohua@oppo.com>
Date: Sat, 3 Aug 2024 20:21:14 +1200
Subject: [PATCH RFC] mm: attempt to batch free swap entries for zap_pte_range()

Zhiguo reported that swap release could be a serious bottleneck
during process exits[1]. With mTHP, we have the opportunity to
batch free swaps.
Thanks to the work of Chris and Kairui[2], I was able to achieve
this optimization with minimal code changes by building on their
efforts.

[1] https://lore.kernel.org/linux-mm/20240731133318.527-1-justinjiang@vivo.com/
[2] https://lore.kernel.org/linux-mm/20240730-swap-allocator-v5-0-cb9c148b9297@kernel.org/

Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
 mm/swapfile.c | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index ea023fc25d08..9def6dba8d26 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -156,6 +156,25 @@ static bool swap_is_has_cache(struct swap_info_struct *si,
 	return true;
 }
 
+static bool swap_is_last_map(struct swap_info_struct *si,
+			      unsigned long offset, int nr_pages,
+			      bool *any_only_cache)
+{
+	unsigned char *map = si->swap_map + offset;
+	unsigned char *map_end = map + nr_pages;
+	bool cached = false;
+
+	do {
+		if ((*map & ~SWAP_HAS_CACHE) != 1)
+			return false;
+		if (*map & SWAP_HAS_CACHE)
+			cached = true;
+	} while (++map < map_end);
+
+	*any_only_cache = cached;
+	return true;
+}
+
 /*
  * returns number of pages in the folio that backs the swap entry. If positive,
  * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
@@ -1808,6 +1827,29 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
 	if (WARN_ON(end_offset > si->max))
 		goto out;
 
+	if (nr > 1) {
+		struct swap_cluster_info *ci;
+		bool batched_free;
+		int i;
+
+		ci = lock_cluster_or_swap_info(si, start_offset);
+		if ((batched_free = swap_is_last_map(si, start_offset, nr, &any_only_cache))) {
+			for (i = 0; i < nr; i++)
+				WRITE_ONCE(si->swap_map[start_offset + i], SWAP_HAS_CACHE);
+		}
+		unlock_cluster_or_swap_info(si, ci);
+
+		if (batched_free) {
+			spin_lock(&si->lock);
+			pr_err("%s offset:%lx nr:%lx\n", __func__,start_offset, nr);
+			swap_entry_range_free(si, entry, nr);
+			spin_unlock(&si->lock);
+			if (any_only_cache)
+				goto reclaim;
+			goto out;
+		}
+	}
+
 	/*
 	 * First free all entries in the range.
 	 */
@@ -1828,6 +1870,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
 	if (!any_only_cache)
 		goto out;
 
+reclaim:
 	/*
 	 * Now go back over the range trying to reclaim the swap cache. This is
 	 * more efficient for large folios because we will only try to reclaim
Barry Song Aug. 3, 2024, 10:57 a.m. UTC | #2
On Sat, Aug 3, 2024 at 9:11 PM Barry Song <21cnbao@gmail.com> wrote:
>
> On Wed, Jul 31, 2024 at 6:49 PM <chrisl@kernel.org> wrote:
> >
> > From: Kairui Song <kasong@tencent.com>
> >
> > Currently, when we are freeing mTHP folios from the swap cache, we free
> > them one by one and put each entry into the swap slot cache. The slot
> > cache is designed to reduce overhead by batching the freeing, but mTHP
> > swap entries are already contiguous, so they can be batch freed without
> > it; going through the slot cache saves little overhead, or can even add
> > overhead for larger mTHPs.
> >
> > What's more, mTHP entries can stay in the swap cache for a while.
> > Contiguous swap entries are a rather rare resource, so releasing them
> > directly helps improve the mTHP allocation success rate when under
> > pressure.
> >
> > Signed-off-by: Kairui Song <kasong@tencent.com>
>
> Acked-by: Barry Song <baohua@kernel.org>
>
> I believe this is the right direction to take. Currently, entries are released
> one by one, even when they are contiguous in the swap file (those nr_pages
> entries are definitely in the same cluster and the same si), leading to numerous
> lock and unlock operations. This approach provides batched support.
>
> free_swap_and_cache_nr() has the same issue, so I drafted a patch based on
> your code. I wonder if you can also help test and review before I send it
> officially:
>
> From 4bed5c08bc0f7769ee2849812acdad70c4e32ead Mon Sep 17 00:00:00 2001
> From: Barry Song <v-songbaohua@oppo.com>
> Date: Sat, 3 Aug 2024 20:21:14 +1200
> Subject: [PATCH RFC] mm: attempt to batch free swap entries for zap_pte_range()
>
> Zhiguo reported that swap release could be a serious bottleneck
> during process exits[1]. With mTHP, we have the opportunity to
> batch free swaps.
> Thanks to the work of Chris and Kairui[2], I was able to achieve
> this optimization with minimal code changes by building on their
> efforts.
>
> [1] https://lore.kernel.org/linux-mm/20240731133318.527-1-justinjiang@vivo.com/
> [2] https://lore.kernel.org/linux-mm/20240730-swap-allocator-v5-0-cb9c148b9297@kernel.org/
>
> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> ---
>  mm/swapfile.c | 43 +++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 43 insertions(+)
>
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index ea023fc25d08..9def6dba8d26 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -156,6 +156,25 @@ static bool swap_is_has_cache(struct swap_info_struct *si,
>         return true;
>  }
>
> +static bool swap_is_last_map(struct swap_info_struct *si,
> +                             unsigned long offset, int nr_pages,
> +                             bool *any_only_cache)
> +{
> +       unsigned char *map = si->swap_map + offset;
> +       unsigned char *map_end = map + nr_pages;
> +       bool cached = false;
> +
> +       do {
> +               if ((*map & ~SWAP_HAS_CACHE) != 1)
> +                       return false;
> +               if (*map & SWAP_HAS_CACHE)
> +                       cached = true;
> +       } while (++map < map_end);
> +
> +       *any_only_cache = cached;
> +       return true;
> +}
> +
>  /*
>   * returns number of pages in the folio that backs the swap entry. If positive,
>   * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
> @@ -1808,6 +1827,29 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
>         if (WARN_ON(end_offset > si->max))
>                 goto out;
>
> +       if (nr > 1) {
> +               struct swap_cluster_info *ci;
> +               bool batched_free;
> +               int i;
> +
> +               ci = lock_cluster_or_swap_info(si, start_offset);
> +               if ((batched_free = swap_is_last_map(si, start_offset, nr, &any_only_cache))) {
> +                       for (i = 0; i < nr; i++)
> +                               WRITE_ONCE(si->swap_map[start_offset + i], SWAP_HAS_CACHE);
> +               }
> +               unlock_cluster_or_swap_info(si, ci);
> +
> +               if (batched_free) {
> +                       spin_lock(&si->lock);
> +                       pr_err("%s offset:%lx nr:%lx\n", __func__,start_offset, nr);
> +                       swap_entry_range_free(si, entry, nr);
> +                       spin_unlock(&si->lock);
> +                       if (any_only_cache)
> +                               goto reclaim;
> +                       goto out;
> +               }

Sorry, what I actually meant was that the two gotos are reversed.

                if (batched_free) {
                        if (any_only_cache)
                                goto reclaim;

                        spin_lock(&si->lock);
                        swap_entry_range_free(si, entry, nr);
                        spin_unlock(&si->lock);
                        goto out;
                }

> +       }
> +
>         /*
>          * First free all entries in the range.
>          */
> @@ -1828,6 +1870,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
>         if (!any_only_cache)
>                 goto out;
>
> +reclaim:
>         /*
>          * Now go back over the range trying to reclaim the swap cache. This is
>          * more efficient for large folios because we will only try to reclaim
> --
> 2.34.1
>
>
>
> > ---
> >  mm/swapfile.c | 59 ++++++++++++++++++++++++++---------------------------------
> >  1 file changed, 26 insertions(+), 33 deletions(-)
> >
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index 34e6ea13e8e4..9b63b2262cc2 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -479,20 +479,21 @@ static void inc_cluster_info_page(struct swap_info_struct *p,
> >  }
> >
> >  /*
> > - * The cluster ci decreases one usage. If the usage counter becomes 0,
> > + * The cluster ci decreases @nr_pages usage. If the usage counter becomes 0,
> >   * which means no page in the cluster is in use, we can optionally discard
> >   * the cluster and add it to free cluster list.
> >   */
> > -static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *ci)
> > +static void dec_cluster_info_page(struct swap_info_struct *p,
> > +                                 struct swap_cluster_info *ci, int nr_pages)
> >  {
> >         if (!p->cluster_info)
> >                 return;
> >
> > -       VM_BUG_ON(ci->count == 0);
> > +       VM_BUG_ON(ci->count < nr_pages);
> >         VM_BUG_ON(cluster_is_free(ci));
> >         lockdep_assert_held(&p->lock);
> >         lockdep_assert_held(&ci->lock);
> > -       ci->count--;
> > +       ci->count -= nr_pages;
> >
> >         if (!ci->count) {
> >                 free_cluster(p, ci);
> > @@ -998,19 +999,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
> >         return n_ret;
> >  }
> >
> > -static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
> > -{
> > -       unsigned long offset = idx * SWAPFILE_CLUSTER;
> > -       struct swap_cluster_info *ci;
> > -
> > -       ci = lock_cluster(si, offset);
> > -       memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
> > -       ci->count = 0;
> > -       free_cluster(si, ci);
> > -       unlock_cluster(ci);
> > -       swap_range_free(si, offset, SWAPFILE_CLUSTER);
> > -}
> > -
> >  int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
> >  {
> >         int order = swap_entry_order(entry_order);
> > @@ -1269,21 +1257,28 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p,
> >         return usage;
> >  }
> >
> > -static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
> > +/*
> > + * Drop the last HAS_CACHE flag of swap entries, the caller has to
> > + * ensure all entries belong to the same cgroup.
> > + */
> > +static void swap_entry_range_free(struct swap_info_struct *p, swp_entry_t entry,
> > +                                 unsigned int nr_pages)
> >  {
> > -       struct swap_cluster_info *ci;
> >         unsigned long offset = swp_offset(entry);
> > -       unsigned char count;
> > +       unsigned char *map = p->swap_map + offset;
> > +       unsigned char *map_end = map + nr_pages;
> > +       struct swap_cluster_info *ci;
> >
> >         ci = lock_cluster(p, offset);
> > -       count = p->swap_map[offset];
> > -       VM_BUG_ON(count != SWAP_HAS_CACHE);
> > -       p->swap_map[offset] = 0;
> > -       dec_cluster_info_page(p, ci);
> > +       do {
> > +               VM_BUG_ON(*map != SWAP_HAS_CACHE);
> > +               *map = 0;
> > +       } while (++map < map_end);
> > +       dec_cluster_info_page(p, ci, nr_pages);
> >         unlock_cluster(ci);
> >
> > -       mem_cgroup_uncharge_swap(entry, 1);
> > -       swap_range_free(p, offset, 1);
> > +       mem_cgroup_uncharge_swap(entry, nr_pages);
> > +       swap_range_free(p, offset, nr_pages);
> >  }
> >
> >  static void cluster_swap_free_nr(struct swap_info_struct *sis,
> > @@ -1343,7 +1338,6 @@ void swap_free_nr(swp_entry_t entry, int nr_pages)
> >  void put_swap_folio(struct folio *folio, swp_entry_t entry)
> >  {
> >         unsigned long offset = swp_offset(entry);
> > -       unsigned long idx = offset / SWAPFILE_CLUSTER;
> >         struct swap_cluster_info *ci;
> >         struct swap_info_struct *si;
> >         unsigned char *map;
> > @@ -1356,19 +1350,18 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
> >                 return;
> >
> >         ci = lock_cluster_or_swap_info(si, offset);
> > -       if (size == SWAPFILE_CLUSTER) {
> > +       if (size > 1) {
> >                 map = si->swap_map + offset;
> > -               for (i = 0; i < SWAPFILE_CLUSTER; i++) {
> > +               for (i = 0; i < size; i++) {
> >                         val = map[i];
> >                         VM_BUG_ON(!(val & SWAP_HAS_CACHE));
> >                         if (val == SWAP_HAS_CACHE)
> >                                 free_entries++;
> >                 }
> > -               if (free_entries == SWAPFILE_CLUSTER) {
> > +               if (free_entries == size) {
> >                         unlock_cluster_or_swap_info(si, ci);
> >                         spin_lock(&si->lock);
> > -                       mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
> > -                       swap_free_cluster(si, idx);
> > +                       swap_entry_range_free(si, entry, size);
> >                         spin_unlock(&si->lock);
> >                         return;
> >                 }
> > @@ -1413,7 +1406,7 @@ void swapcache_free_entries(swp_entry_t *entries, int n)
> >         for (i = 0; i < n; ++i) {
> >                 p = swap_info_get_cont(entries[i], prev);
> >                 if (p)
> > -                       swap_entry_free(p, entries[i]);
> > +                       swap_entry_range_free(p, entries[i], 1);
> >                 prev = p;
> >         }
> >         if (p)
> >
> > --
> > 2.46.0.rc1.232.g9752f9e123-goog
> >
>
> Thanks
> Barry
>

Patch

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 34e6ea13e8e4..9b63b2262cc2 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -479,20 +479,21 @@  static void inc_cluster_info_page(struct swap_info_struct *p,
 }
 
 /*
- * The cluster ci decreases one usage. If the usage counter becomes 0,
+ * The cluster ci decreases @nr_pages usage. If the usage counter becomes 0,
  * which means no page in the cluster is in use, we can optionally discard
  * the cluster and add it to free cluster list.
  */
-static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *ci)
+static void dec_cluster_info_page(struct swap_info_struct *p,
+				  struct swap_cluster_info *ci, int nr_pages)
 {
 	if (!p->cluster_info)
 		return;
 
-	VM_BUG_ON(ci->count == 0);
+	VM_BUG_ON(ci->count < nr_pages);
 	VM_BUG_ON(cluster_is_free(ci));
 	lockdep_assert_held(&p->lock);
 	lockdep_assert_held(&ci->lock);
-	ci->count--;
+	ci->count -= nr_pages;
 
 	if (!ci->count) {
 		free_cluster(p, ci);
@@ -998,19 +999,6 @@  static int scan_swap_map_slots(struct swap_info_struct *si,
 	return n_ret;
 }
 
-static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
-{
-	unsigned long offset = idx * SWAPFILE_CLUSTER;
-	struct swap_cluster_info *ci;
-
-	ci = lock_cluster(si, offset);
-	memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
-	ci->count = 0;
-	free_cluster(si, ci);
-	unlock_cluster(ci);
-	swap_range_free(si, offset, SWAPFILE_CLUSTER);
-}
-
 int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
 {
 	int order = swap_entry_order(entry_order);
@@ -1269,21 +1257,28 @@  static unsigned char __swap_entry_free(struct swap_info_struct *p,
 	return usage;
 }
 
-static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
+/*
+ * Drop the last HAS_CACHE flag of swap entries, the caller has to
+ * ensure all entries belong to the same cgroup.
+ */
+static void swap_entry_range_free(struct swap_info_struct *p, swp_entry_t entry,
+				  unsigned int nr_pages)
 {
-	struct swap_cluster_info *ci;
 	unsigned long offset = swp_offset(entry);
-	unsigned char count;
+	unsigned char *map = p->swap_map + offset;
+	unsigned char *map_end = map + nr_pages;
+	struct swap_cluster_info *ci;
 
 	ci = lock_cluster(p, offset);
-	count = p->swap_map[offset];
-	VM_BUG_ON(count != SWAP_HAS_CACHE);
-	p->swap_map[offset] = 0;
-	dec_cluster_info_page(p, ci);
+	do {
+		VM_BUG_ON(*map != SWAP_HAS_CACHE);
+		*map = 0;
+	} while (++map < map_end);
+	dec_cluster_info_page(p, ci, nr_pages);
 	unlock_cluster(ci);
 
-	mem_cgroup_uncharge_swap(entry, 1);
-	swap_range_free(p, offset, 1);
+	mem_cgroup_uncharge_swap(entry, nr_pages);
+	swap_range_free(p, offset, nr_pages);
 }
 
 static void cluster_swap_free_nr(struct swap_info_struct *sis,
@@ -1343,7 +1338,6 @@  void swap_free_nr(swp_entry_t entry, int nr_pages)
 void put_swap_folio(struct folio *folio, swp_entry_t entry)
 {
 	unsigned long offset = swp_offset(entry);
-	unsigned long idx = offset / SWAPFILE_CLUSTER;
 	struct swap_cluster_info *ci;
 	struct swap_info_struct *si;
 	unsigned char *map;
@@ -1356,19 +1350,18 @@  void put_swap_folio(struct folio *folio, swp_entry_t entry)
 		return;
 
 	ci = lock_cluster_or_swap_info(si, offset);
-	if (size == SWAPFILE_CLUSTER) {
+	if (size > 1) {
 		map = si->swap_map + offset;
-		for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+		for (i = 0; i < size; i++) {
 			val = map[i];
 			VM_BUG_ON(!(val & SWAP_HAS_CACHE));
 			if (val == SWAP_HAS_CACHE)
 				free_entries++;
 		}
-		if (free_entries == SWAPFILE_CLUSTER) {
+		if (free_entries == size) {
 			unlock_cluster_or_swap_info(si, ci);
 			spin_lock(&si->lock);
-			mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
-			swap_free_cluster(si, idx);
+			swap_entry_range_free(si, entry, size);
 			spin_unlock(&si->lock);
 			return;
 		}
@@ -1413,7 +1406,7 @@  void swapcache_free_entries(swp_entry_t *entries, int n)
 	for (i = 0; i < n; ++i) {
 		p = swap_info_get_cont(entries[i], prev);
 		if (p)
-			swap_entry_free(p, entries[i]);
+			swap_entry_range_free(p, entries[i], 1);
 		prev = p;
 	}
 	if (p)