
[RFC,2/6] mm: swap: introduce swap_nr_free() for batched swap_free()

Message ID 20240118111036.72641-3-21cnbao@gmail.com
State New
Series mm: support large folios swap-in

Commit Message

Barry Song Jan. 18, 2024, 11:10 a.m. UTC
From: Chuanhua Han <hanchuanhua@oppo.com>

While swapping in a large folio, we need to free swaps related to the whole
folio. To avoid frequently acquiring and releasing swap locks, it is better
to introduce an API for batched free.

Signed-off-by: Chuanhua Han <hanchuanhua@oppo.com>
Co-developed-by: Barry Song <v-songbaohua@oppo.com>
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
 include/linux/swap.h |  6 ++++++
 mm/swapfile.c        | 29 +++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)
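
For context, a minimal sketch of how a swap-in path might call the new helper
(illustrative only: the call site, and that "folio" is the just-swapped-in
large folio, are assumptions rather than part of this patch):

	/*
	 * Release the whole contiguous range of swap entries backing the
	 * large folio with one call, instead of folio_nr_pages() separate
	 * swap_free() calls.
	 */
	swp_entry_t entry = folio->swap;

	swap_nr_free(entry, folio_nr_pages(folio));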

Comments

Chris Li Jan. 26, 2024, 11:17 p.m. UTC | #1
On Thu, Jan 18, 2024 at 3:11 AM Barry Song <21cnbao@gmail.com> wrote:
>
> From: Chuanhua Han <hanchuanhua@oppo.com>
>
> While swapping in a large folio, we need to free swaps related to the whole
> folio. To avoid frequently acquiring and releasing swap locks, it is better
> to introduce an API for batched free.
>
> Signed-off-by: Chuanhua Han <hanchuanhua@oppo.com>
> Co-developed-by: Barry Song <v-songbaohua@oppo.com>
> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> ---
>  include/linux/swap.h |  6 ++++++
>  mm/swapfile.c        | 29 +++++++++++++++++++++++++++++
>  2 files changed, 35 insertions(+)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 4db00ddad261..31a4ee2dcd1c 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -478,6 +478,7 @@ extern void swap_shmem_alloc(swp_entry_t);
>  extern int swap_duplicate(swp_entry_t);
>  extern int swapcache_prepare(swp_entry_t);
>  extern void swap_free(swp_entry_t);
> +extern void swap_nr_free(swp_entry_t entry, int nr_pages);
>  extern void swapcache_free_entries(swp_entry_t *entries, int n);
>  extern int free_swap_and_cache(swp_entry_t);
>  int swap_type_of(dev_t device, sector_t offset);
> @@ -553,6 +554,11 @@ static inline void swap_free(swp_entry_t swp)
>  {
>  }
>
> +void swap_nr_free(swp_entry_t entry, int nr_pages)
> +{
> +
> +}
> +
>  static inline void put_swap_folio(struct folio *folio, swp_entry_t swp)
>  {
>  }
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 556ff7347d5f..6321bda96b77 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -1335,6 +1335,35 @@ void swap_free(swp_entry_t entry)
>                 __swap_entry_free(p, entry);
>  }
>
> +void swap_nr_free(swp_entry_t entry, int nr_pages)
> +{
> +       int i;
> +       struct swap_cluster_info *ci;
> +       struct swap_info_struct *p;
> +       unsigned type = swp_type(entry);
> +       unsigned long offset = swp_offset(entry);
> +       DECLARE_BITMAP(usage, SWAPFILE_CLUSTER) = { 0 };
> +
> +       VM_BUG_ON(offset % SWAPFILE_CLUSTER + nr_pages > SWAPFILE_CLUSTER);

The BUG_ON here seems a bit too developer-oriented. Maybe warn once and
fall back to freeing the entries one by one?
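
For illustration, that fallback could look roughly like this inside
swap_nr_free() (a sketch only, not code from this patch):

	if (WARN_ON_ONCE(offset % SWAPFILE_CLUSTER + nr_pages > SWAPFILE_CLUSTER)) {
		/* Range crosses a cluster boundary: free one by one instead. */
		for (i = 0; i < nr_pages; i++)
			swap_free(swp_entry(type, offset + i));
		return;
	}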

How big are SWAPFILE_CLUSTER and nr_pages typically on arm?

I ask because when nr_pages > 64, it is a totally different game: we can
completely bypass the swap slot cache.

> +
> +       if (nr_pages == 1) {
> +               swap_free(entry);
> +               return;
> +       }
> +
> +       p = _swap_info_get(entry);
> +
> +       ci = lock_cluster(p, offset);
> +       for (i = 0; i < nr_pages; i++) {
> +               if (__swap_entry_free_locked(p, offset + i, 1))
> +                       __bitmap_set(usage, i, 1);
> +       }
> +       unlock_cluster(ci);
> +
> +       for_each_clear_bit(i, usage, nr_pages)
> +               free_swap_slot(swp_entry(type, offset + i));

Notice that free_swap_slot() internally has per-CPU cache batching as
well. Every free_swap_slot() call takes a per-CPU swap slot cache and
cache->lock, so there is double batching here.
If the typical batch size here is bigger than 64 entries, we can go
directly to batched swap_entry_free() and avoid the free_swap_slot()
batching altogether. Unlike swapcache_free_entries(), the swap slots
here all come from one swap device, so there is no need to sort and
group them by swap device.

Chris

> +}
> +
>  /*
>   * Called after dropping swapcache to decrease refcnt to swap entries.
>   */
> --
> 2.34.1
>
>
Barry Song Feb. 26, 2024, 4:47 a.m. UTC | #2
Hi Chris,

Thanks!

On Sat, Jan 27, 2024 at 12:17 PM Chris Li <chrisl@kernel.org> wrote:
>
> On Thu, Jan 18, 2024 at 3:11 AM Barry Song <21cnbao@gmail.com> wrote:
> >
> > From: Chuanhua Han <hanchuanhua@oppo.com>
> >
> > While swapping in a large folio, we need to free swaps related to the whole
> > folio. To avoid frequently acquiring and releasing swap locks, it is better
> > to introduce an API for batched free.
> >
> > Signed-off-by: Chuanhua Han <hanchuanhua@oppo.com>
> > Co-developed-by: Barry Song <v-songbaohua@oppo.com>
> > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> > ---
> >  include/linux/swap.h |  6 ++++++
> >  mm/swapfile.c        | 29 +++++++++++++++++++++++++++++
> >  2 files changed, 35 insertions(+)
> >
> > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > index 4db00ddad261..31a4ee2dcd1c 100644
> > --- a/include/linux/swap.h
> > +++ b/include/linux/swap.h
> > @@ -478,6 +478,7 @@ extern void swap_shmem_alloc(swp_entry_t);
> >  extern int swap_duplicate(swp_entry_t);
> >  extern int swapcache_prepare(swp_entry_t);
> >  extern void swap_free(swp_entry_t);
> > +extern void swap_nr_free(swp_entry_t entry, int nr_pages);
> >  extern void swapcache_free_entries(swp_entry_t *entries, int n);
> >  extern int free_swap_and_cache(swp_entry_t);
> >  int swap_type_of(dev_t device, sector_t offset);
> > @@ -553,6 +554,11 @@ static inline void swap_free(swp_entry_t swp)
> >  {
> >  }
> >
> > +void swap_nr_free(swp_entry_t entry, int nr_pages)
> > +{
> > +
> > +}
> > +
> >  static inline void put_swap_folio(struct folio *folio, swp_entry_t swp)
> >  {
> >  }
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index 556ff7347d5f..6321bda96b77 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -1335,6 +1335,35 @@ void swap_free(swp_entry_t entry)
> >                 __swap_entry_free(p, entry);
> >  }
> >
> > +void swap_nr_free(swp_entry_t entry, int nr_pages)
> > +{
> > +       int i;
> > +       struct swap_cluster_info *ci;
> > +       struct swap_info_struct *p;
> > +       unsigned type = swp_type(entry);
> > +       unsigned long offset = swp_offset(entry);
> > +       DECLARE_BITMAP(usage, SWAPFILE_CLUSTER) = { 0 };
> > +
> > +       VM_BUG_ON(offset % SWAPFILE_CLUSTER + nr_pages > SWAPFILE_CLUSTER);
>
> The BUG_ON here seems a bit too developer-oriented. Maybe warn once and
> fall back to freeing the entries one by one?

The function is used only in cases where we are quite sure we are freeing
contiguous swap entries within a single cluster; otherwise we would need
an array of entries[]. Would people be more comfortable with a WARN_ON
instead? The problem is that if this really happens, it is a bug, and a
WARN isn't enough.

>
> How big are SWAPFILE_CLUSTER and nr_pages typically on arm?

In my case, SWAPFILE_CLUSTER = HPAGE_PMD_NR = 2MB/4KB = 512.

>
> I ask because when nr_pages > 64, it is a totally different game: we can
> completely bypass the swap slot cache.
>

I agree we have a chance to bypass the slot cache if nr_pages is bigger than
SWAP_SLOTS_CACHE_SIZE. On the other hand, even when nr_pages < 64, we still
have a good chance to optimize free_swap_slot() by batching, as there is
spin_lock and sort() overhead for each single entry.


> > +
> > +       if (nr_pages == 1) {
> > +               swap_free(entry);
> > +               return;
> > +       }
> > +
> > +       p = _swap_info_get(entry);
> > +
> > +       ci = lock_cluster(p, offset);
> > +       for (i = 0; i < nr_pages; i++) {
> > +               if (__swap_entry_free_locked(p, offset + i, 1))
> > +                       __bitmap_set(usage, i, 1);
> > +       }
> > +       unlock_cluster(ci);
> > +
> > +       for_each_clear_bit(i, usage, nr_pages)
> > +               free_swap_slot(swp_entry(type, offset + i));
>
> Notice that free_swap_slot() internally has per-CPU cache batching as
> well. Every free_swap_slot() call takes a per-CPU swap slot cache and
> cache->lock, so there is double batching here.
> If the typical batch size here is bigger than 64 entries, we can go
> directly to batched swap_entry_free() and avoid the free_swap_slot()
> batching altogether. Unlike swapcache_free_entries(), the swap slots
> here all come from one swap device, so there is no need to sort and
> group them by swap device.

I agree, you are completely right!
However, to keep the initial patchset smaller, I prefer to defer these
optimizations to a separate patchset after this one.
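
A rough sketch of that deferred optimization, collecting the freed entries
and handing them to swapcache_free_entries() (declared in the hunk above)
instead of pushing them one by one through the per-CPU slot cache;
illustrative only, not part of this series:

	/*
	 * Entries whose count dropped to zero are batched and freed
	 * directly, bypassing the per-CPU slot cache that free_swap_slot()
	 * would take for every entry.  All entries belong to one device.
	 */
	swp_entry_t to_free[SWAP_SLOTS_CACHE_SIZE];
	int nr = 0;

	for_each_clear_bit(i, usage, nr_pages) {
		to_free[nr++] = swp_entry(type, offset + i);
		if (nr == ARRAY_SIZE(to_free)) {
			swapcache_free_entries(to_free, nr);
			nr = 0;
		}
	}
	if (nr)
		swapcache_free_entries(to_free, nr);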

>
> Chris
>
> > +}
> > +
> >  /*
> >   * Called after dropping swapcache to decrease refcnt to swap entries.
> >   */
> > --
> > 2.34.1

Thanks
Barry

Patch

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4db00ddad261..31a4ee2dcd1c 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -478,6 +478,7 @@  extern void swap_shmem_alloc(swp_entry_t);
 extern int swap_duplicate(swp_entry_t);
 extern int swapcache_prepare(swp_entry_t);
 extern void swap_free(swp_entry_t);
+extern void swap_nr_free(swp_entry_t entry, int nr_pages);
 extern void swapcache_free_entries(swp_entry_t *entries, int n);
 extern int free_swap_and_cache(swp_entry_t);
 int swap_type_of(dev_t device, sector_t offset);
@@ -553,6 +554,11 @@  static inline void swap_free(swp_entry_t swp)
 {
 }
 
+void swap_nr_free(swp_entry_t entry, int nr_pages)
+{
+
+}
+
 static inline void put_swap_folio(struct folio *folio, swp_entry_t swp)
 {
 }
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 556ff7347d5f..6321bda96b77 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1335,6 +1335,35 @@  void swap_free(swp_entry_t entry)
 		__swap_entry_free(p, entry);
 }
 
+void swap_nr_free(swp_entry_t entry, int nr_pages)
+{
+	int i;
+	struct swap_cluster_info *ci;
+	struct swap_info_struct *p;
+	unsigned type = swp_type(entry);
+	unsigned long offset = swp_offset(entry);
+	DECLARE_BITMAP(usage, SWAPFILE_CLUSTER) = { 0 };
+
+	VM_BUG_ON(offset % SWAPFILE_CLUSTER + nr_pages > SWAPFILE_CLUSTER);
+
+	if (nr_pages == 1) {
+		swap_free(entry);
+		return;
+	}
+
+	p = _swap_info_get(entry);
+
+	ci = lock_cluster(p, offset);
+	for (i = 0; i < nr_pages; i++) {
+		if (__swap_entry_free_locked(p, offset + i, 1))
+			__bitmap_set(usage, i, 1);
+	}
+	unlock_cluster(ci);
+
+	for_each_clear_bit(i, usage, nr_pages)
+		free_swap_slot(swp_entry(type, offset + i));
+}
+
 /*
  * Called after dropping swapcache to decrease refcnt to swap entries.
  */