diff mbox series

[v5,1/4] mm: swap: introduce swapcache_prepare_nr and swapcache_clear_nr for large folios swap-in

Message ID 20240726094618.401593-2-21cnbao@gmail.com (mailing list archive)
State New
Headers show
Series mm: support mTHP swap-in for zRAM-like swapfile | expand

Commit Message

Barry Song July 26, 2024, 9:46 a.m. UTC
From: Barry Song <v-songbaohua@oppo.com>

Commit 13ddaf26be32 ("mm/swap: fix race when skipping swapcache") supports
one entry only. To support large folio swap-in, we need to handle multiple
swap entries.

To optimize stack usage, we iterate twice in __swap_duplicate_nr(): the
first time to verify that all entries are valid, and the second time to
apply the modifications to the entries.

Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
 include/linux/swap.h |   9 +++-
 mm/swap.h            |  10 ++++-
 mm/swapfile.c        | 102 ++++++++++++++++++++++++++-----------------
 3 files changed, 77 insertions(+), 44 deletions(-)

Comments

Baolin Wang July 30, 2024, 3 a.m. UTC | #1
Hi Barry,

On 2024/7/26 17:46, Barry Song wrote:
> From: Barry Song <v-songbaohua@oppo.com>
> 
> Commit 13ddaf26be32 ("mm/swap: fix race when skipping swapcache") supports
> one entry only, to support large folio swap-in, we need to handle multiple
> swap entries.
> 
> To optimize stack usage, we iterate twice in __swap_duplicate_nr(): the
> first time to verify that all entries are valid, and the second time to
> apply the modifications to the entries.
> 
> Signed-off-by: Barry Song <v-songbaohua@oppo.com>

LGTM. Feel free to add:
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>

By the way, my shmem swap patchset[1] also relies on this patch, so I 
wonder if it's possible to merge this patch into the mm-unstable branch 
first (if other patches still need discussion), to make it easier for me 
to rebase and resend my patch set? Thanks.

[1] 
https://lore.kernel.org/all/cover.1720079976.git.baolin.wang@linux.alibaba.com/
Matthew Wilcox July 30, 2024, 3:11 a.m. UTC | #2
On Fri, Jul 26, 2024 at 09:46:15PM +1200, Barry Song wrote:
> +static inline int swapcache_prepare(swp_entry_t entry)
> +{
> +	return swapcache_prepare_nr(entry, 1);
> +}

Same comment as 2/4 -- there are only two callers of swapcache_prepare().
Just make that take the 'nr' argument and change both callers to pass 1.
Barry Song July 30, 2024, 3:15 a.m. UTC | #3
On Tue, Jul 30, 2024 at 11:11 AM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Fri, Jul 26, 2024 at 09:46:15PM +1200, Barry Song wrote:
> > +static inline int swapcache_prepare(swp_entry_t entry)
> > +{
> > +     return swapcache_prepare_nr(entry, 1);
> > +}
>
> Same comment as 2/4 -- there are only two callers of swapcache_prepare().
> Just make that take the 'nr' argument and change both callers to pass 1.

Makes sense to me. As Baolin also needs this patch for shmem, I'm going
to separate this one from this series and send a new version with the
suggested change so that Andrew can pull it earlier.

Thanks
Barry
diff mbox series

Patch

diff --git a/include/linux/swap.h b/include/linux/swap.h
index ba7ea95d1c57..f1b28fd04533 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -480,7 +480,7 @@  extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order);
 extern int add_swap_count_continuation(swp_entry_t, gfp_t);
 extern void swap_shmem_alloc(swp_entry_t);
 extern int swap_duplicate(swp_entry_t);
-extern int swapcache_prepare(swp_entry_t);
+extern int swapcache_prepare_nr(swp_entry_t entry, int nr);
 extern void swap_free_nr(swp_entry_t entry, int nr_pages);
 extern void swapcache_free_entries(swp_entry_t *entries, int n);
 extern void free_swap_and_cache_nr(swp_entry_t entry, int nr);
@@ -554,7 +554,7 @@  static inline int swap_duplicate(swp_entry_t swp)
 	return 0;
 }
 
-static inline int swapcache_prepare(swp_entry_t swp)
+static inline int swapcache_prepare_nr(swp_entry_t swp, int nr)
 {
 	return 0;
 }
@@ -612,6 +612,11 @@  static inline void swap_free(swp_entry_t entry)
 	swap_free_nr(entry, 1);
 }
 
+static inline int swapcache_prepare(swp_entry_t entry)
+{
+	return swapcache_prepare_nr(entry, 1);
+}
+
 #ifdef CONFIG_MEMCG
 static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
 {
diff --git a/mm/swap.h b/mm/swap.h
index baa1fa946b34..81ff7eb0be9c 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -59,7 +59,7 @@  void __delete_from_swap_cache(struct folio *folio,
 void delete_from_swap_cache(struct folio *folio);
 void clear_shadow_from_swap_cache(int type, unsigned long begin,
 				  unsigned long end);
-void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry);
+void swapcache_clear_nr(struct swap_info_struct *si, swp_entry_t entry, int nr);
 struct folio *swap_cache_get_folio(swp_entry_t entry,
 		struct vm_area_struct *vma, unsigned long addr);
 struct folio *filemap_get_incore_folio(struct address_space *mapping,
@@ -120,7 +120,7 @@  static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
 	return 0;
 }
 
-static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
+static inline void swapcache_clear_nr(struct swap_info_struct *si, swp_entry_t entry, int nr)
 {
 }
 
@@ -172,4 +172,10 @@  static inline unsigned int folio_swap_flags(struct folio *folio)
 	return 0;
 }
 #endif /* CONFIG_SWAP */
+
+static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
+{
+	swapcache_clear_nr(si, entry, 1);
+}
+
 #endif /* _MM_SWAP_H */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 5f73a8553371..e688e46f1c62 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3363,7 +3363,7 @@  void si_swapinfo(struct sysinfo *val)
 }
 
 /*
- * Verify that a swap entry is valid and increment its swap map count.
+ * Verify that nr swap entries are valid and increment their swap map counts.
  *
  * Returns error code in following case.
  * - success -> 0
@@ -3373,66 +3373,88 @@  void si_swapinfo(struct sysinfo *val)
  * - swap-cache reference is requested but the entry is not used. -> ENOENT
  * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
  */
-static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
+static int __swap_duplicate_nr(swp_entry_t entry, unsigned char usage, int nr)
 {
 	struct swap_info_struct *p;
 	struct swap_cluster_info *ci;
 	unsigned long offset;
 	unsigned char count;
 	unsigned char has_cache;
-	int err;
+	int err, i;
 
 	p = swp_swap_info(entry);
 
 	offset = swp_offset(entry);
+	VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
 	ci = lock_cluster_or_swap_info(p, offset);
 
-	count = p->swap_map[offset];
+	err = 0;
+	for (i = 0; i < nr; i++) {
+		count = p->swap_map[offset + i];
 
-	/*
-	 * swapin_readahead() doesn't check if a swap entry is valid, so the
-	 * swap entry could be SWAP_MAP_BAD. Check here with lock held.
-	 */
-	if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
-		err = -ENOENT;
-		goto unlock_out;
-	}
+		/*
+		 * swapin_readahead() doesn't check if a swap entry is valid, so the
+		 * swap entry could be SWAP_MAP_BAD. Check here with lock held.
+		 */
+		if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
+			err = -ENOENT;
+			goto unlock_out;
+		}
 
-	has_cache = count & SWAP_HAS_CACHE;
-	count &= ~SWAP_HAS_CACHE;
-	err = 0;
+		has_cache = count & SWAP_HAS_CACHE;
+		count &= ~SWAP_HAS_CACHE;
 
-	if (usage == SWAP_HAS_CACHE) {
+		if (usage == SWAP_HAS_CACHE) {
+			/* set SWAP_HAS_CACHE if there is no cache and entry is used */
+			if (!has_cache && count)
+				continue;
+			else if (has_cache)		/* someone else added cache */
+				err = -EEXIST;
+			else				/* no users remaining */
+				err = -ENOENT;
 
-		/* set SWAP_HAS_CACHE if there is no cache and entry is used */
-		if (!has_cache && count)
-			has_cache = SWAP_HAS_CACHE;
-		else if (has_cache)		/* someone else added cache */
-			err = -EEXIST;
-		else				/* no users remaining */
-			err = -ENOENT;
+		} else if (count || has_cache) {
 
-	} else if (count || has_cache) {
+			if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
+				continue;
+			else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
+				err = -EINVAL;
+			else if (swap_count_continued(p, offset + i, count))
+				continue;
+			else
+				err = -ENOMEM;
+		} else
+			err = -ENOENT;			/* unused swap entry */
 
-		if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
+		if (err)
+			goto unlock_out;
+	}
+
+	for (i = 0; i < nr; i++) {
+		count = p->swap_map[offset + i];
+		has_cache = count & SWAP_HAS_CACHE;
+		count &= ~SWAP_HAS_CACHE;
+
+		if (usage == SWAP_HAS_CACHE)
+			has_cache = SWAP_HAS_CACHE;
+		else if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
 			count += usage;
-		else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
-			err = -EINVAL;
-		else if (swap_count_continued(p, offset, count))
-			count = COUNT_CONTINUED;
 		else
-			err = -ENOMEM;
-	} else
-		err = -ENOENT;			/* unused swap entry */
+			count = COUNT_CONTINUED;
 
-	if (!err)
-		WRITE_ONCE(p->swap_map[offset], count | has_cache);
+		WRITE_ONCE(p->swap_map[offset + i], count | has_cache);
+	}
 
 unlock_out:
 	unlock_cluster_or_swap_info(p, ci);
 	return err;
 }
 
+static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
+{
+	return __swap_duplicate_nr(entry, usage, 1);
+}
+
 /*
  * Help swapoff by noting that swap entry belongs to shmem/tmpfs
  * (in which case its reference count is never incremented).
@@ -3459,23 +3481,23 @@  int swap_duplicate(swp_entry_t entry)
 }
 
 /*
- * @entry: swap entry for which we allocate swap cache.
+ * @entry: first swap entry from which we allocate nr swap cache.
  *
- * Called when allocating swap cache for existing swap entry,
+ * Called when allocating swap cache for existing swap entries,
  * This can return error codes. Returns 0 at success.
  * -EEXIST means there is a swap cache.
  * Note: return code is different from swap_duplicate().
  */
-int swapcache_prepare(swp_entry_t entry)
+int swapcache_prepare_nr(swp_entry_t entry, int nr)
 {
-	return __swap_duplicate(entry, SWAP_HAS_CACHE);
+	return __swap_duplicate_nr(entry, SWAP_HAS_CACHE, nr);
 }
 
-void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
+void swapcache_clear_nr(struct swap_info_struct *si, swp_entry_t entry, int nr)
 {
 	unsigned long offset = swp_offset(entry);
 
-	cluster_swap_free_nr(si, offset, 1, SWAP_HAS_CACHE);
+	cluster_swap_free_nr(si, offset, nr, SWAP_HAS_CACHE);
 }
 
 struct swap_info_struct *swp_swap_info(swp_entry_t entry)