
[v7,2/7] mm: swap: free_swap_and_cache_nr() as batched free_swap_and_cache()

Message ID 20240408183946.2991168-3-ryan.roberts@arm.com
State New
Series Swap-out mTHP without splitting

Commit Message

Ryan Roberts April 8, 2024, 6:39 p.m. UTC
Now that we no longer have a convenient flag in the cluster to determine
if a folio is large, free_swap_and_cache() will take a reference and
lock a large folio much more often, which could lead to contention and
(e.g.) failure to split large folios, etc.

Let's solve that problem by batch freeing swap and cache with a new
function, free_swap_and_cache_nr(), to free a contiguous range of swap
entries together. This allows us to first drop a reference to each swap
slot before we try to release the cache folio. This means we only try to
release the folio once, only taking the reference and lock once - much
better than the previous 512 times for the 2M THP case.

Contiguous swap entries are gathered in zap_pte_range() and
madvise_free_pte_range() in a similar way to how present ptes are
already gathered in zap_pte_range().

While we are at it, let's simplify by converting the return type of both
functions to void. The return value was used only by zap_pte_range() to
print a bad pte, and was ignored by everyone else, so the extra
reporting wasn't exactly guaranteed. We will still get the warning with
most of the information from get_swap_device(). With the batch version,
we wouldn't know which pte was bad anyway so could print the wrong one.

Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
---
 include/linux/pgtable.h | 29 ++++++++++++
 include/linux/swap.h    | 12 +++--
 mm/internal.h           | 63 ++++++++++++++++++++++++++
 mm/madvise.c            | 12 +++--
 mm/memory.c             | 13 +++---
 mm/swapfile.c           | 97 +++++++++++++++++++++++++++++++++--------
 6 files changed, 195 insertions(+), 31 deletions(-)
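
To make the "512 times" claim concrete: a 2M THP covers 2M / 4K = 512 swap entries, so the old per-entry free_swap_and_cache() path could take the folio reference and lock up to 512 times, whereas the batched path drops all 512 swap-map references first and then attempts reclaim once per folio. The standalone program below is a rough userspace model of that two-pass structure only; it is not kernel code, and the array, helper name and refcount scheme are invented purely for illustration.

/*
 * Userspace model (not kernel code) of the two-pass idea behind
 * free_swap_and_cache_nr(): pass 1 drops the swap-map reference for every
 * entry in the range without touching the folio; pass 2 walks the range
 * again and attempts reclaim once per folio.
 */
#include <stdio.h>

#define NR_ENTRIES	512		/* 2M THP / 4K pages */
#define FOLIO_NR	512		/* all entries back one large folio */

static unsigned char swap_map[NR_ENTRIES];	/* stand-in for si->swap_map */

/* Succeeds only once no slot in the folio holds a reference any more. */
static int try_to_reclaim_folio(void)
{
	for (int i = 0; i < FOLIO_NR; i++)
		if (swap_map[i] != 0)
			return 0;
	printf("reclaimed the whole %d-page folio in one attempt\n", FOLIO_NR);
	return FOLIO_NR;
}

int main(void)
{
	for (int i = 0; i < NR_ENTRIES; i++)
		swap_map[i] = 1;		/* one pte reference per slot */

	/* Pass 1: drop every reference; no folio reference or lock taken. */
	for (int i = 0; i < NR_ENTRIES; i++)
		swap_map[i]--;

	/*
	 * Pass 2: with all references already gone, the first reclaim attempt
	 * succeeds and we skip forward by a whole folio, instead of retrying
	 * once per pte as the unbatched code effectively did.
	 */
	for (int i = 0; i < NR_ENTRIES; ) {
		int nr = try_to_reclaim_folio();

		i += nr ? nr : 1;
	}
	return 0;
}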

Comments

David Hildenbrand April 9, 2024, 7:34 a.m. UTC | #1
On 08.04.24 20:39, Ryan Roberts wrote:
> Now that we no longer have a convenient flag in the cluster to determine
> if a folio is large, free_swap_and_cache() will take a reference and
> lock a large folio much more often, which could lead to contention and
> (e.g.) failure to split large folios, etc.
> 
> Let's solve that problem by batch freeing swap and cache with a new
> function, free_swap_and_cache_nr(), to free a contiguous range of swap
> entries together. This allows us to first drop a reference to each swap
> slot before we try to release the cache folio. This means we only try to
> release the folio once, only taking the reference and lock once - much
> better than the previous 512 times for the 2M THP case.
> 
> Contiguous swap entries are gathered in zap_pte_range() and
> madvise_free_pte_range() in a similar way to how present ptes are
> already gathered in zap_pte_range().
> 
> While we are at it, let's simplify by converting the return type of both
> functions to void. The return value was used only by zap_pte_range() to
> print a bad pte, and was ignored by everyone else, so the extra
> reporting wasn't exactly guaranteed. We will still get the warning with
> most of the information from get_swap_device(). With the batch version,
> we wouldn't know which pte was bad anyway so could print the wrong one.
> 
> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
> ---
>   include/linux/pgtable.h | 29 ++++++++++++
>   include/linux/swap.h    | 12 +++--
>   mm/internal.h           | 63 ++++++++++++++++++++++++++
>   mm/madvise.c            | 12 +++--
>   mm/memory.c             | 13 +++---
>   mm/swapfile.c           | 97 +++++++++++++++++++++++++++++++++--------
>   6 files changed, 195 insertions(+), 31 deletions(-)
> 
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index a3fc8150b047..75096025fe52 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -708,6 +708,35 @@ static inline void pte_clear_not_present_full(struct mm_struct *mm,
>   }
>   #endif
>   
> +#ifndef clear_not_present_full_ptes
> +/**
> + * clear_not_present_full_ptes - Clear multiple not present PTEs which are
> + *				 consecutive in the pgtable.
> + * @mm: Address space the ptes represent.
> + * @addr: Address of the first pte.
> + * @ptep: Page table pointer for the first entry.
> + * @nr: Number of entries to clear.
> + * @full: Whether we are clearing a full mm.
> + *
> + * May be overridden by the architecture; otherwise, implemented as a simple
> + * loop over pte_clear_not_present_full().
> + *
> + * Context: The caller holds the page table lock.  The PTEs are all not present.
> + * The PTEs are all in the same PMD.
> + */
> +static inline void clear_not_present_full_ptes(struct mm_struct *mm,
> +		unsigned long addr, pte_t *ptep, unsigned int nr, int full)
> +{
> +	for (;;) {
> +		pte_clear_not_present_full(mm, addr, ptep, full);
> +		if (--nr == 0)
> +			break;
> +		ptep++;
> +		addr += PAGE_SIZE;
> +	}
> +}
> +#endif
> +
>   #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
>   extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
>   			      unsigned long address,
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index f6f78198f000..5737236dc3ce 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -471,7 +471,7 @@ extern int swap_duplicate(swp_entry_t);
>   extern int swapcache_prepare(swp_entry_t);
>   extern void swap_free(swp_entry_t);
>   extern void swapcache_free_entries(swp_entry_t *entries, int n);
> -extern int free_swap_and_cache(swp_entry_t);
> +extern void free_swap_and_cache_nr(swp_entry_t entry, int nr);
>   int swap_type_of(dev_t device, sector_t offset);
>   int find_first_swap(dev_t *device);
>   extern unsigned int count_swap_pages(int, int);
> @@ -520,8 +520,9 @@ static inline void put_swap_device(struct swap_info_struct *si)
>   #define free_pages_and_swap_cache(pages, nr) \
>   	release_pages((pages), (nr));
>   
> -/* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */
> -#define free_swap_and_cache(e) is_pfn_swap_entry(e)
> +static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr)
> +{
> +}
>   
>   static inline void free_swap_cache(struct folio *folio)
>   {
> @@ -589,6 +590,11 @@ static inline int add_swap_extent(struct swap_info_struct *sis,
>   }
>   #endif /* CONFIG_SWAP */
>   
> +static inline void free_swap_and_cache(swp_entry_t entry)
> +{
> +	free_swap_and_cache_nr(entry, 1);
> +}
> +
>   #ifdef CONFIG_MEMCG
>   static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
>   {
> diff --git a/mm/internal.h b/mm/internal.h
> index 3bdc8693b54f..de68705624b0 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -11,6 +11,8 @@
>   #include <linux/mm.h>
>   #include <linux/pagemap.h>
>   #include <linux/rmap.h>
> +#include <linux/swap.h>
> +#include <linux/swapops.h>
>   #include <linux/tracepoint-defs.h>
>   
>   struct folio_batch;
> @@ -189,6 +191,67 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
>   
>   	return min(ptep - start_ptep, max_nr);
>   }
> +
> +/**
> + * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
> + * @pte: The initial pte state; is_swap_pte(pte) must be true.

Likely we also want non_swap_entry() to be false.
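
[A sketch of how that stricter precondition could be made self-enforcing; the VM_WARN_ON placement here is hypothetical and not part of the posted patch, which asserts the same conditions in swap_pte_batch() instead.]

static inline pte_t pte_next_swp_offset(pte_t pte)
{
	swp_entry_t entry = pte_to_swp_entry(pte);
	pte_t new;

	/* Hypothetical debug checks mirroring the documented preconditions. */
	VM_WARN_ON(!is_swap_pte(pte));
	VM_WARN_ON(non_swap_entry(entry));

	new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
					     swp_offset(entry) + 1));
	if (pte_swp_soft_dirty(pte))
		new = pte_swp_mksoft_dirty(new);
	if (pte_swp_exclusive(pte))
		new = pte_swp_mkexclusive(new);
	if (pte_swp_uffd_wp(pte))
		new = pte_swp_mkuffd_wp(new);

	return new;
}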

> + *
> + * Increments the swap offset, while maintaining all other fields, including
> + * swap type, and any swp pte bits. The resulting pte is returned.
> + */
> +static inline pte_t pte_next_swp_offset(pte_t pte)
> +{
> +	swp_entry_t entry = pte_to_swp_entry(pte);
> +	pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
> +						   swp_offset(entry) + 1));
> +
> +	if (pte_swp_soft_dirty(pte))
> +		new = pte_swp_mksoft_dirty(new);
> +	if (pte_swp_exclusive(pte))
> +		new = pte_swp_mkexclusive(new);
> +	if (pte_swp_uffd_wp(pte))
> +		new = pte_swp_mkuffd_wp(new);
> +
> +	return new;
> +}


Acked-by: David Hildenbrand <david@redhat.com>
Barry Song April 9, 2024, 8:51 a.m. UTC | #2
On Tue, Apr 9, 2024 at 6:40 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> Now that we no longer have a convenient flag in the cluster to determine
> if a folio is large, free_swap_and_cache() will take a reference and
> lock a large folio much more often, which could lead to contention and
> (e.g.) failure to split large folios, etc.
>
> Let's solve that problem by batch freeing swap and cache with a new
> function, free_swap_and_cache_nr(), to free a contiguous range of swap
> entries together. This allows us to first drop a reference to each swap
> slot before we try to release the cache folio. This means we only try to
> release the folio once, only taking the reference and lock once - much
> better than the previous 512 times for the 2M THP case.
>
> Contiguous swap entries are gathered in zap_pte_range() and
> madvise_free_pte_range() in a similar way to how present ptes are
> already gathered in zap_pte_range().
>
> While we are at it, let's simplify by converting the return type of both
> functions to void. The return value was used only by zap_pte_range() to
> print a bad pte, and was ignored by everyone else, so the extra
> reporting wasn't exactly guaranteed. We will still get the warning with
> most of the information from get_swap_device(). With the batch version,
> we wouldn't know which pte was bad anyway so could print the wrong one.
>
> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
> ---
>  include/linux/pgtable.h | 29 ++++++++++++
>  include/linux/swap.h    | 12 +++--
>  mm/internal.h           | 63 ++++++++++++++++++++++++++
>  mm/madvise.c            | 12 +++--
>  mm/memory.c             | 13 +++---
>  mm/swapfile.c           | 97 +++++++++++++++++++++++++++++++++--------
>  6 files changed, 195 insertions(+), 31 deletions(-)
>
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index a3fc8150b047..75096025fe52 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -708,6 +708,35 @@ static inline void pte_clear_not_present_full(struct mm_struct *mm,
>  }
>  #endif
>
> +#ifndef clear_not_present_full_ptes
> +/**
> + * clear_not_present_full_ptes - Clear multiple not present PTEs which are
> + *                              consecutive in the pgtable.
> + * @mm: Address space the ptes represent.
> + * @addr: Address of the first pte.
> + * @ptep: Page table pointer for the first entry.
> + * @nr: Number of entries to clear.
> + * @full: Whether we are clearing a full mm.
> + *
> + * May be overridden by the architecture; otherwise, implemented as a simple
> + * loop over pte_clear_not_present_full().
> + *
> + * Context: The caller holds the page table lock.  The PTEs are all not present.
> + * The PTEs are all in the same PMD.
> + */
> +static inline void clear_not_present_full_ptes(struct mm_struct *mm,
> +               unsigned long addr, pte_t *ptep, unsigned int nr, int full)
> +{
> +       for (;;) {
> +               pte_clear_not_present_full(mm, addr, ptep, full);
> +               if (--nr == 0)
> +                       break;
> +               ptep++;
> +               addr += PAGE_SIZE;
> +       }
> +}
> +#endif
> +
>  #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
>  extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
>                               unsigned long address,
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index f6f78198f000..5737236dc3ce 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -471,7 +471,7 @@ extern int swap_duplicate(swp_entry_t);
>  extern int swapcache_prepare(swp_entry_t);
>  extern void swap_free(swp_entry_t);
>  extern void swapcache_free_entries(swp_entry_t *entries, int n);
> -extern int free_swap_and_cache(swp_entry_t);
> +extern void free_swap_and_cache_nr(swp_entry_t entry, int nr);
>  int swap_type_of(dev_t device, sector_t offset);
>  int find_first_swap(dev_t *device);
>  extern unsigned int count_swap_pages(int, int);
> @@ -520,8 +520,9 @@ static inline void put_swap_device(struct swap_info_struct *si)
>  #define free_pages_and_swap_cache(pages, nr) \
>         release_pages((pages), (nr));
>
> -/* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */
> -#define free_swap_and_cache(e) is_pfn_swap_entry(e)
> +static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr)
> +{
> +}
>
>  static inline void free_swap_cache(struct folio *folio)
>  {
> @@ -589,6 +590,11 @@ static inline int add_swap_extent(struct swap_info_struct *sis,
>  }
>  #endif /* CONFIG_SWAP */
>
> +static inline void free_swap_and_cache(swp_entry_t entry)
> +{
> +       free_swap_and_cache_nr(entry, 1);
> +}
> +
>  #ifdef CONFIG_MEMCG
>  static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
>  {
> diff --git a/mm/internal.h b/mm/internal.h
> index 3bdc8693b54f..de68705624b0 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -11,6 +11,8 @@
>  #include <linux/mm.h>
>  #include <linux/pagemap.h>
>  #include <linux/rmap.h>
> +#include <linux/swap.h>
> +#include <linux/swapops.h>
>  #include <linux/tracepoint-defs.h>
>
>  struct folio_batch;
> @@ -189,6 +191,67 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
>
>         return min(ptep - start_ptep, max_nr);
>  }
> +
> +/**
> + * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
> + * @pte: The initial pte state; is_swap_pte(pte) must be true.
> + *
> + * Increments the swap offset, while maintaining all other fields, including
> + * swap type, and any swp pte bits. The resulting pte is returned.
> + */
> +static inline pte_t pte_next_swp_offset(pte_t pte)
> +{
> +       swp_entry_t entry = pte_to_swp_entry(pte);
> +       pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
> +                                                  swp_offset(entry) + 1));
> +
> +       if (pte_swp_soft_dirty(pte))
> +               new = pte_swp_mksoft_dirty(new);
> +       if (pte_swp_exclusive(pte))
> +               new = pte_swp_mkexclusive(new);
> +       if (pte_swp_uffd_wp(pte))
> +               new = pte_swp_mkuffd_wp(new);

I don't quite understand this. If this page table entry is exclusive,
will its subsequent page table entry also be exclusive without
question?
in try_to_unmap_one, exclusive is per-subpage but not per-folio:

                anon_exclusive = folio_test_anon(folio) &&
                                 PageAnonExclusive(subpage);

same questions also for dirty, wp etc.

> +
> +       return new;
> +}
> +
> +/**
> + * swap_pte_batch - detect a PTE batch for a set of contiguous swap entries
> + * @start_ptep: Page table pointer for the first entry.
> + * @max_nr: The maximum number of table entries to consider.
> + * @pte: Page table entry for the first entry.
> + *
> + * Detect a batch of contiguous swap entries: consecutive (non-present) PTEs
> + * containing swap entries all with consecutive offsets and targeting the same
> + * swap type, all with matching swp pte bits.
> + *
> + * max_nr must be at least one and must be limited by the caller so scanning
> + * cannot exceed a single page table.
> + *
> + * Return: the number of table entries in the batch.
> + */
> +static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
> +{
> +       pte_t expected_pte = pte_next_swp_offset(pte);
> +       const pte_t *end_ptep = start_ptep + max_nr;
> +       pte_t *ptep = start_ptep + 1;
> +
> +       VM_WARN_ON(max_nr < 1);
> +       VM_WARN_ON(!is_swap_pte(pte));
> +       VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte)));
> +
> +       while (ptep < end_ptep) {
> +               pte = ptep_get(ptep);
> +
> +               if (!pte_same(pte, expected_pte))
> +                       break;
> +
> +               expected_pte = pte_next_swp_offset(expected_pte);
> +               ptep++;
> +       }
> +
> +       return ptep - start_ptep;
> +}
>  #endif /* CONFIG_MMU */
>
>  void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 1f77a51baaac..5011ecb24344 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -628,6 +628,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
>         struct folio *folio;
>         int nr_swap = 0;
>         unsigned long next;
> +       int nr, max_nr;
>
>         next = pmd_addr_end(addr, end);
>         if (pmd_trans_huge(*pmd))
> @@ -640,7 +641,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
>                 return 0;
>         flush_tlb_batched_pending(mm);
>         arch_enter_lazy_mmu_mode();
> -       for (; addr != end; pte++, addr += PAGE_SIZE) {
> +       for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
> +               nr = 1;
>                 ptent = ptep_get(pte);
>
>                 if (pte_none(ptent))
> @@ -655,9 +657,11 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
>
>                         entry = pte_to_swp_entry(ptent);
>                         if (!non_swap_entry(entry)) {
> -                               nr_swap--;
> -                               free_swap_and_cache(entry);
> -                               pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
> +                               max_nr = (end - addr) / PAGE_SIZE;
> +                               nr = swap_pte_batch(pte, max_nr, ptent);
> +                               nr_swap -= nr;
> +                               free_swap_and_cache_nr(entry, nr);
> +                               clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
>                         } else if (is_hwpoison_entry(entry) ||
>                                    is_poisoned_swp_entry(entry)) {
>                                 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
> diff --git a/mm/memory.c b/mm/memory.c
> index b98e4d907a14..0db2aa066a5a 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1637,12 +1637,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
>                                 folio_remove_rmap_pte(folio, page, vma);
>                         folio_put(folio);
>                 } else if (!non_swap_entry(entry)) {
> -                       /* Genuine swap entry, hence a private anon page */
> +                       max_nr = (end - addr) / PAGE_SIZE;
> +                       nr = swap_pte_batch(pte, max_nr, ptent);
> +                       /* Genuine swap entries, hence private anon pages */
>                         if (!should_zap_cows(details))
>                                 continue;
> -                       rss[MM_SWAPENTS]--;
> -                       if (unlikely(!free_swap_and_cache(entry)))
> -                               print_bad_pte(vma, addr, ptent, NULL);
> +                       rss[MM_SWAPENTS] -= nr;
> +                       free_swap_and_cache_nr(entry, nr);
>                 } else if (is_migration_entry(entry)) {
>                         folio = pfn_swap_entry_folio(entry);
>                         if (!should_zap_folio(details, folio))
> @@ -1665,8 +1666,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
>                         pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
>                         WARN_ON_ONCE(1);
>                 }
> -               pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
> -               zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent);
> +               clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
> +               zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent);
>         } while (pte += nr, addr += PAGE_SIZE * nr, addr != end);
>
>         add_mm_rss_vec(mm, rss);
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 1ded6d1dcab4..20c45757f2b2 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -130,7 +130,11 @@ static inline unsigned char swap_count(unsigned char ent)
>  /* Reclaim the swap entry if swap is getting full*/
>  #define TTRS_FULL              0x4
>
> -/* returns 1 if swap entry is freed */
> +/*
> + * returns number of pages in the folio that backs the swap entry. If positive,
> + * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
> + * folio was associated with the swap entry.
> + */
>  static int __try_to_reclaim_swap(struct swap_info_struct *si,
>                                  unsigned long offset, unsigned long flags)
>  {
> @@ -155,6 +159,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
>                         ret = folio_free_swap(folio);
>                 folio_unlock(folio);
>         }
> +       ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio);
>         folio_put(folio);
>         return ret;
>  }
> @@ -895,7 +900,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
>                 swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
>                 spin_lock(&si->lock);
>                 /* entry was freed successfully, try to use this again */
> -               if (swap_was_freed)
> +               if (swap_was_freed > 0)
>                         goto checks;
>                 goto scan; /* check next one */
>         }
> @@ -1572,32 +1577,88 @@ bool folio_free_swap(struct folio *folio)
>         return true;
>  }
>
> -/*
> - * Free the swap entry like above, but also try to
> - * free the page cache entry if it is the last user.
> +/**
> + * free_swap_and_cache_nr() - Release reference on range of swap entries and
> + *                            reclaim their cache if no more references remain.
> + * @entry: First entry of range.
> + * @nr: Number of entries in range.
> + *
> + * For each swap entry in the contiguous range, release a reference. If any swap
> + * entries become free, try to reclaim their underlying folios, if present. The
> + * offset range is defined by [entry.offset, entry.offset + nr).
>   */
> -int free_swap_and_cache(swp_entry_t entry)
> +void free_swap_and_cache_nr(swp_entry_t entry, int nr)
>  {
> -       struct swap_info_struct *p;
> +       const unsigned long start_offset = swp_offset(entry);
> +       const unsigned long end_offset = start_offset + nr;
> +       unsigned int type = swp_type(entry);
> +       struct swap_info_struct *si;
> +       bool any_only_cache = false;
> +       unsigned long offset;
>         unsigned char count;
>
>         if (non_swap_entry(entry))
> -               return 1;
> +               return;
>
> -       p = get_swap_device(entry);
> -       if (p) {
> -               if (WARN_ON(data_race(!p->swap_map[swp_offset(entry)]))) {
> -                       put_swap_device(p);
> -                       return 0;
> +       si = get_swap_device(entry);
> +       if (!si)
> +               return;
> +
> +       if (WARN_ON(end_offset > si->max))
> +               goto out;
> +
> +       /*
> +        * First free all entries in the range.
> +        */
> +       for (offset = start_offset; offset < end_offset; offset++) {
> +               if (data_race(si->swap_map[offset])) {
> +                       count = __swap_entry_free(si, swp_entry(type, offset));
> +                       if (count == SWAP_HAS_CACHE)
> +                               any_only_cache = true;
> +               } else {
> +                       WARN_ON_ONCE(1);
>                 }
> +       }
> +
> +       /*
> +        * Short-circuit the below loop if none of the entries had their
> +        * reference drop to zero.
> +        */
> +       if (!any_only_cache)
> +               goto out;
>
> -               count = __swap_entry_free(p, entry);
> -               if (count == SWAP_HAS_CACHE)
> -                       __try_to_reclaim_swap(p, swp_offset(entry),
> +       /*
> +        * Now go back over the range trying to reclaim the swap cache. This is
> +        * more efficient for large folios because we will only try to reclaim
> +        * the swap once per folio in the common case. If we do
> +        * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the
> +        * latter will get a reference and lock the folio for every individual
> +        * page but will only succeed once the swap slot for every subpage is
> +        * zero.
> +        */
> +       for (offset = start_offset; offset < end_offset; offset += nr) {
> +               nr = 1;
> +               if (READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
> +                       /*
> +                        * Folios are always naturally aligned in swap so
> +                        * advance forward to the next boundary. Zero means no
> +                        * folio was found for the swap entry, so advance by 1
> +                        * in this case. Negative value means folio was found
> +                        * but could not be reclaimed. Here we can still advance
> +                        * to the next boundary.
> +                        */
> +                       nr = __try_to_reclaim_swap(si, offset,
>                                               TTRS_UNMAPPED | TTRS_FULL);
> -               put_swap_device(p);
> +                       if (nr == 0)
> +                               nr = 1;
> +                       else if (nr < 0)
> +                               nr = -nr;
> +                       nr = ALIGN(offset + 1, nr) - offset;
> +               }
>         }
> -       return p != NULL;
> +
> +out:
> +       put_swap_device(si);
>  }
>
>  #ifdef CONFIG_HIBERNATION
> --
> 2.25.1
>

Thanks
Barry
Barry Song April 9, 2024, 9:22 a.m. UTC | #3
On Tue, Apr 9, 2024 at 8:51 PM Barry Song <21cnbao@gmail.com> wrote:
>
> On Tue, Apr 9, 2024 at 6:40 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >
> > Now that we no longer have a convenient flag in the cluster to determine
> > if a folio is large, free_swap_and_cache() will take a reference and
> > lock a large folio much more often, which could lead to contention and
> > (e.g.) failure to split large folios, etc.
> >
> > Let's solve that problem by batch freeing swap and cache with a new
> > function, free_swap_and_cache_nr(), to free a contiguous range of swap
> > entries together. This allows us to first drop a reference to each swap
> > slot before we try to release the cache folio. This means we only try to
> > release the folio once, only taking the reference and lock once - much
> > better than the previous 512 times for the 2M THP case.
> >
> > Contiguous swap entries are gathered in zap_pte_range() and
> > madvise_free_pte_range() in a similar way to how present ptes are
> > already gathered in zap_pte_range().
> >
> > While we are at it, let's simplify by converting the return type of both
> > functions to void. The return value was used only by zap_pte_range() to
> > print a bad pte, and was ignored by everyone else, so the extra
> > reporting wasn't exactly guaranteed. We will still get the warning with
> > most of the information from get_swap_device(). With the batch version,
> > we wouldn't know which pte was bad anyway so could print the wrong one.
> >
> > Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
> > ---
> >  include/linux/pgtable.h | 29 ++++++++++++
> >  include/linux/swap.h    | 12 +++--
> >  mm/internal.h           | 63 ++++++++++++++++++++++++++
> >  mm/madvise.c            | 12 +++--
> >  mm/memory.c             | 13 +++---
> >  mm/swapfile.c           | 97 +++++++++++++++++++++++++++++++++--------
> >  6 files changed, 195 insertions(+), 31 deletions(-)
> >
> > diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> > index a3fc8150b047..75096025fe52 100644
> > --- a/include/linux/pgtable.h
> > +++ b/include/linux/pgtable.h
> > @@ -708,6 +708,35 @@ static inline void pte_clear_not_present_full(struct mm_struct *mm,
> >  }
> >  #endif
> >
> > +#ifndef clear_not_present_full_ptes
> > +/**
> > + * clear_not_present_full_ptes - Clear multiple not present PTEs which are
> > + *                              consecutive in the pgtable.
> > + * @mm: Address space the ptes represent.
> > + * @addr: Address of the first pte.
> > + * @ptep: Page table pointer for the first entry.
> > + * @nr: Number of entries to clear.
> > + * @full: Whether we are clearing a full mm.
> > + *
> > + * May be overridden by the architecture; otherwise, implemented as a simple
> > + * loop over pte_clear_not_present_full().
> > + *
> > + * Context: The caller holds the page table lock.  The PTEs are all not present.
> > + * The PTEs are all in the same PMD.
> > + */
> > +static inline void clear_not_present_full_ptes(struct mm_struct *mm,
> > +               unsigned long addr, pte_t *ptep, unsigned int nr, int full)
> > +{
> > +       for (;;) {
> > +               pte_clear_not_present_full(mm, addr, ptep, full);
> > +               if (--nr == 0)
> > +                       break;
> > +               ptep++;
> > +               addr += PAGE_SIZE;
> > +       }
> > +}
> > +#endif
> > +
> >  #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
> >  extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
> >                               unsigned long address,
> > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > index f6f78198f000..5737236dc3ce 100644
> > --- a/include/linux/swap.h
> > +++ b/include/linux/swap.h
> > @@ -471,7 +471,7 @@ extern int swap_duplicate(swp_entry_t);
> >  extern int swapcache_prepare(swp_entry_t);
> >  extern void swap_free(swp_entry_t);
> >  extern void swapcache_free_entries(swp_entry_t *entries, int n);
> > -extern int free_swap_and_cache(swp_entry_t);
> > +extern void free_swap_and_cache_nr(swp_entry_t entry, int nr);
> >  int swap_type_of(dev_t device, sector_t offset);
> >  int find_first_swap(dev_t *device);
> >  extern unsigned int count_swap_pages(int, int);
> > @@ -520,8 +520,9 @@ static inline void put_swap_device(struct swap_info_struct *si)
> >  #define free_pages_and_swap_cache(pages, nr) \
> >         release_pages((pages), (nr));
> >
> > -/* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */
> > -#define free_swap_and_cache(e) is_pfn_swap_entry(e)
> > +static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr)
> > +{
> > +}
> >
> >  static inline void free_swap_cache(struct folio *folio)
> >  {
> > @@ -589,6 +590,11 @@ static inline int add_swap_extent(struct swap_info_struct *sis,
> >  }
> >  #endif /* CONFIG_SWAP */
> >
> > +static inline void free_swap_and_cache(swp_entry_t entry)
> > +{
> > +       free_swap_and_cache_nr(entry, 1);
> > +}
> > +
> >  #ifdef CONFIG_MEMCG
> >  static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
> >  {
> > diff --git a/mm/internal.h b/mm/internal.h
> > index 3bdc8693b54f..de68705624b0 100644
> > --- a/mm/internal.h
> > +++ b/mm/internal.h
> > @@ -11,6 +11,8 @@
> >  #include <linux/mm.h>
> >  #include <linux/pagemap.h>
> >  #include <linux/rmap.h>
> > +#include <linux/swap.h>
> > +#include <linux/swapops.h>
> >  #include <linux/tracepoint-defs.h>
> >
> >  struct folio_batch;
> > @@ -189,6 +191,67 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
> >
> >         return min(ptep - start_ptep, max_nr);
> >  }
> > +
> > +/**
> > + * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
> > + * @pte: The initial pte state; is_swap_pte(pte) must be true.
> > + *
> > + * Increments the swap offset, while maintaining all other fields, including
> > + * swap type, and any swp pte bits. The resulting pte is returned.
> > + */
> > +static inline pte_t pte_next_swp_offset(pte_t pte)
> > +{
> > +       swp_entry_t entry = pte_to_swp_entry(pte);
> > +       pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
> > +                                                  swp_offset(entry) + 1));
> > +
> > +       if (pte_swp_soft_dirty(pte))
> > +               new = pte_swp_mksoft_dirty(new);
> > +       if (pte_swp_exclusive(pte))
> > +               new = pte_swp_mkexclusive(new);
> > +       if (pte_swp_uffd_wp(pte))
> > +               new = pte_swp_mkuffd_wp(new);
>
> I don't quite understand this. If this page table entry is exclusive,
> will its subsequent page table entry also be exclusive without
> question?
> in try_to_unmap_one, exclusive is per-subpage but not per-folio:
>
>                 anon_exclusive = folio_test_anon(folio) &&
>                                  PageAnonExclusive(subpage);
>
> same questions also for dirty, wp etc.

Sorry for the noise. You are right. Based on your new version, I think I should
entirely drop:

[PATCH v2 3/5] mm: swap_pte_batch: add an output argument to reture if
all swap entries are exclusive

https://lore.kernel.org/linux-mm/20240409082631.187483-4-21cnbao@gmail.com/

>
> > +
> > +       return new;
> > +}
> > +
> > +/**
> > + * swap_pte_batch - detect a PTE batch for a set of contiguous swap entries
> > + * @start_ptep: Page table pointer for the first entry.
> > + * @max_nr: The maximum number of table entries to consider.
> > + * @pte: Page table entry for the first entry.
> > + *
> > + * Detect a batch of contiguous swap entries: consecutive (non-present) PTEs
> > + * containing swap entries all with consecutive offsets and targeting the same
> > + * swap type, all with matching swp pte bits.
> > + *
> > + * max_nr must be at least one and must be limited by the caller so scanning
> > + * cannot exceed a single page table.
> > + *
> > + * Return: the number of table entries in the batch.
> > + */
> > +static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
> > +{
> > +       pte_t expected_pte = pte_next_swp_offset(pte);
> > +       const pte_t *end_ptep = start_ptep + max_nr;
> > +       pte_t *ptep = start_ptep + 1;
> > +
> > +       VM_WARN_ON(max_nr < 1);
> > +       VM_WARN_ON(!is_swap_pte(pte));
> > +       VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte)));
> > +
> > +       while (ptep < end_ptep) {
> > +               pte = ptep_get(ptep);
> > +
> > +               if (!pte_same(pte, expected_pte))
> > +                       break;
> > +
> > +               expected_pte = pte_next_swp_offset(expected_pte);
> > +               ptep++;
> > +       }
> > +
> > +       return ptep - start_ptep;
> > +}
> >  #endif /* CONFIG_MMU */
> >
> >  void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
> > diff --git a/mm/madvise.c b/mm/madvise.c
> > index 1f77a51baaac..5011ecb24344 100644
> > --- a/mm/madvise.c
> > +++ b/mm/madvise.c
> > @@ -628,6 +628,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
> >         struct folio *folio;
> >         int nr_swap = 0;
> >         unsigned long next;
> > +       int nr, max_nr;
> >
> >         next = pmd_addr_end(addr, end);
> >         if (pmd_trans_huge(*pmd))
> > @@ -640,7 +641,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
> >                 return 0;
> >         flush_tlb_batched_pending(mm);
> >         arch_enter_lazy_mmu_mode();
> > -       for (; addr != end; pte++, addr += PAGE_SIZE) {
> > +       for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
> > +               nr = 1;
> >                 ptent = ptep_get(pte);
> >
> >                 if (pte_none(ptent))
> > @@ -655,9 +657,11 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
> >
> >                         entry = pte_to_swp_entry(ptent);
> >                         if (!non_swap_entry(entry)) {
> > -                               nr_swap--;
> > -                               free_swap_and_cache(entry);
> > -                               pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
> > +                               max_nr = (end - addr) / PAGE_SIZE;
> > +                               nr = swap_pte_batch(pte, max_nr, ptent);
> > +                               nr_swap -= nr;
> > +                               free_swap_and_cache_nr(entry, nr);
> > +                               clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
> >                         } else if (is_hwpoison_entry(entry) ||
> >                                    is_poisoned_swp_entry(entry)) {
> >                                 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
> > diff --git a/mm/memory.c b/mm/memory.c
> > index b98e4d907a14..0db2aa066a5a 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -1637,12 +1637,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
> >                                 folio_remove_rmap_pte(folio, page, vma);
> >                         folio_put(folio);
> >                 } else if (!non_swap_entry(entry)) {
> > -                       /* Genuine swap entry, hence a private anon page */
> > +                       max_nr = (end - addr) / PAGE_SIZE;
> > +                       nr = swap_pte_batch(pte, max_nr, ptent);
> > +                       /* Genuine swap entries, hence private anon pages */
> >                         if (!should_zap_cows(details))
> >                                 continue;
> > -                       rss[MM_SWAPENTS]--;
> > -                       if (unlikely(!free_swap_and_cache(entry)))
> > -                               print_bad_pte(vma, addr, ptent, NULL);
> > +                       rss[MM_SWAPENTS] -= nr;
> > +                       free_swap_and_cache_nr(entry, nr);
> >                 } else if (is_migration_entry(entry)) {
> >                         folio = pfn_swap_entry_folio(entry);
> >                         if (!should_zap_folio(details, folio))
> > @@ -1665,8 +1666,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
> >                         pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
> >                         WARN_ON_ONCE(1);
> >                 }
> > -               pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
> > -               zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent);
> > +               clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
> > +               zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent);
> >         } while (pte += nr, addr += PAGE_SIZE * nr, addr != end);
> >
> >         add_mm_rss_vec(mm, rss);
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index 1ded6d1dcab4..20c45757f2b2 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -130,7 +130,11 @@ static inline unsigned char swap_count(unsigned char ent)
> >  /* Reclaim the swap entry if swap is getting full*/
> >  #define TTRS_FULL              0x4
> >
> > -/* returns 1 if swap entry is freed */
> > +/*
> > + * returns number of pages in the folio that backs the swap entry. If positive,
> > + * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
> > + * folio was associated with the swap entry.
> > + */
> >  static int __try_to_reclaim_swap(struct swap_info_struct *si,
> >                                  unsigned long offset, unsigned long flags)
> >  {
> > @@ -155,6 +159,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
> >                         ret = folio_free_swap(folio);
> >                 folio_unlock(folio);
> >         }
> > +       ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio);
> >         folio_put(folio);
> >         return ret;
> >  }
> > @@ -895,7 +900,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
> >                 swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
> >                 spin_lock(&si->lock);
> >                 /* entry was freed successfully, try to use this again */
> > -               if (swap_was_freed)
> > +               if (swap_was_freed > 0)
> >                         goto checks;
> >                 goto scan; /* check next one */
> >         }
> > @@ -1572,32 +1577,88 @@ bool folio_free_swap(struct folio *folio)
> >         return true;
> >  }
> >
> > -/*
> > - * Free the swap entry like above, but also try to
> > - * free the page cache entry if it is the last user.
> > +/**
> > + * free_swap_and_cache_nr() - Release reference on range of swap entries and
> > + *                            reclaim their cache if no more references remain.
> > + * @entry: First entry of range.
> > + * @nr: Number of entries in range.
> > + *
> > + * For each swap entry in the contiguous range, release a reference. If any swap
> > + * entries become free, try to reclaim their underlying folios, if present. The
> > + * offset range is defined by [entry.offset, entry.offset + nr).
> >   */
> > -int free_swap_and_cache(swp_entry_t entry)
> > +void free_swap_and_cache_nr(swp_entry_t entry, int nr)
> >  {
> > -       struct swap_info_struct *p;
> > +       const unsigned long start_offset = swp_offset(entry);
> > +       const unsigned long end_offset = start_offset + nr;
> > +       unsigned int type = swp_type(entry);
> > +       struct swap_info_struct *si;
> > +       bool any_only_cache = false;
> > +       unsigned long offset;
> >         unsigned char count;
> >
> >         if (non_swap_entry(entry))
> > -               return 1;
> > +               return;
> >
> > -       p = get_swap_device(entry);
> > -       if (p) {
> > -               if (WARN_ON(data_race(!p->swap_map[swp_offset(entry)]))) {
> > -                       put_swap_device(p);
> > -                       return 0;
> > +       si = get_swap_device(entry);
> > +       if (!si)
> > +               return;
> > +
> > +       if (WARN_ON(end_offset > si->max))
> > +               goto out;
> > +
> > +       /*
> > +        * First free all entries in the range.
> > +        */
> > +       for (offset = start_offset; offset < end_offset; offset++) {
> > +               if (data_race(si->swap_map[offset])) {
> > +                       count = __swap_entry_free(si, swp_entry(type, offset));
> > +                       if (count == SWAP_HAS_CACHE)
> > +                               any_only_cache = true;
> > +               } else {
> > +                       WARN_ON_ONCE(1);
> >                 }
> > +       }
> > +
> > +       /*
> > +        * Short-circuit the below loop if none of the entries had their
> > +        * reference drop to zero.
> > +        */
> > +       if (!any_only_cache)
> > +               goto out;
> >
> > -               count = __swap_entry_free(p, entry);
> > -               if (count == SWAP_HAS_CACHE)
> > -                       __try_to_reclaim_swap(p, swp_offset(entry),
> > +       /*
> > +        * Now go back over the range trying to reclaim the swap cache. This is
> > +        * more efficient for large folios because we will only try to reclaim
> > +        * the swap once per folio in the common case. If we do
> > +        * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the
> > +        * latter will get a reference and lock the folio for every individual
> > +        * page but will only succeed once the swap slot for every subpage is
> > +        * zero.
> > +        */
> > +       for (offset = start_offset; offset < end_offset; offset += nr) {
> > +               nr = 1;
> > +               if (READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
> > +                       /*
> > +                        * Folios are always naturally aligned in swap so
> > +                        * advance forward to the next boundary. Zero means no
> > +                        * folio was found for the swap entry, so advance by 1
> > +                        * in this case. Negative value means folio was found
> > +                        * but could not be reclaimed. Here we can still advance
> > +                        * to the next boundary.
> > +                        */
> > +                       nr = __try_to_reclaim_swap(si, offset,
> >                                               TTRS_UNMAPPED | TTRS_FULL);
> > -               put_swap_device(p);
> > +                       if (nr == 0)
> > +                               nr = 1;
> > +                       else if (nr < 0)
> > +                               nr = -nr;
> > +                       nr = ALIGN(offset + 1, nr) - offset;
> > +               }
> >         }
> > -       return p != NULL;
> > +
> > +out:
> > +       put_swap_device(si);
> >  }
> >
> >  #ifdef CONFIG_HIBERNATION
> > --
> > 2.25.1
> >
>
> Thanks
> Barry
David Hildenbrand April 9, 2024, 9:24 a.m. UTC | #4
On 09.04.24 11:22, Barry Song wrote:
> On Tue, Apr 9, 2024 at 8:51 PM Barry Song <21cnbao@gmail.com> wrote:
>>
>> On Tue, Apr 9, 2024 at 6:40 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>
>>> Now that we no longer have a convenient flag in the cluster to determine
>>> if a folio is large, free_swap_and_cache() will take a reference and
>>> lock a large folio much more often, which could lead to contention and
>>> (e.g.) failure to split large folios, etc.
>>>
>>> Let's solve that problem by batch freeing swap and cache with a new
>>> function, free_swap_and_cache_nr(), to free a contiguous range of swap
>>> entries together. This allows us to first drop a reference to each swap
>>> slot before we try to release the cache folio. This means we only try to
>>> release the folio once, only taking the reference and lock once - much
>>> better than the previous 512 times for the 2M THP case.
>>>
>>> Contiguous swap entries are gathered in zap_pte_range() and
>>> madvise_free_pte_range() in a similar way to how present ptes are
>>> already gathered in zap_pte_range().
>>>
>>> While we are at it, let's simplify by converting the return type of both
>>> functions to void. The return value was used only by zap_pte_range() to
>>> print a bad pte, and was ignored by everyone else, so the extra
>>> reporting wasn't exactly guaranteed. We will still get the warning with
>>> most of the information from get_swap_device(). With the batch version,
>>> we wouldn't know which pte was bad anyway so could print the wrong one.
>>>
>>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>>> ---
>>>   include/linux/pgtable.h | 29 ++++++++++++
>>>   include/linux/swap.h    | 12 +++--
>>>   mm/internal.h           | 63 ++++++++++++++++++++++++++
>>>   mm/madvise.c            | 12 +++--
>>>   mm/memory.c             | 13 +++---
>>>   mm/swapfile.c           | 97 +++++++++++++++++++++++++++++++++--------
>>>   6 files changed, 195 insertions(+), 31 deletions(-)
>>>
>>> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
>>> index a3fc8150b047..75096025fe52 100644
>>> --- a/include/linux/pgtable.h
>>> +++ b/include/linux/pgtable.h
>>> @@ -708,6 +708,35 @@ static inline void pte_clear_not_present_full(struct mm_struct *mm,
>>>   }
>>>   #endif
>>>
>>> +#ifndef clear_not_present_full_ptes
>>> +/**
>>> + * clear_not_present_full_ptes - Clear multiple not present PTEs which are
>>> + *                              consecutive in the pgtable.
>>> + * @mm: Address space the ptes represent.
>>> + * @addr: Address of the first pte.
>>> + * @ptep: Page table pointer for the first entry.
>>> + * @nr: Number of entries to clear.
>>> + * @full: Whether we are clearing a full mm.
>>> + *
>>> + * May be overridden by the architecture; otherwise, implemented as a simple
>>> + * loop over pte_clear_not_present_full().
>>> + *
>>> + * Context: The caller holds the page table lock.  The PTEs are all not present.
>>> + * The PTEs are all in the same PMD.
>>> + */
>>> +static inline void clear_not_present_full_ptes(struct mm_struct *mm,
>>> +               unsigned long addr, pte_t *ptep, unsigned int nr, int full)
>>> +{
>>> +       for (;;) {
>>> +               pte_clear_not_present_full(mm, addr, ptep, full);
>>> +               if (--nr == 0)
>>> +                       break;
>>> +               ptep++;
>>> +               addr += PAGE_SIZE;
>>> +       }
>>> +}
>>> +#endif
>>> +
>>>   #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
>>>   extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
>>>                                unsigned long address,
>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>> index f6f78198f000..5737236dc3ce 100644
>>> --- a/include/linux/swap.h
>>> +++ b/include/linux/swap.h
>>> @@ -471,7 +471,7 @@ extern int swap_duplicate(swp_entry_t);
>>>   extern int swapcache_prepare(swp_entry_t);
>>>   extern void swap_free(swp_entry_t);
>>>   extern void swapcache_free_entries(swp_entry_t *entries, int n);
>>> -extern int free_swap_and_cache(swp_entry_t);
>>> +extern void free_swap_and_cache_nr(swp_entry_t entry, int nr);
>>>   int swap_type_of(dev_t device, sector_t offset);
>>>   int find_first_swap(dev_t *device);
>>>   extern unsigned int count_swap_pages(int, int);
>>> @@ -520,8 +520,9 @@ static inline void put_swap_device(struct swap_info_struct *si)
>>>   #define free_pages_and_swap_cache(pages, nr) \
>>>          release_pages((pages), (nr));
>>>
>>> -/* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */
>>> -#define free_swap_and_cache(e) is_pfn_swap_entry(e)
>>> +static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr)
>>> +{
>>> +}
>>>
>>>   static inline void free_swap_cache(struct folio *folio)
>>>   {
>>> @@ -589,6 +590,11 @@ static inline int add_swap_extent(struct swap_info_struct *sis,
>>>   }
>>>   #endif /* CONFIG_SWAP */
>>>
>>> +static inline void free_swap_and_cache(swp_entry_t entry)
>>> +{
>>> +       free_swap_and_cache_nr(entry, 1);
>>> +}
>>> +
>>>   #ifdef CONFIG_MEMCG
>>>   static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
>>>   {
>>> diff --git a/mm/internal.h b/mm/internal.h
>>> index 3bdc8693b54f..de68705624b0 100644
>>> --- a/mm/internal.h
>>> +++ b/mm/internal.h
>>> @@ -11,6 +11,8 @@
>>>   #include <linux/mm.h>
>>>   #include <linux/pagemap.h>
>>>   #include <linux/rmap.h>
>>> +#include <linux/swap.h>
>>> +#include <linux/swapops.h>
>>>   #include <linux/tracepoint-defs.h>
>>>
>>>   struct folio_batch;
>>> @@ -189,6 +191,67 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
>>>
>>>          return min(ptep - start_ptep, max_nr);
>>>   }
>>> +
>>> +/**
>>> + * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
>>> + * @pte: The initial pte state; is_swap_pte(pte) must be true.
>>> + *
>>> + * Increments the swap offset, while maintaining all other fields, including
>>> + * swap type, and any swp pte bits. The resulting pte is returned.
>>> + */
>>> +static inline pte_t pte_next_swp_offset(pte_t pte)
>>> +{
>>> +       swp_entry_t entry = pte_to_swp_entry(pte);
>>> +       pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
>>> +                                                  swp_offset(entry) + 1));
>>> +
>>> +       if (pte_swp_soft_dirty(pte))
>>> +               new = pte_swp_mksoft_dirty(new);
>>> +       if (pte_swp_exclusive(pte))
>>> +               new = pte_swp_mkexclusive(new);
>>> +       if (pte_swp_uffd_wp(pte))
>>> +               new = pte_swp_mkuffd_wp(new);
>>
>> I don't quite understand this. If this page table entry is exclusive,
>> will its subsequent page table entry also be exclusive without
>> question?
>> in try_to_unmap_one, exclusive is per-subpage but not per-folio:
>>
>>                  anon_exclusive = folio_test_anon(folio) &&
>>                                   PageAnonExclusive(subpage);
>>
>> same questions also for diry, wp etc.
> 
> Sorry for the noise. you are right. based on your new version, I think I should
> entirely drop:
> 
> [PATCH v2 3/5] mm: swap_pte_batch: add an output argument to reture if
> all swap entries are exclusive

Yes. If we ever want to ignore some bits, we should likely add flags to 
change the behavior, like for folio_pte_batch().

For swapin, you really want the exclusive bits to match, though. 
Soft-dirty and uffd-wp as well, at least initially for simplicity.
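
[As a rough sketch of what such a flags-based variant could look like, modelled loosely on how folio_pte_batch() takes fpb_t flags: the spb_t type, the SPB_IGNORE_* names and swap_pte_batch_flags() below are all invented for illustration and do not exist in the posted series. The exclusive bit is deliberately left out of the ignorable set, per the comment above.]

typedef unsigned int spb_t;

#define SPB_IGNORE_SOFT_DIRTY	0x1
#define SPB_IGNORE_UFFD_WP	0x2

/* Strip the swp pte bits the caller asked us to ignore before comparing. */
static inline pte_t __spb_normalise(pte_t pte, spb_t flags)
{
	if (flags & SPB_IGNORE_SOFT_DIRTY)
		pte = pte_swp_clear_soft_dirty(pte);
	if (flags & SPB_IGNORE_UFFD_WP)
		pte = pte_swp_clear_uffd_wp(pte);
	return pte;
}

static inline int swap_pte_batch_flags(pte_t *start_ptep, int max_nr,
				       pte_t pte, spb_t flags)
{
	pte_t expected_pte = pte_next_swp_offset(pte);
	const pte_t *end_ptep = start_ptep + max_nr;
	pte_t *ptep = start_ptep + 1;

	while (ptep < end_ptep) {
		pte = ptep_get(ptep);

		if (!pte_same(__spb_normalise(pte, flags),
			      __spb_normalise(expected_pte, flags)))
			break;

		expected_pte = pte_next_swp_offset(expected_pte);
		ptep++;
	}

	return ptep - start_ptep;
}

Passing 0 would preserve today's strict pte_same() behaviour; a hypothetical caller that can tolerate mixed soft-dirty state across the batch would pass SPB_IGNORE_SOFT_DIRTY.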
Barry Song April 9, 2024, 9:41 a.m. UTC | #5
On Tue, Apr 9, 2024 at 9:24 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 09.04.24 11:22, Barry Song wrote:
> > On Tue, Apr 9, 2024 at 8:51 PM Barry Song <21cnbao@gmail.com> wrote:
> >>
> >> On Tue, Apr 9, 2024 at 6:40 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>>
> >>> Now that we no longer have a convenient flag in the cluster to determine
> >>> if a folio is large, free_swap_and_cache() will take a reference and
> >>> lock a large folio much more often, which could lead to contention and
> >>> (e.g.) failure to split large folios, etc.
> >>>
> >>> Let's solve that problem by batch freeing swap and cache with a new
> >>> function, free_swap_and_cache_nr(), to free a contiguous range of swap
> >>> entries together. This allows us to first drop a reference to each swap
> >>> slot before we try to release the cache folio. This means we only try to
> >>> release the folio once, only taking the reference and lock once - much
> >>> better than the previous 512 times for the 2M THP case.
> >>>
> >>> Contiguous swap entries are gathered in zap_pte_range() and
> >>> madvise_free_pte_range() in a similar way to how present ptes are
> >>> already gathered in zap_pte_range().
> >>>
> >>> While we are at it, let's simplify by converting the return type of both
> >>> functions to void. The return value was used only by zap_pte_range() to
> >>> print a bad pte, and was ignored by everyone else, so the extra
> >>> reporting wasn't exactly guaranteed. We will still get the warning with
> >>> most of the information from get_swap_device(). With the batch version,
> >>> we wouldn't know which pte was bad anyway so could print the wrong one.
> >>>
> >>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
> >>> ---
> >>> [...]
> >>> +/**
> >>> + * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
> >>> + * @pte: The initial pte state; is_swap_pte(pte) must be true.
> >>> + *
> >>> + * Increments the swap offset, while maintaining all other fields, including
> >>> + * swap type, and any swp pte bits. The resulting pte is returned.
> >>> + */
> >>> +static inline pte_t pte_next_swp_offset(pte_t pte)
> >>> +{
> >>> +       swp_entry_t entry = pte_to_swp_entry(pte);
> >>> +       pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
> >>> +                                                  swp_offset(entry) + 1));
> >>> +
> >>> +       if (pte_swp_soft_dirty(pte))
> >>> +               new = pte_swp_mksoft_dirty(new);
> >>> +       if (pte_swp_exclusive(pte))
> >>> +               new = pte_swp_mkexclusive(new);
> >>> +       if (pte_swp_uffd_wp(pte))
> >>> +               new = pte_swp_mkuffd_wp(new);
> >>
> >> I don't quite understand this. If this page table entry is exclusive,
> >> will its subsequent page table entry also be exclusive without
> >> question?
> >> in try_to_unmap_one, exclusive is per-subpage but not per-folio:
> >>
> >>                  anon_exclusive = folio_test_anon(folio) &&
> >>                                   PageAnonExclusive(subpage);
> >>
> >> same questions also for diry, wp etc.
> >
> > Sorry for the noise. you are right. based on your new version, I think I should
> > entirely drop:
> >
> > [PATCH v2 3/5] mm: swap_pte_batch: add an output argument to reture if
> > all swap entries are exclusive
>
> Yes. If we ever want to ignore some bits, we should likely add flags to
> change the behavior, like for folio_pte_batch().
>
> For swapin, you really want the exclusive bits to match, though.

I am not quite sure I definitely need the exclusive bits to match. I can either
drop my 3/5 or ignore the exclusive bit as below (if any entry is shared, swapin
won't reuse the large folio, but it can still map it entirely read-only):

diff --git a/mm/internal.h b/mm/internal.h
index cae39c372bfc..5726e729c9ee 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -253,10 +253,22 @@ static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte,
                *any_shared |= !pte_swp_exclusive(pte);

        while (ptep < end_ptep) {
+               pte_t ignore_exclusive_pte;
+               pte_t ignore_exclusive_expected_pte;
                pte = ptep_get(ptep);

-               if (!pte_same(pte, expected_pte))
-                       break;
+               if (any_shared) {
+                       ignore_exclusive_pte = pte;
+                       ignore_exclusive_expected_pte = expected_pte;
+                       ignore_exclusive_pte = pte_swp_clear_exclusive(ignore_exclusive_pte);
+                       ignore_exclusive_expected_pte = pte_swp_clear_exclusive(expected_pte);
+
+                       if (!pte_same(ignore_exclusive_pte, ignore_exclusive_expected_pte))
+                               break;
+               } else {
+                       if (!pte_same(pte, expected_pte))
+                               break;
+               }

                if (any_shared)
                        *any_shared |= !pte_swp_exclusive(pte);

> softdirty and uffd-wp as well at least initially for simplicity.

yes for this.

By the way, I wonder if you and Ryan have a moment to review swpin
refault patchset
v2 :-)

[PATCH v2 0/5] large folios swap-in: handle refault cases first
https://lore.kernel.org/linux-mm/20240409082631.187483-1-21cnbao@gmail.com/


>
> --
> Cheers,
>
> David / dhildenb
>

Thanks
Barry
Ryan Roberts April 9, 2024, 9:55 a.m. UTC | #6
On 09/04/2024 10:41, Barry Song wrote:
> On Tue, Apr 9, 2024 at 9:24 PM David Hildenbrand <david@redhat.com> wrote:
>>
>> On 09.04.24 11:22, Barry Song wrote:
>>> On Tue, Apr 9, 2024 at 8:51 PM Barry Song <21cnbao@gmail.com> wrote:
>>>>
>>>> On Tue, Apr 9, 2024 at 6:40 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>>>
>>>>> [...]
>>>>> +/**
>>>>> + * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
>>>>> + * @pte: The initial pte state; is_swap_pte(pte) must be true.
>>>>> + *
>>>>> + * Increments the swap offset, while maintaining all other fields, including
>>>>> + * swap type, and any swp pte bits. The resulting pte is returned.
>>>>> + */
>>>>> +static inline pte_t pte_next_swp_offset(pte_t pte)
>>>>> +{
>>>>> +       swp_entry_t entry = pte_to_swp_entry(pte);
>>>>> +       pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
>>>>> +                                                  swp_offset(entry) + 1));
>>>>> +
>>>>> +       if (pte_swp_soft_dirty(pte))
>>>>> +               new = pte_swp_mksoft_dirty(new);
>>>>> +       if (pte_swp_exclusive(pte))
>>>>> +               new = pte_swp_mkexclusive(new);
>>>>> +       if (pte_swp_uffd_wp(pte))
>>>>> +               new = pte_swp_mkuffd_wp(new);
>>>>
>>>> I don't quite understand this. If this page table entry is exclusive,
>>>> will its subsequent page table entry also be exclusive without
>>>> question?
>>>> in try_to_unmap_one, exclusive is per-subpage but not per-folio:
>>>>
>>>>                  anon_exclusive = folio_test_anon(folio) &&
>>>>                                   PageAnonExclusive(subpage);
>>>>
>>>> same questions also for diry, wp etc.
>>>
>>> Sorry for the noise. you are right. based on your new version, I think I should
>>> entirely drop:
>>>
>>> [PATCH v2 3/5] mm: swap_pte_batch: add an output argument to reture if
>>> all swap entries are exclusive
>>
>> Yes. If we ever want to ignore some bits, we should likely add flags to
>> change the behavior, like for folio_pte_batch().
>>
>> For swapin, you really want the exclusive bits to match, though.
> 
> I am not quite sure I definitely need exclusive bits to match. i can either
> drop my 3/5 or ignore the exclusive bit as below (if anyone is not shared,
> swpin won't reuse the large folio, but it can still entirely map it read-only):
> 
> diff --git a/mm/internal.h b/mm/internal.h
> index cae39c372bfc..5726e729c9ee 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -253,10 +253,22 @@ static inline int swap_pte_batch(pte_t
> *start_ptep, int max_nr, pte_t pte,
>                 *any_shared |= !pte_swp_exclusive(pte);
> 
>         while (ptep < end_ptep) {
> +               pte_t ignore_exclusive_pte;
> +               pte_t ignore_exclusive_expected_pte;
>                 pte = ptep_get(ptep);
> 
> -               if (!pte_same(pte, expected_pte))
> -                       break;
> +               if (any_shared) {
> +                       ignore_exclusive_pte = pte;
> +                       ignore_exclusive_expected_pte = expected_pte;
> +                       ignore_exclusive_pte =
> pte_swp_clear_exclusive(ignore_exclusive_pte);
> +                       ignore_exclusive_expected_pte =
> pte_swp_clear_exclusive(expected_pte);
> +
> +                       if (!pte_same(ignore_exclusive_pte,
> ignore_exclusive_expected_pte))
> +                               break;
> +               } else {
> +                       if (!pte_same(pte, expected_pte))
> +                               break;
> +               }
> 
>                 if (any_shared)
>                         *any_shared |= !pte_swp_exclusive(pte);

I'll leave David to comment on this proposal; I'm not sure I understand all the
details. The code change does look a bit "busy" though - sometimes that can be
an indicator :)

> 
>> softdirty and uffd-wp as well at least initially for simplicity.
> 
> yes for this.
> 
> By the way, I wonder if you and Ryan have a moment to review swpin
> refault patchset
> v2 :-)

It's on my todo list! I'm very keen to get as much large swap-out and swap-in
support into v6.10 as we can. Hoping to get to it in the next couple of days.

> 
> [PATCH v2 0/5] large folios swap-in: handle refault cases first
> https://lore.kernel.org/linux-mm/20240409082631.187483-1-21cnbao@gmail.com/
> 
> 
>>
>> --
>> Cheers,
>>
>> David / dhildenb
>>
> 
> Thanks
> Barry
Barry Song April 9, 2024, 10:29 a.m. UTC | #7
On Tue, Apr 9, 2024 at 9:55 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> On 09/04/2024 10:41, Barry Song wrote:
> > On Tue, Apr 9, 2024 at 9:24 PM David Hildenbrand <david@redhat.com> wrote:
> >>
> >> On 09.04.24 11:22, Barry Song wrote:
> >>> On Tue, Apr 9, 2024 at 8:51 PM Barry Song <21cnbao@gmail.com> wrote:
> >>>>
> >>>> On Tue, Apr 9, 2024 at 6:40 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>>>>
> >>>>> [...]
> >>>>> +/**
> >>>>> + * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
> >>>>> + * @pte: The initial pte state; is_swap_pte(pte) must be true.
> >>>>> + *
> >>>>> + * Increments the swap offset, while maintaining all other fields, including
> >>>>> + * swap type, and any swp pte bits. The resulting pte is returned.
> >>>>> + */
> >>>>> +static inline pte_t pte_next_swp_offset(pte_t pte)
> >>>>> +{
> >>>>> +       swp_entry_t entry = pte_to_swp_entry(pte);
> >>>>> +       pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
> >>>>> +                                                  swp_offset(entry) + 1));
> >>>>> +
> >>>>> +       if (pte_swp_soft_dirty(pte))
> >>>>> +               new = pte_swp_mksoft_dirty(new);
> >>>>> +       if (pte_swp_exclusive(pte))
> >>>>> +               new = pte_swp_mkexclusive(new);
> >>>>> +       if (pte_swp_uffd_wp(pte))
> >>>>> +               new = pte_swp_mkuffd_wp(new);
> >>>>
> >>>> I don't quite understand this. If this page table entry is exclusive,
> >>>> will its subsequent page table entry also be exclusive without
> >>>> question?
> >>>> in try_to_unmap_one, exclusive is per-subpage but not per-folio:
> >>>>
> >>>>                  anon_exclusive = folio_test_anon(folio) &&
> >>>>                                   PageAnonExclusive(subpage);
> >>>>
> >>>> same questions also for diry, wp etc.
> >>>
> >>> Sorry for the noise. you are right. based on your new version, I think I should
> >>> entirely drop:
> >>>
> >>> [PATCH v2 3/5] mm: swap_pte_batch: add an output argument to reture if
> >>> all swap entries are exclusive
> >>
> >> Yes. If we ever want to ignore some bits, we should likely add flags to
> >> change the behavior, like for folio_pte_batch().
> >>
> >> For swapin, you really want the exclusive bits to match, though.
> >
> > I am not quite sure I definitely need exclusive bits to match. i can either
> > drop my 3/5 or ignore the exclusive bit as below (if anyone is not shared,
> > swpin won't reuse the large folio, but it can still entirely map it read-only):
> >
> > diff --git a/mm/internal.h b/mm/internal.h
> > index cae39c372bfc..5726e729c9ee 100644
> > --- a/mm/internal.h
> > +++ b/mm/internal.h
> > @@ -253,10 +253,22 @@ static inline int swap_pte_batch(pte_t
> > *start_ptep, int max_nr, pte_t pte,
> >                 *any_shared |= !pte_swp_exclusive(pte);
> >
> >         while (ptep < end_ptep) {
> > +               pte_t ignore_exclusive_pte;
> > +               pte_t ignore_exclusive_expected_pte;
> >                 pte = ptep_get(ptep);
> >
> > -               if (!pte_same(pte, expected_pte))
> > -                       break;
> > +               if (any_shared) {
> > +                       ignore_exclusive_pte = pte;
> > +                       ignore_exclusive_expected_pte = expected_pte;
> > +                       ignore_exclusive_pte =
> > pte_swp_clear_exclusive(ignore_exclusive_pte);
> > +                       ignore_exclusive_expected_pte =
> > pte_swp_clear_exclusive(expected_pte);
> > +
> > +                       if (!pte_same(ignore_exclusive_pte,
> > ignore_exclusive_expected_pte))
> > +                               break;
> > +               } else {
> > +                       if (!pte_same(pte, expected_pte))
> > +                               break;
> > +               }
> >
> >                 if (any_shared)
> >                         *any_shared |= !pte_swp_exclusive(pte);
>
> I'll leave David to comment on this proposal; I'm not sure I understand all the
> details. The code change does look a bit "busy" though - sometimes that can be
> an indicator :)

Indeed. I wrote it in one minute.

I'm confident that the code can be written in a manner similar to
__pte_batch_clear_ignored. I was only proposing the approach,
not selling the code :-)
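
To be concrete, here is roughly what I have in mind (just a sketch, untested;
the helper name is made up by me, only modelled on __pte_batch_clear_ignored(),
and a non-NULL any_shared is what asks for the exclusive bit to be ignored):

	/*
	 * Sketch only: normalise away the exclusive bit before the pte_same()
	 * comparison when the caller wants any_shared tracking; all other swp
	 * pte bits still have to match.
	 */
	static inline pte_t __swap_pte_batch_clear_ignored(pte_t pte,
							   bool ignore_exclusive)
	{
		if (ignore_exclusive)
			pte = pte_swp_clear_exclusive(pte);
		return pte;
	}

and then in swap_pte_batch():

	while (ptep < end_ptep) {
		pte = ptep_get(ptep);

		if (!pte_same(__swap_pte_batch_clear_ignored(pte, any_shared),
			      __swap_pte_batch_clear_ignored(expected_pte, any_shared)))
			break;

		if (any_shared)
			*any_shared |= !pte_swp_exclusive(pte);

		expected_pte = pte_next_swp_offset(expected_pte);
		ptep++;
	}

That keeps the loop body flat instead of duplicating the pte_same() branches.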

>
> >
> >> softdirty and uffd-wp as well at least initially for simplicity.
> >
> > yes for this.
> >
> > By the way, I wonder if you and Ryan have a moment to review swpin
> > refault patchset
> > v2 :-)
>
> It's on my todo list! I'm very keen to get as much large swap-out and swap-in
> support into v6.10 as we can. Hoping to get to it inthe next couple of days.
>
> >
> > [PATCH v2 0/5] large folios swap-in: handle refault cases first
> > https://lore.kernel.org/linux-mm/20240409082631.187483-1-21cnbao@gmail.com/
> >
> >
> >>
> >> --
> >> Cheers,
> >>
> >> David / dhildenb
> >>
> >

Thanks
Barry
>
David Hildenbrand April 9, 2024, 10:42 a.m. UTC | #8
On 09.04.24 11:55, Ryan Roberts wrote:
> On 09/04/2024 10:41, Barry Song wrote:
>> On Tue, Apr 9, 2024 at 9:24 PM David Hildenbrand <david@redhat.com> wrote:
>>>
>>> On 09.04.24 11:22, Barry Song wrote:
>>>> On Tue, Apr 9, 2024 at 8:51 PM Barry Song <21cnbao@gmail.com> wrote:
>>>>>
>>>>> On Tue, Apr 9, 2024 at 6:40 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>>>>
>>>>>> [...]
>>>>>> +/**
>>>>>> + * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
>>>>>> + * @pte: The initial pte state; is_swap_pte(pte) must be true.
>>>>>> + *
>>>>>> + * Increments the swap offset, while maintaining all other fields, including
>>>>>> + * swap type, and any swp pte bits. The resulting pte is returned.
>>>>>> + */
>>>>>> +static inline pte_t pte_next_swp_offset(pte_t pte)
>>>>>> +{
>>>>>> +       swp_entry_t entry = pte_to_swp_entry(pte);
>>>>>> +       pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
>>>>>> +                                                  swp_offset(entry) + 1));
>>>>>> +
>>>>>> +       if (pte_swp_soft_dirty(pte))
>>>>>> +               new = pte_swp_mksoft_dirty(new);
>>>>>> +       if (pte_swp_exclusive(pte))
>>>>>> +               new = pte_swp_mkexclusive(new);
>>>>>> +       if (pte_swp_uffd_wp(pte))
>>>>>> +               new = pte_swp_mkuffd_wp(new);
>>>>>
>>>>> I don't quite understand this. If this page table entry is exclusive,
>>>>> will its subsequent page table entry also be exclusive without
>>>>> question?
>>>>> in try_to_unmap_one, exclusive is per-subpage but not per-folio:
>>>>>
>>>>>                   anon_exclusive = folio_test_anon(folio) &&
>>>>>                                    PageAnonExclusive(subpage);
>>>>>
>>>>> same questions also for diry, wp etc.
>>>>
>>>> Sorry for the noise. you are right. based on your new version, I think I should
>>>> entirely drop:
>>>>
>>>> [PATCH v2 3/5] mm: swap_pte_batch: add an output argument to reture if
>>>> all swap entries are exclusive
>>>
>>> Yes. If we ever want to ignore some bits, we should likely add flags to
>>> change the behavior, like for folio_pte_batch().
>>>
>>> For swapin, you really want the exclusive bits to match, though.
>>
>> I am not quite sure I definitely need exclusive bits to match. i can either
>> drop my 3/5 or ignore the exclusive bit as below (if anyone is not shared,
>> swpin won't reuse the large folio, but it can still entirely map it read-only):
>>
>> diff --git a/mm/internal.h b/mm/internal.h
>> index cae39c372bfc..5726e729c9ee 100644
>> --- a/mm/internal.h
>> +++ b/mm/internal.h
>> @@ -253,10 +253,22 @@ static inline int swap_pte_batch(pte_t
>> *start_ptep, int max_nr, pte_t pte,
>>                  *any_shared |= !pte_swp_exclusive(pte);
>>
>>          while (ptep < end_ptep) {
>> +               pte_t ignore_exclusive_pte;
>> +               pte_t ignore_exclusive_expected_pte;
>>                  pte = ptep_get(ptep);
>>
>> -               if (!pte_same(pte, expected_pte))
>> -                       break;
>> +               if (any_shared) {
>> +                       ignore_exclusive_pte = pte;
>> +                       ignore_exclusive_expected_pte = expected_pte;
>> +                       ignore_exclusive_pte =
>> pte_swp_clear_exclusive(ignore_exclusive_pte);
>> +                       ignore_exclusive_expected_pte =
>> pte_swp_clear_exclusive(expected_pte);
>> +
>> +                       if (!pte_same(ignore_exclusive_pte,
>> ignore_exclusive_expected_pte))
>> +                               break;
>> +               } else {
>> +                       if (!pte_same(pte, expected_pte))
>> +                               break;
>> +               }
>>
>>                  if (any_shared)
>>                          *any_shared |= !pte_swp_exclusive(pte);
> 
> I'll leave David to comment on this proposal; I'm not sure I understand all the
> details. The code change does look a bit "busy" though - sometimes that can be
> an indicator :)

I'd prefer to keep it simple initially. Devil is in the detail for the 
refault case: in the past, dropping an exclusive flag could have been 
problematic with some O_DIRECT workloads that were not using FOLL_PIN. 
IIUC, some still remain.

More details can be had from 
https://lore.kernel.org/all/20230113171026.582290-1-david@redhat.com/


It might all change a bit if I manage to get folio_test_anon_exclusive() 
flying. The current plan is that all individual swp PTEs would still 
have pte_swp_exclusive() set, and we would *not* clear the 
folio_test_anon_exclusive() flag while the folio is still in the 
swapcache [which we do today to make fork() handling easier].

That will make exclusivity handling on refault of large folios significantly
easier.

Patch

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index a3fc8150b047..75096025fe52 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -708,6 +708,35 @@  static inline void pte_clear_not_present_full(struct mm_struct *mm,
 }
 #endif
 
+#ifndef clear_not_present_full_ptes
+/**
+ * clear_not_present_full_ptes - Clear multiple not present PTEs which are
+ *				 consecutive in the pgtable.
+ * @mm: Address space the ptes represent.
+ * @addr: Address of the first pte.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to clear.
+ * @full: Whether we are clearing a full mm.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over pte_clear_not_present_full().
+ *
+ * Context: The caller holds the page table lock.  The PTEs are all not present.
+ * The PTEs are all in the same PMD.
+ */
+static inline void clear_not_present_full_ptes(struct mm_struct *mm,
+		unsigned long addr, pte_t *ptep, unsigned int nr, int full)
+{
+	for (;;) {
+		pte_clear_not_present_full(mm, addr, ptep, full);
+		if (--nr == 0)
+			break;
+		ptep++;
+		addr += PAGE_SIZE;
+	}
+}
+#endif
+
 #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
 extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
 			      unsigned long address,
diff --git a/include/linux/swap.h b/include/linux/swap.h
index f6f78198f000..5737236dc3ce 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -471,7 +471,7 @@  extern int swap_duplicate(swp_entry_t);
 extern int swapcache_prepare(swp_entry_t);
 extern void swap_free(swp_entry_t);
 extern void swapcache_free_entries(swp_entry_t *entries, int n);
-extern int free_swap_and_cache(swp_entry_t);
+extern void free_swap_and_cache_nr(swp_entry_t entry, int nr);
 int swap_type_of(dev_t device, sector_t offset);
 int find_first_swap(dev_t *device);
 extern unsigned int count_swap_pages(int, int);
@@ -520,8 +520,9 @@  static inline void put_swap_device(struct swap_info_struct *si)
 #define free_pages_and_swap_cache(pages, nr) \
 	release_pages((pages), (nr));
 
-/* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */
-#define free_swap_and_cache(e) is_pfn_swap_entry(e)
+static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr)
+{
+}
 
 static inline void free_swap_cache(struct folio *folio)
 {
@@ -589,6 +590,11 @@  static inline int add_swap_extent(struct swap_info_struct *sis,
 }
 #endif /* CONFIG_SWAP */
 
+static inline void free_swap_and_cache(swp_entry_t entry)
+{
+	free_swap_and_cache_nr(entry, 1);
+}
+
 #ifdef CONFIG_MEMCG
 static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
 {
diff --git a/mm/internal.h b/mm/internal.h
index 3bdc8693b54f..de68705624b0 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -11,6 +11,8 @@ 
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
 #include <linux/tracepoint-defs.h>
 
 struct folio_batch;
@@ -189,6 +191,67 @@  static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
 
 	return min(ptep - start_ptep, max_nr);
 }
+
+/**
+ * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
+ * @pte: The initial pte state; is_swap_pte(pte) must be true.
+ *
+ * Increments the swap offset, while maintaining all other fields, including
+ * swap type, and any swp pte bits. The resulting pte is returned.
+ */
+static inline pte_t pte_next_swp_offset(pte_t pte)
+{
+	swp_entry_t entry = pte_to_swp_entry(pte);
+	pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
+						   swp_offset(entry) + 1));
+
+	if (pte_swp_soft_dirty(pte))
+		new = pte_swp_mksoft_dirty(new);
+	if (pte_swp_exclusive(pte))
+		new = pte_swp_mkexclusive(new);
+	if (pte_swp_uffd_wp(pte))
+		new = pte_swp_mkuffd_wp(new);
+
+	return new;
+}
+
+/**
+ * swap_pte_batch - detect a PTE batch for a set of contiguous swap entries
+ * @start_ptep: Page table pointer for the first entry.
+ * @max_nr: The maximum number of table entries to consider.
+ * @pte: Page table entry for the first entry.
+ *
+ * Detect a batch of contiguous swap entries: consecutive (non-present) PTEs
+ * containing swap entries all with consecutive offsets and targeting the same
+ * swap type, all with matching swp pte bits.
+ *
+ * max_nr must be at least one and must be limited by the caller so scanning
+ * cannot exceed a single page table.
+ *
+ * Return: the number of table entries in the batch.
+ */
+static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
+{
+	pte_t expected_pte = pte_next_swp_offset(pte);
+	const pte_t *end_ptep = start_ptep + max_nr;
+	pte_t *ptep = start_ptep + 1;
+
+	VM_WARN_ON(max_nr < 1);
+	VM_WARN_ON(!is_swap_pte(pte));
+	VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte)));
+
+	while (ptep < end_ptep) {
+		pte = ptep_get(ptep);
+
+		if (!pte_same(pte, expected_pte))
+			break;
+
+		expected_pte = pte_next_swp_offset(expected_pte);
+		ptep++;
+	}
+
+	return ptep - start_ptep;
+}
 #endif /* CONFIG_MMU */
 
 void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
diff --git a/mm/madvise.c b/mm/madvise.c
index 1f77a51baaac..5011ecb24344 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -628,6 +628,7 @@  static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 	struct folio *folio;
 	int nr_swap = 0;
 	unsigned long next;
+	int nr, max_nr;
 
 	next = pmd_addr_end(addr, end);
 	if (pmd_trans_huge(*pmd))
@@ -640,7 +641,8 @@  static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 		return 0;
 	flush_tlb_batched_pending(mm);
 	arch_enter_lazy_mmu_mode();
-	for (; addr != end; pte++, addr += PAGE_SIZE) {
+	for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
+		nr = 1;
 		ptent = ptep_get(pte);
 
 		if (pte_none(ptent))
@@ -655,9 +657,11 @@  static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 
 			entry = pte_to_swp_entry(ptent);
 			if (!non_swap_entry(entry)) {
-				nr_swap--;
-				free_swap_and_cache(entry);
-				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+				max_nr = (end - addr) / PAGE_SIZE;
+				nr = swap_pte_batch(pte, max_nr, ptent);
+				nr_swap -= nr;
+				free_swap_and_cache_nr(entry, nr);
+				clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
 			} else if (is_hwpoison_entry(entry) ||
 				   is_poisoned_swp_entry(entry)) {
 				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
diff --git a/mm/memory.c b/mm/memory.c
index b98e4d907a14..0db2aa066a5a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1637,12 +1637,13 @@  static unsigned long zap_pte_range(struct mmu_gather *tlb,
 				folio_remove_rmap_pte(folio, page, vma);
 			folio_put(folio);
 		} else if (!non_swap_entry(entry)) {
-			/* Genuine swap entry, hence a private anon page */
+			max_nr = (end - addr) / PAGE_SIZE;
+			nr = swap_pte_batch(pte, max_nr, ptent);
+			/* Genuine swap entries, hence private anon pages */
 			if (!should_zap_cows(details))
 				continue;
-			rss[MM_SWAPENTS]--;
-			if (unlikely(!free_swap_and_cache(entry)))
-				print_bad_pte(vma, addr, ptent, NULL);
+			rss[MM_SWAPENTS] -= nr;
+			free_swap_and_cache_nr(entry, nr);
 		} else if (is_migration_entry(entry)) {
 			folio = pfn_swap_entry_folio(entry);
 			if (!should_zap_folio(details, folio))
@@ -1665,8 +1666,8 @@  static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
 			WARN_ON_ONCE(1);
 		}
-		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
-		zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent);
+		clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
+		zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent);
 	} while (pte += nr, addr += PAGE_SIZE * nr, addr != end);
 
 	add_mm_rss_vec(mm, rss);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1ded6d1dcab4..20c45757f2b2 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -130,7 +130,11 @@  static inline unsigned char swap_count(unsigned char ent)
 /* Reclaim the swap entry if swap is getting full*/
 #define TTRS_FULL		0x4
 
-/* returns 1 if swap entry is freed */
+/*
+ * returns number of pages in the folio that backs the swap entry. If positive,
+ * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
+ * folio was associated with the swap entry.
+ */
 static int __try_to_reclaim_swap(struct swap_info_struct *si,
 				 unsigned long offset, unsigned long flags)
 {
@@ -155,6 +159,7 @@  static int __try_to_reclaim_swap(struct swap_info_struct *si,
 			ret = folio_free_swap(folio);
 		folio_unlock(folio);
 	}
+	ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio);
 	folio_put(folio);
 	return ret;
 }
@@ -895,7 +900,7 @@  static int scan_swap_map_slots(struct swap_info_struct *si,
 		swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
 		spin_lock(&si->lock);
 		/* entry was freed successfully, try to use this again */
-		if (swap_was_freed)
+		if (swap_was_freed > 0)
 			goto checks;
 		goto scan; /* check next one */
 	}
@@ -1572,32 +1577,88 @@  bool folio_free_swap(struct folio *folio)
 	return true;
 }
 
-/*
- * Free the swap entry like above, but also try to
- * free the page cache entry if it is the last user.
+/**
+ * free_swap_and_cache_nr() - Release reference on range of swap entries and
+ *                            reclaim their cache if no more references remain.
+ * @entry: First entry of range.
+ * @nr: Number of entries in range.
+ *
+ * For each swap entry in the contiguous range, release a reference. If any swap
+ * entries become free, try to reclaim their underlying folios, if present. The
+ * offset range is defined by [entry.offset, entry.offset + nr).
  */
-int free_swap_and_cache(swp_entry_t entry)
+void free_swap_and_cache_nr(swp_entry_t entry, int nr)
 {
-	struct swap_info_struct *p;
+	const unsigned long start_offset = swp_offset(entry);
+	const unsigned long end_offset = start_offset + nr;
+	unsigned int type = swp_type(entry);
+	struct swap_info_struct *si;
+	bool any_only_cache = false;
+	unsigned long offset;
 	unsigned char count;
 
 	if (non_swap_entry(entry))
-		return 1;
+		return;
 
-	p = get_swap_device(entry);
-	if (p) {
-		if (WARN_ON(data_race(!p->swap_map[swp_offset(entry)]))) {
-			put_swap_device(p);
-			return 0;
+	si = get_swap_device(entry);
+	if (!si)
+		return;
+
+	if (WARN_ON(end_offset > si->max))
+		goto out;
+
+	/*
+	 * First free all entries in the range.
+	 */
+	for (offset = start_offset; offset < end_offset; offset++) {
+		if (data_race(si->swap_map[offset])) {
+			count = __swap_entry_free(si, swp_entry(type, offset));
+			if (count == SWAP_HAS_CACHE)
+				any_only_cache = true;
+		} else {
+			WARN_ON_ONCE(1);
 		}
+	}
+
+	/*
+	 * Short-circuit the below loop if none of the entries had their
+	 * reference drop to zero.
+	 */
+	if (!any_only_cache)
+		goto out;
 
-		count = __swap_entry_free(p, entry);
-		if (count == SWAP_HAS_CACHE)
-			__try_to_reclaim_swap(p, swp_offset(entry),
+	/*
+	 * Now go back over the range trying to reclaim the swap cache. This is
+	 * more efficient for large folios because we will only try to reclaim
+	 * the swap once per folio in the common case. If we do
+	 * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the
+	 * latter will get a reference and lock the folio for every individual
+	 * page but will only succeed once the swap slot for every subpage is
+	 * zero.
+	 */
+	for (offset = start_offset; offset < end_offset; offset += nr) {
+		nr = 1;
+		if (READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
+			/*
+			 * Folios are always naturally aligned in swap so
+			 * advance forward to the next boundary. Zero means no
+			 * folio was found for the swap entry, so advance by 1
+			 * in this case. Negative value means folio was found
+			 * but could not be reclaimed. Here we can still advance
+			 * to the next boundary.
+			 */
+			nr = __try_to_reclaim_swap(si, offset,
 					      TTRS_UNMAPPED | TTRS_FULL);
-		put_swap_device(p);
+			if (nr == 0)
+				nr = 1;
+			else if (nr < 0)
+				nr = -nr;
+			nr = ALIGN(offset + 1, nr) - offset;
+		}
 	}
-	return p != NULL;
+
+out:
+	put_swap_device(si);
 }
 
 #ifdef CONFIG_HIBERNATION