diff mbox series

[1/2] mm: swap: swap cluster switch to double link list

Message ID 20240524-swap-allocator-v1-1-47861b423b26@kernel.org (mailing list archive)
State New
Headers show
Series mm: swap: mTHP swap allocator base on swap cluster order | expand

Commit Message

Chris Li May 24, 2024, 5:17 p.m. UTC
Previously, the swap cluster used a cluster index as a pointer
to construct a custom singly linked list type "swap_cluster_list".
The next cluster pointer is shared with the cluster->count.
The assumption is that only the free cluster needs to be put
on the list.

That assumption is no longer true for mTHP allocators, which need
to track non-full clusters on a list as well.  Convert the
current singly linked cluster list into a standard doubly linked list.

Remove the cluster getters/setters for accessing the cluster
struct members.  Move the cluster locking into the caller
rather than the getter/setter functions. That way the locking can
protect more than one member, e.g. cluster->flags.

Change the cluster code to use "struct swap_cluster_info *" to
reference a cluster rather than using an index. That is more
consistent with the list manipulation. It avoids repeatedly
adding the index to cluster_info, and the code is easier to understand.

Remove the "cluster next pointer is NULL" flag (CLUSTER_FLAG_NEXT_NULL);
the doubly linked list handles the empty list well on its own.

The "swap_cluster_info" struct is two pointers bigger. Because
512 swap entries share one such struct, this has very little impact
on the average memory usage per swap entry.  Other than the list
conversion, there is no real functional change in this patch.
---
 include/linux/swap.h |  14 ++--
 mm/swapfile.c        | 231 ++++++++++++++-------------------------------------
 2 files changed, 68 insertions(+), 177 deletions(-)

Comments

Kairui Song May 28, 2024, 4:23 p.m. UTC | #1
On Sat, May 25, 2024 at 1:17 AM Chris Li <chrisl@kernel.org> wrote:
>
> Previously, the swap cluster used a cluster index as a pointer
> to construct a custom single link list type "swap_cluster_list".
> The next cluster pointer is shared with the cluster->count.
> The assumption is that only the free cluster needs to be put
> on the list.
>
> That assumption is not true for mTHP allocators any more. Need
> to track the non full cluster on the list as well.  Move the
> current cluster single link list into standard double link list.
>
> Remove the cluster getter/setter for accessing the cluster
> struct member.  Move the cluster locking in the caller function
> rather than the getter/setter function. That way the locking can
> protect more than one member, e.g. cluster->flag.
>
> Change cluster code to use "struct swap_cluster_info *" to
> reference the cluster rather than by using index. That is more
> consistent with the list manipulation. It avoids the repeat
> adding index to the cluser_info. The code is easier to understand.
>
> Remove the cluster next pointer is NULL flag, the double link
> list can handle the empty list pretty well.
>
> The "swap_cluster_info" struct is two pointer bigger, because
> 512 swap entries share one swap struct, it has very little impact
> on the average memory usage per swap entry.  Other than the list
> conversion, there is no real function change in this patch.
> ---
>  include/linux/swap.h |  14 ++--
>  mm/swapfile.c        | 231 ++++++++++++++-------------------------------------
>  2 files changed, 68 insertions(+), 177 deletions(-)
>

Hi Chris,

Thanks for this very nice clean up, the code is much easier to read.

> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 11c53692f65f..0d3906eff3c9 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -254,11 +254,12 @@ struct swap_cluster_info {
>                                  * elements correspond to the swap
>                                  * cluster
>                                  */
> -       unsigned int data:24;
> +       unsigned int count:16;
>         unsigned int flags:8;
> +       struct list_head next;
>  };
>  #define CLUSTER_FLAG_FREE 1 /* This cluster is free */
> -#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
> +
>
>  /*
>   * The first page in the swap file is the swap header, which is always marked
> @@ -283,11 +284,6 @@ struct percpu_cluster {
>         unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
>  };
>
> -struct swap_cluster_list {
> -       struct swap_cluster_info head;
> -       struct swap_cluster_info tail;
> -};
> -
>  /*
>   * The in-memory structure used to track swap areas.
>   */
> @@ -300,7 +296,7 @@ struct swap_info_struct {
>         unsigned int    max;            /* extent of the swap_map */
>         unsigned char *swap_map;        /* vmalloc'ed array of usage counts */
>         struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
> -       struct swap_cluster_list free_clusters; /* free clusters list */
> +       struct list_head free_clusters; /* free clusters list */
>         unsigned int lowest_bit;        /* index of first free in swap_map */
>         unsigned int highest_bit;       /* index of last free in swap_map */
>         unsigned int pages;             /* total of usable pages of swap */
> @@ -333,7 +329,7 @@ struct swap_info_struct {
>                                          * list.
>                                          */
>         struct work_struct discard_work; /* discard worker */
> -       struct swap_cluster_list discard_clusters; /* discard clusters list */
> +       struct list_head discard_clusters; /* discard clusters list */
>         struct plist_node avail_lists[]; /*
>                                            * entries in swap_avail_heads, one
>                                            * entry per node.
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 4f0e8b2ac8aa..205a60c5f9cb 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -290,64 +290,11 @@ static void discard_swap_cluster(struct swap_info_struct *si,
>  #endif
>  #define LATENCY_LIMIT          256
>
> -static inline void cluster_set_flag(struct swap_cluster_info *info,
> -       unsigned int flag)
> -{
> -       info->flags = flag;
> -}
> -
> -static inline unsigned int cluster_count(struct swap_cluster_info *info)
> -{
> -       return info->data;
> -}
> -
> -static inline void cluster_set_count(struct swap_cluster_info *info,
> -                                    unsigned int c)
> -{
> -       info->data = c;
> -}
> -
> -static inline void cluster_set_count_flag(struct swap_cluster_info *info,
> -                                        unsigned int c, unsigned int f)
> -{
> -       info->flags = f;
> -       info->data = c;
> -}
> -
> -static inline unsigned int cluster_next(struct swap_cluster_info *info)
> -{
> -       return info->data;
> -}
> -
> -static inline void cluster_set_next(struct swap_cluster_info *info,
> -                                   unsigned int n)
> -{
> -       info->data = n;
> -}
> -
> -static inline void cluster_set_next_flag(struct swap_cluster_info *info,
> -                                        unsigned int n, unsigned int f)
> -{
> -       info->flags = f;
> -       info->data = n;
> -}
> -
>  static inline bool cluster_is_free(struct swap_cluster_info *info)
>  {
>         return info->flags & CLUSTER_FLAG_FREE;
>  }
>
> -static inline bool cluster_is_null(struct swap_cluster_info *info)
> -{
> -       return info->flags & CLUSTER_FLAG_NEXT_NULL;
> -}
> -
> -static inline void cluster_set_null(struct swap_cluster_info *info)
> -{
> -       info->flags = CLUSTER_FLAG_NEXT_NULL;
> -       info->data = 0;
> -}
> -
>  static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
>                                                      unsigned long offset)
>  {
> @@ -394,65 +341,11 @@ static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
>                 spin_unlock(&si->lock);
>  }
>
> -static inline bool cluster_list_empty(struct swap_cluster_list *list)
> -{
> -       return cluster_is_null(&list->head);
> -}
> -
> -static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
> -{
> -       return cluster_next(&list->head);
> -}
> -
> -static void cluster_list_init(struct swap_cluster_list *list)
> -{
> -       cluster_set_null(&list->head);
> -       cluster_set_null(&list->tail);
> -}
> -
> -static void cluster_list_add_tail(struct swap_cluster_list *list,
> -                                 struct swap_cluster_info *ci,
> -                                 unsigned int idx)
> -{
> -       if (cluster_list_empty(list)) {
> -               cluster_set_next_flag(&list->head, idx, 0);
> -               cluster_set_next_flag(&list->tail, idx, 0);
> -       } else {
> -               struct swap_cluster_info *ci_tail;
> -               unsigned int tail = cluster_next(&list->tail);
> -
> -               /*
> -                * Nested cluster lock, but both cluster locks are
> -                * only acquired when we held swap_info_struct->lock
> -                */
> -               ci_tail = ci + tail;
> -               spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
> -               cluster_set_next(ci_tail, idx);
> -               spin_unlock(&ci_tail->lock);
> -               cluster_set_next_flag(&list->tail, idx, 0);
> -       }
> -}
> -
> -static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
> -                                          struct swap_cluster_info *ci)
> -{
> -       unsigned int idx;
> -
> -       idx = cluster_next(&list->head);
> -       if (cluster_next(&list->tail) == idx) {
> -               cluster_set_null(&list->head);
> -               cluster_set_null(&list->tail);
> -       } else
> -               cluster_set_next_flag(&list->head,
> -                                     cluster_next(&ci[idx]), 0);
> -
> -       return idx;
> -}
> -
>  /* Add a cluster to discard list and schedule it to do discard */
>  static void swap_cluster_schedule_discard(struct swap_info_struct *si,
> -               unsigned int idx)
> +               struct swap_cluster_info *ci)
>  {
> +       unsigned int idx = ci - si->cluster_info;
>         /*
>          * If scan_swap_map_slots() can't find a free cluster, it will check
>          * si->swap_map directly. To make sure the discarding cluster isn't
> @@ -462,17 +355,16 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
>         memset(si->swap_map + idx * SWAPFILE_CLUSTER,
>                         SWAP_MAP_BAD, SWAPFILE_CLUSTER);
>
> -       cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
> -
> +       spin_lock_nested(&ci->lock, SINGLE_DEPTH_NESTING);
> +       list_add_tail(&ci->next, &si->discard_clusters);
> +       spin_unlock(&ci->lock);
>         schedule_work(&si->discard_work);
>  }
>
> -static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
> +static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
>  {
> -       struct swap_cluster_info *ci = si->cluster_info;
> -
> -       cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
> -       cluster_list_add_tail(&si->free_clusters, ci, idx);
> +       ci->flags = CLUSTER_FLAG_FREE;
> +       list_add_tail(&ci->next, &si->free_clusters);
>  }
>
>  /*
> @@ -481,21 +373,21 @@ static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
>  */
>  static void swap_do_scheduled_discard(struct swap_info_struct *si)
>  {
> -       struct swap_cluster_info *info, *ci;
> +       struct swap_cluster_info *ci;
>         unsigned int idx;
>
> -       info = si->cluster_info;
> -
> -       while (!cluster_list_empty(&si->discard_clusters)) {
> -               idx = cluster_list_del_first(&si->discard_clusters, info);
> +       while (!list_empty(&si->discard_clusters)) {
> +               ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, next);
> +               idx = ci - si->cluster_info;
>                 spin_unlock(&si->lock);
>
>                 discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
>                                 SWAPFILE_CLUSTER);
>
>                 spin_lock(&si->lock);
> -               ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
> -               __free_cluster(si, idx);
> +
> +               spin_lock(&ci->lock);
> +               __free_cluster(si, ci);
>                 memset(si->swap_map + idx * SWAPFILE_CLUSTER,
>                                 0, SWAPFILE_CLUSTER);
>                 unlock_cluster(ci);
> @@ -521,20 +413,20 @@ static void swap_users_ref_free(struct percpu_ref *ref)
>         complete(&si->comp);
>  }
>
> -static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
> +static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>  {
> -       struct swap_cluster_info *ci = si->cluster_info;
> +       struct swap_cluster_info *ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, next);
>
> -       VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
> -       cluster_list_del_first(&si->free_clusters, ci);
> -       cluster_set_count_flag(ci + idx, 0, 0);
> +       VM_BUG_ON(ci - si->cluster_info != idx);
> +       list_del(&ci->next);
> +       ci->count = 0;
> +       ci->flags = 0;
> +       return ci;
>  }
>
> -static void free_cluster(struct swap_info_struct *si, unsigned long idx)
> +static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
>  {
> -       struct swap_cluster_info *ci = si->cluster_info + idx;
> -
> -       VM_BUG_ON(cluster_count(ci) != 0);
> +       VM_BUG_ON(ci->count != 0);
>         /*
>          * If the swap is discardable, prepare discard the cluster
>          * instead of free it immediately. The cluster will be freed
> @@ -542,11 +434,11 @@ static void free_cluster(struct swap_info_struct *si, unsigned long idx)
>          */
>         if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
>             (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
> -               swap_cluster_schedule_discard(si, idx);
> +               swap_cluster_schedule_discard(si, ci);
>                 return;
>         }
>
> -       __free_cluster(si, idx);
> +       __free_cluster(si, ci);
>  }
>
>  /*
> @@ -559,15 +451,15 @@ static void add_cluster_info_page(struct swap_info_struct *p,
>         unsigned long count)
>  {
>         unsigned long idx = page_nr / SWAPFILE_CLUSTER;
> +       struct swap_cluster_info *ci = cluster_info + idx;
>
>         if (!cluster_info)
>                 return;
> -       if (cluster_is_free(&cluster_info[idx]))
> +       if (cluster_is_free(ci))
>                 alloc_cluster(p, idx);
>
> -       VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER);
> -       cluster_set_count(&cluster_info[idx],
> -               cluster_count(&cluster_info[idx]) + count);
> +       VM_BUG_ON(ci->count + count > SWAPFILE_CLUSTER);
> +       ci->count += count;
>  }
>
>  /*
> @@ -581,24 +473,20 @@ static void inc_cluster_info_page(struct swap_info_struct *p,
>  }
>
>  /*
> - * The cluster corresponding to page_nr decreases one usage. If the usage
> - * counter becomes 0, which means no page in the cluster is in using, we can
> - * optionally discard the cluster and add it to free cluster list.
> + * The cluster ci decreases one usage. If the usage counter becomes 0,
> + * which means no page in the cluster is in using, we can optionally discard
> + * the cluster and add it to free cluster list.
>   */
> -static void dec_cluster_info_page(struct swap_info_struct *p,
> -       struct swap_cluster_info *cluster_info, unsigned long page_nr)
> +static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *ci)
>  {
> -       unsigned long idx = page_nr / SWAPFILE_CLUSTER;
> -
> -       if (!cluster_info)
> +       if (!p->cluster_info)
>                 return;
>
> -       VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
> -       cluster_set_count(&cluster_info[idx],
> -               cluster_count(&cluster_info[idx]) - 1);
> +       VM_BUG_ON(ci->count == 0);
> +       ci->count--;
>
> -       if (cluster_count(&cluster_info[idx]) == 0)
> -               free_cluster(p, idx);
> +       if (!ci->count)
> +               free_cluster(p, ci);
>  }
>
>  /*
> @@ -611,10 +499,10 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
>  {

This whole scan_swap_map_ssd_cluster_conflict function no longer seems
to be needed. free_clusters is now a doubly linked list, so using a cluster
in the middle won't corrupt the list. The comments still describe the
old list design.

>         struct percpu_cluster *percpu_cluster;
>         bool conflict;
> -
> +       struct swap_cluster_info *first = list_first_entry(&si->free_clusters, struct swap_cluster_info, next);
>         offset /= SWAPFILE_CLUSTER;
> -       conflict = !cluster_list_empty(&si->free_clusters) &&
> -               offset != cluster_list_first(&si->free_clusters) &&
> +       conflict = !list_empty(&si->free_clusters) &&
> +               offset !=  first - si->cluster_info &&
>                 cluster_is_free(&si->cluster_info[offset]);
>
>         if (!conflict)
> @@ -655,10 +543,14 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
>         cluster = this_cpu_ptr(si->percpu_cluster);
>         tmp = cluster->next[order];
>         if (tmp == SWAP_NEXT_INVALID) {
> -               if (!cluster_list_empty(&si->free_clusters)) {
> -                       tmp = cluster_next(&si->free_clusters.head) *
> -                                       SWAPFILE_CLUSTER;
> -               } else if (!cluster_list_empty(&si->discard_clusters)) {
> +               if (!list_empty(&si->free_clusters)) {
> +                       ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, next);
> +                       list_del(&ci->next);
> +                       spin_lock(&ci->lock);

Shouldn't this list_del also be protected by ci->lock? It was
protected in alloc_cluster before, which kept the flag in sync with the
cluster's list status so that cluster_is_free would not return a false positive.

> +                       ci->flags = 0;
> +                       spin_unlock(&ci->lock);
> +                       tmp = (ci - si->cluster_info) * SWAPFILE_CLUSTER;
> +               } else if (!list_empty(&si->discard_clusters)) {
>                         /*
>                          * we don't have free cluster but have some clusters in
>                          * discarding, do discard now and reclaim them, then
> @@ -670,7 +562,8 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
>                         goto new_cluster;
>                 } else
>                         return false;
> -       }
> +       } else
> +               ci = si->cluster_info + tmp;

This "else ci = ..." seems wrong: tmp is a swap offset, not a cluster
index (the index would be tmp / SWAPFILE_CLUSTER), and the assignment
is not needed either.

>
>         /*
>          * Other CPUs can use our cluster if they can't find a free cluster,
> @@ -1062,8 +955,9 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
>
>         ci = lock_cluster(si, offset);
>         memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
> -       cluster_set_count_flag(ci, 0, 0);
> -       free_cluster(si, idx);
> +       ci->count = 0;
> +       ci->flags = 0;
> +       free_cluster(si, ci);
>         unlock_cluster(ci);
>         swap_range_free(si, offset, SWAPFILE_CLUSTER);
>  }
> @@ -1336,7 +1230,7 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
>         count = p->swap_map[offset];
>         VM_BUG_ON(count != SWAP_HAS_CACHE);
>         p->swap_map[offset] = 0;
> -       dec_cluster_info_page(p, p->cluster_info, offset);
> +       dec_cluster_info_page(p, ci);
>         unlock_cluster(ci);
>
>         mem_cgroup_uncharge_swap(entry, 1);
> @@ -2985,8 +2879,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
>
>         nr_good_pages = maxpages - 1;   /* omit header page */
>
> -       cluster_list_init(&p->free_clusters);
> -       cluster_list_init(&p->discard_clusters);
> +       INIT_LIST_HEAD(&p->free_clusters);
> +       INIT_LIST_HEAD(&p->discard_clusters);
>
>         for (i = 0; i < swap_header->info.nr_badpages; i++) {
>                 unsigned int page_nr = swap_header->info.badpages[i];
> @@ -3037,14 +2931,15 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
>         for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
>                 j = (k + col) % SWAP_CLUSTER_COLS;
>                 for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
> +                       struct swap_cluster_info *ci;
>                         idx = i * SWAP_CLUSTER_COLS + j;
> +                       ci = cluster_info + idx;
>                         if (idx >= nr_clusters)
>                                 continue;
> -                       if (cluster_count(&cluster_info[idx]))
> +                       if (ci->count)
>                                 continue;
> -                       cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
> -                       cluster_list_add_tail(&p->free_clusters, cluster_info,
> -                                             idx);
> +                       ci->flags = CLUSTER_FLAG_FREE;
> +                       list_add_tail(&ci->next, &p->free_clusters);
>                 }
>         }
>         return nr_extents;
>
> --
> 2.45.1.288.g0e0cd299f1-goog
>
>
Chris Li May 28, 2024, 10:27 p.m. UTC | #2
Hi Kairui,


On Tue, May 28, 2024 at 9:24 AM Kairui Song <ryncsn@gmail.com> wrote:
>
> On Sat, May 25, 2024 at 1:17 AM Chris Li <chrisl@kernel.org> wrote:
> >
> > Previously, the swap cluster used a cluster index as a pointer
> > to construct a custom single link list type "swap_cluster_list".
> > The next cluster pointer is shared with the cluster->count.
> > The assumption is that only the free cluster needs to be put
> > on the list.
> >
> > That assumption is not true for mTHP allocators any more. Need
> > to track the non full cluster on the list as well.  Move the
> > current cluster single link list into standard double link list.
> >
> > Remove the cluster getter/setter for accessing the cluster
> > struct member.  Move the cluster locking in the caller function
> > rather than the getter/setter function. That way the locking can
> > protect more than one member, e.g. cluster->flag.
> >
> > Change cluster code to use "struct swap_cluster_info *" to
> > reference the cluster rather than by using index. That is more
> > consistent with the list manipulation. It avoids the repeat
> > adding index to the cluser_info. The code is easier to understand.
> >
> > Remove the cluster next pointer is NULL flag, the double link
> > list can handle the empty list pretty well.
> >
> > The "swap_cluster_info" struct is two pointer bigger, because
> > 512 swap entries share one swap struct, it has very little impact
> > on the average memory usage per swap entry.  Other than the list
> > conversion, there is no real function change in this patch.
> > ---
> >  include/linux/swap.h |  14 ++--
> >  mm/swapfile.c        | 231 ++++++++++++++-------------------------------------
> >  2 files changed, 68 insertions(+), 177 deletions(-)
> >
>
> Hi Chris,
>
> Thanks for this very nice clean up, the code is much easier to read.

Thanks for the review.

See my comments below. I am working on a V2 to address the two issues
identified so far.

BTW, I am pretty happy that the patch stats have many more deletions than insertions.

>
> > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > index 11c53692f65f..0d3906eff3c9 100644
> > --- a/include/linux/swap.h
> > +++ b/include/linux/swap.h
> > @@ -254,11 +254,12 @@ struct swap_cluster_info {
> >                                  * elements correspond to the swap
> >                                  * cluster
> >                                  */
> > -       unsigned int data:24;
> > +       unsigned int count:16;
> >         unsigned int flags:8;
> > +       struct list_head next;
> >  };
> >  #define CLUSTER_FLAG_FREE 1 /* This cluster is free */
> > -#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
> > +
> >
> >  /*
> >   * The first page in the swap file is the swap header, which is always marked
> > @@ -283,11 +284,6 @@ struct percpu_cluster {
> >         unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
> >  };
> >
> > -struct swap_cluster_list {
> > -       struct swap_cluster_info head;
> > -       struct swap_cluster_info tail;
> > -};
> > -
> >  /*
> >   * The in-memory structure used to track swap areas.
> >   */
> > @@ -300,7 +296,7 @@ struct swap_info_struct {
> >         unsigned int    max;            /* extent of the swap_map */
> >         unsigned char *swap_map;        /* vmalloc'ed array of usage counts */
> >         struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
> > -       struct swap_cluster_list free_clusters; /* free clusters list */
> > +       struct list_head free_clusters; /* free clusters list */
> >         unsigned int lowest_bit;        /* index of first free in swap_map */
> >         unsigned int highest_bit;       /* index of last free in swap_map */
> >         unsigned int pages;             /* total of usable pages of swap */
> > @@ -333,7 +329,7 @@ struct swap_info_struct {
> >                                          * list.
> >                                          */
> >         struct work_struct discard_work; /* discard worker */
> > -       struct swap_cluster_list discard_clusters; /* discard clusters list */
> > +       struct list_head discard_clusters; /* discard clusters list */
> >         struct plist_node avail_lists[]; /*
> >                                            * entries in swap_avail_heads, one
> >                                            * entry per node.
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index 4f0e8b2ac8aa..205a60c5f9cb 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -290,64 +290,11 @@ static void discard_swap_cluster(struct swap_info_struct *si,
> >  #endif
> >  #define LATENCY_LIMIT          256
> >
> > -static inline void cluster_set_flag(struct swap_cluster_info *info,
> > -       unsigned int flag)
> > -{
> > -       info->flags = flag;
> > -}
> > -
> > -static inline unsigned int cluster_count(struct swap_cluster_info *info)
> > -{
> > -       return info->data;
> > -}
> > -
> > -static inline void cluster_set_count(struct swap_cluster_info *info,
> > -                                    unsigned int c)
> > -{
> > -       info->data = c;
> > -}
> > -
> > -static inline void cluster_set_count_flag(struct swap_cluster_info *info,
> > -                                        unsigned int c, unsigned int f)
> > -{
> > -       info->flags = f;
> > -       info->data = c;
> > -}
> > -
> > -static inline unsigned int cluster_next(struct swap_cluster_info *info)
> > -{
> > -       return info->data;
> > -}
> > -
> > -static inline void cluster_set_next(struct swap_cluster_info *info,
> > -                                   unsigned int n)
> > -{
> > -       info->data = n;
> > -}
> > -
> > -static inline void cluster_set_next_flag(struct swap_cluster_info *info,
> > -                                        unsigned int n, unsigned int f)
> > -{
> > -       info->flags = f;
> > -       info->data = n;
> > -}
> > -
> >  static inline bool cluster_is_free(struct swap_cluster_info *info)
> >  {
> >         return info->flags & CLUSTER_FLAG_FREE;
> >  }
> >
> > -static inline bool cluster_is_null(struct swap_cluster_info *info)
> > -{
> > -       return info->flags & CLUSTER_FLAG_NEXT_NULL;
> > -}
> > -
> > -static inline void cluster_set_null(struct swap_cluster_info *info)
> > -{
> > -       info->flags = CLUSTER_FLAG_NEXT_NULL;
> > -       info->data = 0;
> > -}
> > -
> >  static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
> >                                                      unsigned long offset)
> >  {
> > @@ -394,65 +341,11 @@ static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
> >                 spin_unlock(&si->lock);
> >  }
> >
> > -static inline bool cluster_list_empty(struct swap_cluster_list *list)
> > -{
> > -       return cluster_is_null(&list->head);
> > -}
> > -
> > -static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
> > -{
> > -       return cluster_next(&list->head);
> > -}
> > -
> > -static void cluster_list_init(struct swap_cluster_list *list)
> > -{
> > -       cluster_set_null(&list->head);
> > -       cluster_set_null(&list->tail);
> > -}
> > -
> > -static void cluster_list_add_tail(struct swap_cluster_list *list,
> > -                                 struct swap_cluster_info *ci,
> > -                                 unsigned int idx)
> > -{
> > -       if (cluster_list_empty(list)) {
> > -               cluster_set_next_flag(&list->head, idx, 0);
> > -               cluster_set_next_flag(&list->tail, idx, 0);
> > -       } else {
> > -               struct swap_cluster_info *ci_tail;
> > -               unsigned int tail = cluster_next(&list->tail);
> > -
> > -               /*
> > -                * Nested cluster lock, but both cluster locks are
> > -                * only acquired when we held swap_info_struct->lock
> > -                */
> > -               ci_tail = ci + tail;
> > -               spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
> > -               cluster_set_next(ci_tail, idx);
> > -               spin_unlock(&ci_tail->lock);
> > -               cluster_set_next_flag(&list->tail, idx, 0);
> > -       }
> > -}
> > -
> > -static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
> > -                                          struct swap_cluster_info *ci)
> > -{
> > -       unsigned int idx;
> > -
> > -       idx = cluster_next(&list->head);
> > -       if (cluster_next(&list->tail) == idx) {
> > -               cluster_set_null(&list->head);
> > -               cluster_set_null(&list->tail);
> > -       } else
> > -               cluster_set_next_flag(&list->head,
> > -                                     cluster_next(&ci[idx]), 0);
> > -
> > -       return idx;
> > -}
> > -
> >  /* Add a cluster to discard list and schedule it to do discard */
> >  static void swap_cluster_schedule_discard(struct swap_info_struct *si,
> > -               unsigned int idx)
> > +               struct swap_cluster_info *ci)
> >  {
> > +       unsigned int idx = ci - si->cluster_info;
> >         /*
> >          * If scan_swap_map_slots() can't find a free cluster, it will check
> >          * si->swap_map directly. To make sure the discarding cluster isn't
> > @@ -462,17 +355,16 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
> >         memset(si->swap_map + idx * SWAPFILE_CLUSTER,
> >                         SWAP_MAP_BAD, SWAPFILE_CLUSTER);
> >
> > -       cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
> > -
> > +       spin_lock_nested(&ci->lock, SINGLE_DEPTH_NESTING);
> > +       list_add_tail(&ci->next, &si->discard_clusters);
> > +       spin_unlock(&ci->lock);
> >         schedule_work(&si->discard_work);
> >  }
> >
> > -static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
> > +static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
> >  {
> > -       struct swap_cluster_info *ci = si->cluster_info;
> > -
> > -       cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
> > -       cluster_list_add_tail(&si->free_clusters, ci, idx);
> > +       ci->flags = CLUSTER_FLAG_FREE;
> > +       list_add_tail(&ci->next, &si->free_clusters);
> >  }
> >
> >  /*
> > @@ -481,21 +373,21 @@ static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
> >  */
> >  static void swap_do_scheduled_discard(struct swap_info_struct *si)
> >  {
> > -       struct swap_cluster_info *info, *ci;
> > +       struct swap_cluster_info *ci;
> >         unsigned int idx;
> >
> > -       info = si->cluster_info;
> > -
> > -       while (!cluster_list_empty(&si->discard_clusters)) {
> > -               idx = cluster_list_del_first(&si->discard_clusters, info);
> > +       while (!list_empty(&si->discard_clusters)) {
> > +               ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, next);
> > +               idx = ci - si->cluster_info;
> >                 spin_unlock(&si->lock);
> >
> >                 discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
> >                                 SWAPFILE_CLUSTER);
> >
> >                 spin_lock(&si->lock);
> > -               ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
> > -               __free_cluster(si, idx);
> > +
> > +               spin_lock(&ci->lock);
> > +               __free_cluster(si, ci);
> >                 memset(si->swap_map + idx * SWAPFILE_CLUSTER,
> >                                 0, SWAPFILE_CLUSTER);
> >                 unlock_cluster(ci);
> > @@ -521,20 +413,20 @@ static void swap_users_ref_free(struct percpu_ref *ref)
> >         complete(&si->comp);
> >  }
> >
> > -static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
> > +static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsigned long idx)
> >  {
> > -       struct swap_cluster_info *ci = si->cluster_info;
> > +       struct swap_cluster_info *ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, next);
> >
> > -       VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
> > -       cluster_list_del_first(&si->free_clusters, ci);
> > -       cluster_set_count_flag(ci + idx, 0, 0);
> > +       VM_BUG_ON(ci - si->cluster_info != idx);
> > +       list_del(&ci->next);
> > +       ci->count = 0;
> > +       ci->flags = 0;
> > +       return ci;
> >  }
> >
> > -static void free_cluster(struct swap_info_struct *si, unsigned long idx)
> > +static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
> >  {
> > -       struct swap_cluster_info *ci = si->cluster_info + idx;
> > -
> > -       VM_BUG_ON(cluster_count(ci) != 0);
> > +       VM_BUG_ON(ci->count != 0);
> >         /*
> >          * If the swap is discardable, prepare discard the cluster
> >          * instead of free it immediately. The cluster will be freed
> > @@ -542,11 +434,11 @@ static void free_cluster(struct swap_info_struct *si, unsigned long idx)
> >          */
> >         if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
> >             (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
> > -               swap_cluster_schedule_discard(si, idx);
> > +               swap_cluster_schedule_discard(si, ci);
> >                 return;
> >         }
> >
> > -       __free_cluster(si, idx);
> > +       __free_cluster(si, ci);
> >  }
> >
> >  /*
> > @@ -559,15 +451,15 @@ static void add_cluster_info_page(struct swap_info_struct *p,
> >         unsigned long count)
> >  {
> >         unsigned long idx = page_nr / SWAPFILE_CLUSTER;
> > +       struct swap_cluster_info *ci = cluster_info + idx;
> >
> >         if (!cluster_info)
> >                 return;
> > -       if (cluster_is_free(&cluster_info[idx]))
> > +       if (cluster_is_free(ci))
> >                 alloc_cluster(p, idx);
> >
> > -       VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER);
> > -       cluster_set_count(&cluster_info[idx],
> > -               cluster_count(&cluster_info[idx]) + count);
> > +       VM_BUG_ON(ci->count + count > SWAPFILE_CLUSTER);
> > +       ci->count += count;
> >  }
> >
> >  /*
> > @@ -581,24 +473,20 @@ static void inc_cluster_info_page(struct swap_info_struct *p,
> >  }
> >
> >  /*
> > - * The cluster corresponding to page_nr decreases one usage. If the usage
> > - * counter becomes 0, which means no page in the cluster is in using, we can
> > - * optionally discard the cluster and add it to free cluster list.
> > + * The cluster ci decreases one usage. If the usage counter becomes 0,
> > + * which means no page in the cluster is in using, we can optionally discard
> > + * the cluster and add it to free cluster list.
> >   */
> > -static void dec_cluster_info_page(struct swap_info_struct *p,
> > -       struct swap_cluster_info *cluster_info, unsigned long page_nr)
> > +static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *ci)
> >  {
> > -       unsigned long idx = page_nr / SWAPFILE_CLUSTER;
> > -
> > -       if (!cluster_info)
> > +       if (!p->cluster_info)
> >                 return;
> >
> > -       VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
> > -       cluster_set_count(&cluster_info[idx],
> > -               cluster_count(&cluster_info[idx]) - 1);
> > +       VM_BUG_ON(ci->count == 0);
> > +       ci->count--;
> >
> > -       if (cluster_count(&cluster_info[idx]) == 0)
> > -               free_cluster(p, idx);
> > +       if (!ci->count)
> > +               free_cluster(p, ci);
> >  }
> >
> >  /*
> > @@ -611,10 +499,10 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
> >  {
>
> This whole scan_swap_map_ssd_cluster_conflict function seems not
> needed now. free_clusters is a double linked list, so using a cluster
> in the middle won't corrupt the list. The comments are still for the
> old list design.

I was debating removing the cluster_conflict() as well and found out
it can't be removed until we change the order 0 allocations to also use
clusters.
There can still be a conflict because the order 0 allocations just do
a brute force scan of swap_map[] when try_ssd fails. This causes
other problems as well. As far as I can tell, the conflict can still
happen.

>
> >         struct percpu_cluster *percpu_cluster;
> >         bool conflict;
> > -
> > +       struct swap_cluster_info *first = list_first_entry(&si->free_clusters, struct swap_cluster_info, next);
> >         offset /= SWAPFILE_CLUSTER;
> > -       conflict = !cluster_list_empty(&si->free_clusters) &&
> > -               offset != cluster_list_first(&si->free_clusters) &&
> > +       conflict = !list_empty(&si->free_clusters) &&
> > +               offset !=  first - si->cluster_info &&
> >                 cluster_is_free(&si->cluster_info[offset]);
> >
> >         if (!conflict)
> > @@ -655,10 +543,14 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
> >         cluster = this_cpu_ptr(si->percpu_cluster);
> >         tmp = cluster->next[order];
> >         if (tmp == SWAP_NEXT_INVALID) {
> > -               if (!cluster_list_empty(&si->free_clusters)) {
> > -                       tmp = cluster_next(&si->free_clusters.head) *
> > -                                       SWAPFILE_CLUSTER;
> > -               } else if (!cluster_list_empty(&si->discard_clusters)) {
> > +               if (!list_empty(&si->free_clusters)) {
> > +                       ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, next);
> > +                       list_del(&ci->next);
> > +                       spin_lock(&ci->lock);
>
> Shouldn't this list_del also be protected by ci->lock? It was
> protected in alloc_cluster before, keeping the flag synced with
> cluster status so cluster_is_free won't return false positive.

The list add and list del are protected by si->lock, not by the cluster lock.
Previously I wanted to use cluster->lock to protect it and realized
that adding/deleting the cluster to/from the list will change three
clusters (current, prev, next). We would need to take three cluster locks.
We might change to a per list spinlock, e.g. one lock for one list, to
reduce the contention on si->lock. However, the per cluster lock is not
enough if we only take one cluster lock.

>
> > +                       ci->flags = 0;
> > +                       spin_unlock(&ci->lock);
> > +                       tmp = (ci - si->cluster_info) * SWAPFILE_CLUSTER;
> > +               } else if (!list_empty(&si->discard_clusters)) {
> >                         /*
> >                          * we don't have free cluster but have some clusters in
> >                          * discarding, do discard now and reclaim them, then
> > @@ -670,7 +562,8 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
> >                         goto new_cluster;
> >                 } else
> >                         return false;
> > -       }
> > +       } else
> > +               ci = si->cluster_info + tmp;
>
> This "else ci = ..." seems wrong, tmp is not an array index, and not
> needed either.

Yes, there is a bug there, pointed out by OPPO as well. It should be
ci = si->cluster_info + (tmp/ SWAPFILE_CLUSTER);

"tmp" is needed because "tmp" or "cluster->next[order]" keeps track of
the current cluster allocation offset in the per cpu cluster struct.

BTW, in my V2 I have changed "tmp" to "offset" and the previous "offset"
to "retoffset" to make it more obvious. "tmp" does not give much
information about what it really does.

Chris

>
> >
> >         /*
> >          * Other CPUs can use our cluster if they can't find a free cluster,
> > @@ -1062,8 +955,9 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
> >
> >         ci = lock_cluster(si, offset);
> >         memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
> > -       cluster_set_count_flag(ci, 0, 0);
> > -       free_cluster(si, idx);
> > +       ci->count = 0;
> > +       ci->flags = 0;
> > +       free_cluster(si, ci);
> >         unlock_cluster(ci);
> >         swap_range_free(si, offset, SWAPFILE_CLUSTER);
> >  }
> > @@ -1336,7 +1230,7 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
> >         count = p->swap_map[offset];
> >         VM_BUG_ON(count != SWAP_HAS_CACHE);
> >         p->swap_map[offset] = 0;
> > -       dec_cluster_info_page(p, p->cluster_info, offset);
> > +       dec_cluster_info_page(p, ci);
> >         unlock_cluster(ci);
> >
> >         mem_cgroup_uncharge_swap(entry, 1);
> > @@ -2985,8 +2879,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
> >
> >         nr_good_pages = maxpages - 1;   /* omit header page */
> >
> > -       cluster_list_init(&p->free_clusters);
> > -       cluster_list_init(&p->discard_clusters);
> > +       INIT_LIST_HEAD(&p->free_clusters);
> > +       INIT_LIST_HEAD(&p->discard_clusters);
> >
> >         for (i = 0; i < swap_header->info.nr_badpages; i++) {
> >                 unsigned int page_nr = swap_header->info.badpages[i];
> > @@ -3037,14 +2931,15 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
> >         for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
> >                 j = (k + col) % SWAP_CLUSTER_COLS;
> >                 for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
> > +                       struct swap_cluster_info *ci;
> >                         idx = i * SWAP_CLUSTER_COLS + j;
> > +                       ci = cluster_info + idx;
> >                         if (idx >= nr_clusters)
> >                                 continue;
> > -                       if (cluster_count(&cluster_info[idx]))
> > +                       if (ci->count)
> >                                 continue;
> > -                       cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
> > -                       cluster_list_add_tail(&p->free_clusters, cluster_info,
> > -                                             idx);
> > +                       ci->flags = CLUSTER_FLAG_FREE;
> > +                       list_add_tail(&ci->next, &p->free_clusters);
> >                 }
> >         }
> >         return nr_extents;
> >
> > --
> > 2.45.1.288.g0e0cd299f1-goog
> >
> >
>
Chris Li May 29, 2024, 12:50 a.m. UTC | #3
On Tue, May 28, 2024 at 3:27 PM Chris Li <chriscli@google.com> wrote:
> > > @@ -670,7 +562,8 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
> > >                         goto new_cluster;
> > >                 } else
> > >                         return false;
> > > -       }
> > > +       } else
> > > +               ci = si->cluster_info + tmp;
> >
> > This "else ci = ..." seems wrong, tmp is not an array index, and not
> > needed either.
>
> Yes, there is a bug there, pointed out by OPPO as well. It should be
> ci = si->cluster_info + (tmp/ SWAPFILE_CLUSTER);
>
> "tmp" is needed because "tmp" or " cluster->next[order]" keep track of
> the current cluster allocation offset,
> in the per cpu cluster struct.

Hi Kairui,


Actually, you are right, the "ci" is not used here. That is why the
out-of-bounds "ci" does not trigger a kernel oops.
We can delete that else line completely.

Chris
Huang, Ying May 29, 2024, 8:46 a.m. UTC | #4
Chris Li <chrisl@kernel.org> writes:

> Previously, the swap cluster used a cluster index as a pointer
> to construct a custom single link list type "swap_cluster_list".
> The next cluster pointer is shared with the cluster->count.
> The assumption is that only the free cluster needs to be put
> on the list.
>
> That assumption is not true for mTHP allocators any more.

I think that the words aren't correct here.  mTHP swap entry allocators
can work with current cluster definition.

> Need to track the non full cluster on the list as well.  Move the
> current cluster single link list into standard double link list.

It's an optimization to track non-full cluster with a list.

I understand that you want to change cluster list definition.  I just
feel the wording isn't accurate.

> Remove the cluster getter/setter for accessing the cluster
> struct member.  Move the cluster locking in the caller function
> rather than the getter/setter function. That way the locking can
> protect more than one member, e.g. cluster->flag.

Sorry, I don't understand the locking in above words.  I don't find that
we lock/unlock in the original getter/setter functions.  I found that
the cluster locking rule for cluster list is changed.  Better to make
this explicit.

> Change cluster code to use "struct swap_cluster_info *" to
> reference the cluster rather than by using index. That is more
> consistent with the list manipulation. It avoids the repeat
> adding index to the cluser_info. The code is easier to understand.
>
> Remove the cluster next pointer is NULL flag, the double link
> list can handle the empty list pretty well.
>
> The "swap_cluster_info" struct is two pointer bigger, because
> 512 swap entries share one swap struct, it has very little impact
> on the average memory usage per swap entry.  Other than the list
> conversion, there is no real function change in this patch.

On 64bit platform, the size of swap_cluster_info increases from 8 bytes
to 24 bytes.  For a 1TB swap device, the memory usage will increase from
4MB to 12MB.  This looks OK for me.

Another choice is to use a customized double linked list using "unsigned
int" as the pointer to a cluster.  That will reduce the size of the cluster
to 16 bytes.  But it may not be necessary to do that.

Anyway, I think that it's better to add more calculation in changelog
for memory usage increment.

> ---
>  include/linux/swap.h |  14 ++--
>  mm/swapfile.c        | 231 ++++++++++++++-------------------------------------
>  2 files changed, 68 insertions(+), 177 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 11c53692f65f..0d3906eff3c9 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -254,11 +254,12 @@ struct swap_cluster_info {
>  				 * elements correspond to the swap
>  				 * cluster
>  				 */
> -	unsigned int data:24;
> +	unsigned int count:16;
>  	unsigned int flags:8;

If we use 16bits and 8 bits in bit fields, why not just use u8 and u16
instead?

> +	struct list_head next;

"next" isn't a good naming because prev pointer is in list_head too.
The common naming is "list".

Need to revise comments for swap_cluster_info.lock and add the locking
rule comments for swap_cluster_info.next.

>  };
>  #define CLUSTER_FLAG_FREE 1 /* This cluster is free */
> -#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
> +
>  
>  /*
>   * The first page in the swap file is the swap header, which is always marked
> @@ -283,11 +284,6 @@ struct percpu_cluster {
>  	unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
>  };
>  
> -struct swap_cluster_list {
> -	struct swap_cluster_info head;
> -	struct swap_cluster_info tail;
> -};
> -
>  /*
>   * The in-memory structure used to track swap areas.
>   */
> @@ -300,7 +296,7 @@ struct swap_info_struct {
>  	unsigned int	max;		/* extent of the swap_map */
>  	unsigned char *swap_map;	/* vmalloc'ed array of usage counts */
>  	struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
> -	struct swap_cluster_list free_clusters; /* free clusters list */
> +	struct list_head free_clusters; /* free clusters list */
>  	unsigned int lowest_bit;	/* index of first free in swap_map */
>  	unsigned int highest_bit;	/* index of last free in swap_map */
>  	unsigned int pages;		/* total of usable pages of swap */
> @@ -333,7 +329,7 @@ struct swap_info_struct {
>  					 * list.
>  					 */
>  	struct work_struct discard_work; /* discard worker */
> -	struct swap_cluster_list discard_clusters; /* discard clusters list */
> +	struct list_head discard_clusters; /* discard clusters list */
>  	struct plist_node avail_lists[]; /*
>  					   * entries in swap_avail_heads, one
>  					   * entry per node.
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 4f0e8b2ac8aa..205a60c5f9cb 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -290,64 +290,11 @@ static void discard_swap_cluster(struct swap_info_struct *si,
>  #endif
>  #define LATENCY_LIMIT		256
>  
> -static inline void cluster_set_flag(struct swap_cluster_info *info,
> -	unsigned int flag)
> -{
> -	info->flags = flag;
> -}
> -
> -static inline unsigned int cluster_count(struct swap_cluster_info *info)
> -{
> -	return info->data;
> -}
> -
> -static inline void cluster_set_count(struct swap_cluster_info *info,
> -				     unsigned int c)
> -{
> -	info->data = c;
> -}
> -
> -static inline void cluster_set_count_flag(struct swap_cluster_info *info,
> -					 unsigned int c, unsigned int f)
> -{
> -	info->flags = f;
> -	info->data = c;
> -}
> -
> -static inline unsigned int cluster_next(struct swap_cluster_info *info)
> -{
> -	return info->data;
> -}
> -
> -static inline void cluster_set_next(struct swap_cluster_info *info,
> -				    unsigned int n)
> -{
> -	info->data = n;
> -}
> -
> -static inline void cluster_set_next_flag(struct swap_cluster_info *info,
> -					 unsigned int n, unsigned int f)
> -{
> -	info->flags = f;
> -	info->data = n;
> -}
> -
>  static inline bool cluster_is_free(struct swap_cluster_info *info)
>  {
>  	return info->flags & CLUSTER_FLAG_FREE;
>  }
>  
> -static inline bool cluster_is_null(struct swap_cluster_info *info)
> -{
> -	return info->flags & CLUSTER_FLAG_NEXT_NULL;
> -}
> -
> -static inline void cluster_set_null(struct swap_cluster_info *info)
> -{
> -	info->flags = CLUSTER_FLAG_NEXT_NULL;
> -	info->data = 0;
> -}
> -
>  static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
>  						     unsigned long offset)
>  {
> @@ -394,65 +341,11 @@ static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
>  		spin_unlock(&si->lock);
>  }
>  
> -static inline bool cluster_list_empty(struct swap_cluster_list *list)
> -{
> -	return cluster_is_null(&list->head);
> -}
> -
> -static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
> -{
> -	return cluster_next(&list->head);
> -}
> -
> -static void cluster_list_init(struct swap_cluster_list *list)
> -{
> -	cluster_set_null(&list->head);
> -	cluster_set_null(&list->tail);
> -}
> -
> -static void cluster_list_add_tail(struct swap_cluster_list *list,
> -				  struct swap_cluster_info *ci,
> -				  unsigned int idx)
> -{
> -	if (cluster_list_empty(list)) {
> -		cluster_set_next_flag(&list->head, idx, 0);
> -		cluster_set_next_flag(&list->tail, idx, 0);
> -	} else {
> -		struct swap_cluster_info *ci_tail;
> -		unsigned int tail = cluster_next(&list->tail);
> -
> -		/*
> -		 * Nested cluster lock, but both cluster locks are
> -		 * only acquired when we held swap_info_struct->lock
> -		 */
> -		ci_tail = ci + tail;
> -		spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
> -		cluster_set_next(ci_tail, idx);
> -		spin_unlock(&ci_tail->lock);
> -		cluster_set_next_flag(&list->tail, idx, 0);
> -	}
> -}
> -
> -static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
> -					   struct swap_cluster_info *ci)
> -{
> -	unsigned int idx;
> -
> -	idx = cluster_next(&list->head);
> -	if (cluster_next(&list->tail) == idx) {
> -		cluster_set_null(&list->head);
> -		cluster_set_null(&list->tail);
> -	} else
> -		cluster_set_next_flag(&list->head,
> -				      cluster_next(&ci[idx]), 0);
> -
> -	return idx;
> -}
> -
>  /* Add a cluster to discard list and schedule it to do discard */
>  static void swap_cluster_schedule_discard(struct swap_info_struct *si,
> -		unsigned int idx)
> +		struct swap_cluster_info *ci)
>  {
> +	unsigned int idx = ci - si->cluster_info;
>  	/*
>  	 * If scan_swap_map_slots() can't find a free cluster, it will check
>  	 * si->swap_map directly. To make sure the discarding cluster isn't
> @@ -462,17 +355,16 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
>  	memset(si->swap_map + idx * SWAPFILE_CLUSTER,
>  			SWAP_MAP_BAD, SWAPFILE_CLUSTER);
>  
> -	cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
> -
> +	spin_lock_nested(&ci->lock, SINGLE_DEPTH_NESTING);

If we don't use ci->lock to protect ci->next, we don't need spin_lock here.

> +	list_add_tail(&ci->next, &si->discard_clusters);
> +	spin_unlock(&ci->lock);
>  	schedule_work(&si->discard_work);
>  }
>  
> -static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
> +static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
>  {
> -	struct swap_cluster_info *ci = si->cluster_info;
> -
> -	cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
> -	cluster_list_add_tail(&si->free_clusters, ci, idx);
> +	ci->flags = CLUSTER_FLAG_FREE;
> +	list_add_tail(&ci->next, &si->free_clusters);
>  }
>  
>  /*
> @@ -481,21 +373,21 @@ static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
>  */
>  static void swap_do_scheduled_discard(struct swap_info_struct *si)
>  {
> -	struct swap_cluster_info *info, *ci;
> +	struct swap_cluster_info *ci;
>  	unsigned int idx;
>  
> -	info = si->cluster_info;
> -
> -	while (!cluster_list_empty(&si->discard_clusters)) {
> -		idx = cluster_list_del_first(&si->discard_clusters, info);
> +	while (!list_empty(&si->discard_clusters)) {
> +		ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, next);
> +		idx = ci - si->cluster_info;
>  		spin_unlock(&si->lock);
>  
>  		discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
>  				SWAPFILE_CLUSTER);
>  
>  		spin_lock(&si->lock);
> -		ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
> -		__free_cluster(si, idx);
> +
> +		spin_lock(&ci->lock);
> +		__free_cluster(si, ci);
>  		memset(si->swap_map + idx * SWAPFILE_CLUSTER,
>  				0, SWAPFILE_CLUSTER);
>  		unlock_cluster(ci);
> @@ -521,20 +413,20 @@ static void swap_users_ref_free(struct percpu_ref *ref)
>  	complete(&si->comp);
>  }
>  
> -static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
> +static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>  {
> -	struct swap_cluster_info *ci = si->cluster_info;
> +	struct swap_cluster_info *ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, next);
>  
> -	VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
> -	cluster_list_del_first(&si->free_clusters, ci);
> -	cluster_set_count_flag(ci + idx, 0, 0);
> +	VM_BUG_ON(ci - si->cluster_info != idx);
> +	list_del(&ci->next);
> +	ci->count = 0;
> +	ci->flags = 0;
> +	return ci;
>  }
>  
> -static void free_cluster(struct swap_info_struct *si, unsigned long idx)
> +static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
>  {
> -	struct swap_cluster_info *ci = si->cluster_info + idx;
> -
> -	VM_BUG_ON(cluster_count(ci) != 0);
> +	VM_BUG_ON(ci->count != 0);
>  	/*
>  	 * If the swap is discardable, prepare discard the cluster
>  	 * instead of free it immediately. The cluster will be freed
> @@ -542,11 +434,11 @@ static void free_cluster(struct swap_info_struct *si, unsigned long idx)
>  	 */
>  	if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
>  	    (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
> -		swap_cluster_schedule_discard(si, idx);
> +		swap_cluster_schedule_discard(si, ci);
>  		return;
>  	}
>  
> -	__free_cluster(si, idx);
> +	__free_cluster(si, ci);
>  }
>  
>  /*
> @@ -559,15 +451,15 @@ static void add_cluster_info_page(struct swap_info_struct *p,
>  	unsigned long count)
>  {
>  	unsigned long idx = page_nr / SWAPFILE_CLUSTER;
> +	struct swap_cluster_info *ci = cluster_info + idx;
>  
>  	if (!cluster_info)
>  		return;
> -	if (cluster_is_free(&cluster_info[idx]))
> +	if (cluster_is_free(ci))
>  		alloc_cluster(p, idx);
>  
> -	VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER);
> -	cluster_set_count(&cluster_info[idx],
> -		cluster_count(&cluster_info[idx]) + count);
> +	VM_BUG_ON(ci->count + count > SWAPFILE_CLUSTER);
> +	ci->count += count;
>  }
>  
>  /*
> @@ -581,24 +473,20 @@ static void inc_cluster_info_page(struct swap_info_struct *p,
>  }
>  
>  /*
> - * The cluster corresponding to page_nr decreases one usage. If the usage
> - * counter becomes 0, which means no page in the cluster is in using, we can
> - * optionally discard the cluster and add it to free cluster list.
> + * The cluster ci decreases one usage. If the usage counter becomes 0,
> + * which means no page in the cluster is in using, we can optionally discard
> + * the cluster and add it to free cluster list.
>   */
> -static void dec_cluster_info_page(struct swap_info_struct *p,
> -	struct swap_cluster_info *cluster_info, unsigned long page_nr)
> +static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *ci)
>  {
> -	unsigned long idx = page_nr / SWAPFILE_CLUSTER;
> -
> -	if (!cluster_info)
> +	if (!p->cluster_info)
>  		return;
>  
> -	VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
> -	cluster_set_count(&cluster_info[idx],
> -		cluster_count(&cluster_info[idx]) - 1);
> +	VM_BUG_ON(ci->count == 0);
> +	ci->count--;
>  
> -	if (cluster_count(&cluster_info[idx]) == 0)
> -		free_cluster(p, idx);
> +	if (!ci->count)
> +		free_cluster(p, ci);
>  }
>  
>  /*
> @@ -611,10 +499,10 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
>  {
>  	struct percpu_cluster *percpu_cluster;
>  	bool conflict;
> -
> +	struct swap_cluster_info *first = list_first_entry(&si->free_clusters, struct swap_cluster_info, next);
>  	offset /= SWAPFILE_CLUSTER;
> -	conflict = !cluster_list_empty(&si->free_clusters) &&
> -		offset != cluster_list_first(&si->free_clusters) &&
> +	conflict = !list_empty(&si->free_clusters) &&
> +		offset !=  first - si->cluster_info &&
>  		cluster_is_free(&si->cluster_info[offset]);
>  
>  	if (!conflict)
> @@ -655,10 +543,14 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
>  	cluster = this_cpu_ptr(si->percpu_cluster);
>  	tmp = cluster->next[order];
>  	if (tmp == SWAP_NEXT_INVALID) {
> -		if (!cluster_list_empty(&si->free_clusters)) {
> -			tmp = cluster_next(&si->free_clusters.head) *
> -					SWAPFILE_CLUSTER;
> -		} else if (!cluster_list_empty(&si->discard_clusters)) {
> +		if (!list_empty(&si->free_clusters)) {
> +			ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, next);
> +			list_del(&ci->next);
> +			spin_lock(&ci->lock);
> +			ci->flags = 0;
> +			spin_unlock(&ci->lock);
> +			tmp = (ci - si->cluster_info) * SWAPFILE_CLUSTER;
> +		} else if (!list_empty(&si->discard_clusters)) {
>  			/*
>  			 * we don't have free cluster but have some clusters in
>  			 * discarding, do discard now and reclaim them, then
> @@ -670,7 +562,8 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
>  			goto new_cluster;
>  		} else
>  			return false;
> -	}
> +	} else
> +		ci = si->cluster_info + tmp;
>  
>  	/*
>  	 * Other CPUs can use our cluster if they can't find a free cluster,
> @@ -1062,8 +955,9 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
>  
>  	ci = lock_cluster(si, offset);
>  	memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
> -	cluster_set_count_flag(ci, 0, 0);
> -	free_cluster(si, idx);
> +	ci->count = 0;
> +	ci->flags = 0;
> +	free_cluster(si, ci);
>  	unlock_cluster(ci);
>  	swap_range_free(si, offset, SWAPFILE_CLUSTER);
>  }
> @@ -1336,7 +1230,7 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
>  	count = p->swap_map[offset];
>  	VM_BUG_ON(count != SWAP_HAS_CACHE);
>  	p->swap_map[offset] = 0;
> -	dec_cluster_info_page(p, p->cluster_info, offset);
> +	dec_cluster_info_page(p, ci);
>  	unlock_cluster(ci);
>  
>  	mem_cgroup_uncharge_swap(entry, 1);
> @@ -2985,8 +2879,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
>  
>  	nr_good_pages = maxpages - 1;	/* omit header page */
>  
> -	cluster_list_init(&p->free_clusters);
> -	cluster_list_init(&p->discard_clusters);
> +	INIT_LIST_HEAD(&p->free_clusters);
> +	INIT_LIST_HEAD(&p->discard_clusters);
>  
>  	for (i = 0; i < swap_header->info.nr_badpages; i++) {
>  		unsigned int page_nr = swap_header->info.badpages[i];
> @@ -3037,14 +2931,15 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
>  	for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
>  		j = (k + col) % SWAP_CLUSTER_COLS;
>  		for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
> +			struct swap_cluster_info *ci;
>  			idx = i * SWAP_CLUSTER_COLS + j;
> +			ci = cluster_info + idx;
>  			if (idx >= nr_clusters)
>  				continue;
> -			if (cluster_count(&cluster_info[idx]))
> +			if (ci->count)
>  				continue;
> -			cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
> -			cluster_list_add_tail(&p->free_clusters, cluster_info,
> -					      idx);
> +			ci->flags = CLUSTER_FLAG_FREE;
> +			list_add_tail(&ci->next, &p->free_clusters);
>  		}
>  	}
>  	return nr_extents;

--
Best Regards,
Huang, Ying
Chris Li May 30, 2024, 9:49 p.m. UTC | #5
On Wed, May 29, 2024 at 1:48 AM Huang, Ying <ying.huang@intel.com> wrote:
>
> Chris Li <chrisl@kernel.org> writes:
>
> > Previously, the swap cluster used a cluster index as a pointer
> > to construct a custom single link list type "swap_cluster_list".
> > The next cluster pointer is shared with the cluster->count.
> > The assumption is that only the free cluster needs to be put
> > on the list.
> >
> > That assumption is not true for mTHP allocators any more.
>
> I think that the words aren't correct here.  mTHP swap entry allocators
> can work with current cluster definition.

The current behavior is very problematic though:

If we only allocate and free order 4 swap entries, nothing else. After
a while, the free cluster will be used up, the swap entry allocation
will fail even though there is a lot of swap space left.

> > Need to track the non full cluster on the list as well.  Move the
> > current cluster single link list into standard double link list.
>
> It's an optimization to track non-full cluster with a list.
>
> I understand that you want to change cluster list definition.  I just

In my mind, I was changing the list implementation so that it can
track non-free clusters as well.

> feel the wording isn't accurate.

Help me improve it. I am happy to adjust the wording in V2, you can
provide more feedback then.

>
> > Remove the cluster getter/setter for accessing the cluster
> > struct member.  Move the cluster locking in the caller function
> > rather than the getter/setter function. That way the locking can
> > protect more than one member, e.g. cluster->flag.
>
> Sorry, I don't understand the locking in above words.  I don't find that
> we lock/unlock in the original getter/setter functions.  I found that
> the cluster locking rule for cluster list is changed.  Better to make
> this explicit.

The original cluster single link list add/remove will require si->lock
protection as well, because the list head and tail pointer are outside
of the cluster pointer.
In this regard, the cluster double link list locking rule is very
similar. Yes, I move the list_del() outside of the cluster lock, is
that what you are referring to as the locking change?

> > Change cluster code to use "struct swap_cluster_info *" to
> > reference the cluster rather than by using index. That is more
> > consistent with the list manipulation. It avoids the repeat
> > adding index to the cluser_info. The code is easier to understand.
> >
> > Remove the cluster next pointer is NULL flag, the double link
> > list can handle the empty list pretty well.
> >
> > The "swap_cluster_info" struct is two pointer bigger, because
> > 512 swap entries share one swap struct, it has very little impact
> > on the average memory usage per swap entry.  Other than the list
> > conversion, there is no real function change in this patch.
>
> On 64bit platform, the size of swap_cluster_info increases from 8 bytes
> to 24 bytes.  For a 1TB swap device, the memory usage will increase from
> 4MB to 12MB.  This looks OK for me.

Will add the size change calculation in V2 and have you review it again.

>
> Another choice is to use a customized double linked list using "unsigned
> int" as pointer to cluster.  That will reduce the size of cluster to 16
> bytes.  But it may be not necessary to do that.

We can always do that as a follow-up step to shrink the struct from 24
bytes to 16 bytes, at the price of more code complexity.
The tricky part is the linked list head: it is not part of the cluster
array, so it does not have an index and will need special handling.

>
> Anyway, I think that it's better to add more calculation in changelog
> for memory usage increment.

Sure, I will adjust the commit message in V2.

Chris

>
> > ---
> >  include/linux/swap.h |  14 ++--
> >  mm/swapfile.c        | 231 ++++++++++++++-------------------------------------
> >  2 files changed, 68 insertions(+), 177 deletions(-)
> >
> > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > index 11c53692f65f..0d3906eff3c9 100644
> > --- a/include/linux/swap.h
> > +++ b/include/linux/swap.h
> > @@ -254,11 +254,12 @@ struct swap_cluster_info {
> >                                * elements correspond to the swap
> >                                * cluster
> >                                */
> > -     unsigned int data:24;
> > +     unsigned int count:16;
> >       unsigned int flags:8;
>
> If we use 16bits and 8 bits in bit fields, why not just use u8 and u16
> instead?
Not sure about the

>
> > +     struct list_head next;
>
> "next" isn't a good naming because prev pointer is in list_head too.
> The common naming is "list".

Sure, I can change it to "list".

>
> Need to revise comments for swap_cluster_info.lock and add the locking
> rule comments for swap_cluster_info.next.

Will do.

>
> >  };
> >  #define CLUSTER_FLAG_FREE 1 /* This cluster is free */
> > -#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
> > +
> >
> >  /*
> >   * The first page in the swap file is the swap header, which is always marked
> > @@ -283,11 +284,6 @@ struct percpu_cluster {
> >       unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
> >  };
> >
> > -struct swap_cluster_list {
> > -     struct swap_cluster_info head;
> > -     struct swap_cluster_info tail;
> > -};
> > -
> >  /*
> >   * The in-memory structure used to track swap areas.
> >   */
> > @@ -300,7 +296,7 @@ struct swap_info_struct {
> >       unsigned int    max;            /* extent of the swap_map */
> >       unsigned char *swap_map;        /* vmalloc'ed array of usage counts */
> >       struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
> > -     struct swap_cluster_list free_clusters; /* free clusters list */
> > +     struct list_head free_clusters; /* free clusters list */
> >       unsigned int lowest_bit;        /* index of first free in swap_map */
> >       unsigned int highest_bit;       /* index of last free in swap_map */
> >       unsigned int pages;             /* total of usable pages of swap */
> > @@ -333,7 +329,7 @@ struct swap_info_struct {
> >                                        * list.
> >                                        */
> >       struct work_struct discard_work; /* discard worker */
> > -     struct swap_cluster_list discard_clusters; /* discard clusters list */
> > +     struct list_head discard_clusters; /* discard clusters list */
> >       struct plist_node avail_lists[]; /*
> >                                          * entries in swap_avail_heads, one
> >                                          * entry per node.
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index 4f0e8b2ac8aa..205a60c5f9cb 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -290,64 +290,11 @@ static void discard_swap_cluster(struct swap_info_struct *si,
> >  #endif
> >  #define LATENCY_LIMIT                256
> >
> > -static inline void cluster_set_flag(struct swap_cluster_info *info,
> > -     unsigned int flag)
> > -{
> > -     info->flags = flag;
> > -}
> > -
> > -static inline unsigned int cluster_count(struct swap_cluster_info *info)
> > -{
> > -     return info->data;
> > -}
> > -
> > -static inline void cluster_set_count(struct swap_cluster_info *info,
> > -                                  unsigned int c)
> > -{
> > -     info->data = c;
> > -}
> > -
> > -static inline void cluster_set_count_flag(struct swap_cluster_info *info,
> > -                                      unsigned int c, unsigned int f)
> > -{
> > -     info->flags = f;
> > -     info->data = c;
> > -}
> > -
> > -static inline unsigned int cluster_next(struct swap_cluster_info *info)
> > -{
> > -     return info->data;
> > -}
> > -
> > -static inline void cluster_set_next(struct swap_cluster_info *info,
> > -                                 unsigned int n)
> > -{
> > -     info->data = n;
> > -}
> > -
> > -static inline void cluster_set_next_flag(struct swap_cluster_info *info,
> > -                                      unsigned int n, unsigned int f)
> > -{
> > -     info->flags = f;
> > -     info->data = n;
> > -}
> > -
> >  static inline bool cluster_is_free(struct swap_cluster_info *info)
> >  {
> >       return info->flags & CLUSTER_FLAG_FREE;
> >  }
> >
> > -static inline bool cluster_is_null(struct swap_cluster_info *info)
> > -{
> > -     return info->flags & CLUSTER_FLAG_NEXT_NULL;
> > -}
> > -
> > -static inline void cluster_set_null(struct swap_cluster_info *info)
> > -{
> > -     info->flags = CLUSTER_FLAG_NEXT_NULL;
> > -     info->data = 0;
> > -}
> > -
> >  static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
> >                                                    unsigned long offset)
> >  {
> > @@ -394,65 +341,11 @@ static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
> >               spin_unlock(&si->lock);
> >  }
> >
> > -static inline bool cluster_list_empty(struct swap_cluster_list *list)
> > -{
> > -     return cluster_is_null(&list->head);
> > -}
> > -
> > -static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
> > -{
> > -     return cluster_next(&list->head);
> > -}
> > -
> > -static void cluster_list_init(struct swap_cluster_list *list)
> > -{
> > -     cluster_set_null(&list->head);
> > -     cluster_set_null(&list->tail);
> > -}
> > -
> > -static void cluster_list_add_tail(struct swap_cluster_list *list,
> > -                               struct swap_cluster_info *ci,
> > -                               unsigned int idx)
> > -{
> > -     if (cluster_list_empty(list)) {
> > -             cluster_set_next_flag(&list->head, idx, 0);
> > -             cluster_set_next_flag(&list->tail, idx, 0);
> > -     } else {
> > -             struct swap_cluster_info *ci_tail;
> > -             unsigned int tail = cluster_next(&list->tail);
> > -
> > -             /*
> > -              * Nested cluster lock, but both cluster locks are
> > -              * only acquired when we held swap_info_struct->lock
> > -              */
> > -             ci_tail = ci + tail;
> > -             spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
> > -             cluster_set_next(ci_tail, idx);
> > -             spin_unlock(&ci_tail->lock);
> > -             cluster_set_next_flag(&list->tail, idx, 0);
> > -     }
> > -}
> > -
> > -static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
> > -                                        struct swap_cluster_info *ci)
> > -{
> > -     unsigned int idx;
> > -
> > -     idx = cluster_next(&list->head);
> > -     if (cluster_next(&list->tail) == idx) {
> > -             cluster_set_null(&list->head);
> > -             cluster_set_null(&list->tail);
> > -     } else
> > -             cluster_set_next_flag(&list->head,
> > -                                   cluster_next(&ci[idx]), 0);
> > -
> > -     return idx;
> > -}
> > -
> >  /* Add a cluster to discard list and schedule it to do discard */
> >  static void swap_cluster_schedule_discard(struct swap_info_struct *si,
> > -             unsigned int idx)
> > +             struct swap_cluster_info *ci)
> >  {
> > +     unsigned int idx = ci - si->cluster_info;
> >       /*
> >        * If scan_swap_map_slots() can't find a free cluster, it will check
> >        * si->swap_map directly. To make sure the discarding cluster isn't
> > @@ -462,17 +355,16 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
> >       memset(si->swap_map + idx * SWAPFILE_CLUSTER,
> >                       SWAP_MAP_BAD, SWAPFILE_CLUSTER);
> >
> > -     cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
> > -
> > +     spin_lock_nested(&ci->lock, SINGLE_DEPTH_NESTING);
>
> If we don't use ci->lock to protect ci->next, we don't need spin_lock here.

Good point. Thanks.

>
> > +     list_add_tail(&ci->next, &si->discard_clusters);
> > +     spin_unlock(&ci->lock);
> >       schedule_work(&si->discard_work);
> >  }
> >
> > -static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
> > +static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
> >  {
> > -     struct swap_cluster_info *ci = si->cluster_info;
> > -
> > -     cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
> > -     cluster_list_add_tail(&si->free_clusters, ci, idx);
> > +     ci->flags = CLUSTER_FLAG_FREE;
> > +     list_add_tail(&ci->next, &si->free_clusters);
> >  }
> >
> >  /*
> > @@ -481,21 +373,21 @@ static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
> >  */
> >  static void swap_do_scheduled_discard(struct swap_info_struct *si)
> >  {
> > -     struct swap_cluster_info *info, *ci;
> > +     struct swap_cluster_info *ci;
> >       unsigned int idx;
> >
> > -     info = si->cluster_info;
> > -
> > -     while (!cluster_list_empty(&si->discard_clusters)) {
> > -             idx = cluster_list_del_first(&si->discard_clusters, info);
> > +     while (!list_empty(&si->discard_clusters)) {
> > +             ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, next);
> > +             idx = ci - si->cluster_info;
> >               spin_unlock(&si->lock);
> >
> >               discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
> >                               SWAPFILE_CLUSTER);
> >
> >               spin_lock(&si->lock);
> > -             ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
> > -             __free_cluster(si, idx);
> > +
> > +             spin_lock(&ci->lock);
> > +             __free_cluster(si, ci);
> >               memset(si->swap_map + idx * SWAPFILE_CLUSTER,
> >                               0, SWAPFILE_CLUSTER);
> >               unlock_cluster(ci);
> > @@ -521,20 +413,20 @@ static void swap_users_ref_free(struct percpu_ref *ref)
> >       complete(&si->comp);
> >  }
> >
> > -static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
> > +static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsigned long idx)
> >  {
> > -     struct swap_cluster_info *ci = si->cluster_info;
> > +     struct swap_cluster_info *ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, next);
> >
> > -     VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
> > -     cluster_list_del_first(&si->free_clusters, ci);
> > -     cluster_set_count_flag(ci + idx, 0, 0);
> > +     VM_BUG_ON(ci - si->cluster_info != idx);
> > +     list_del(&ci->next);
> > +     ci->count = 0;
> > +     ci->flags = 0;
> > +     return ci;
> >  }
> >
> > -static void free_cluster(struct swap_info_struct *si, unsigned long idx)
> > +static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
> >  {
> > -     struct swap_cluster_info *ci = si->cluster_info + idx;
> > -
> > -     VM_BUG_ON(cluster_count(ci) != 0);
> > +     VM_BUG_ON(ci->count != 0);
> >       /*
> >        * If the swap is discardable, prepare discard the cluster
> >        * instead of free it immediately. The cluster will be freed
> > @@ -542,11 +434,11 @@ static void free_cluster(struct swap_info_struct *si, unsigned long idx)
> >        */
> >       if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
> >           (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
> > -             swap_cluster_schedule_discard(si, idx);
> > +             swap_cluster_schedule_discard(si, ci);
> >               return;
> >       }
> >
> > -     __free_cluster(si, idx);
> > +     __free_cluster(si, ci);
> >  }
> >
> >  /*
> > @@ -559,15 +451,15 @@ static void add_cluster_info_page(struct swap_info_struct *p,
> >       unsigned long count)
> >  {
> >       unsigned long idx = page_nr / SWAPFILE_CLUSTER;
> > +     struct swap_cluster_info *ci = cluster_info + idx;
> >
> >       if (!cluster_info)
> >               return;
> > -     if (cluster_is_free(&cluster_info[idx]))
> > +     if (cluster_is_free(ci))
> >               alloc_cluster(p, idx);
> >
> > -     VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER);
> > -     cluster_set_count(&cluster_info[idx],
> > -             cluster_count(&cluster_info[idx]) + count);
> > +     VM_BUG_ON(ci->count + count > SWAPFILE_CLUSTER);
> > +     ci->count += count;
> >  }
> >
> >  /*
> > @@ -581,24 +473,20 @@ static void inc_cluster_info_page(struct swap_info_struct *p,
> >  }
> >
> >  /*
> > - * The cluster corresponding to page_nr decreases one usage. If the usage
> > - * counter becomes 0, which means no page in the cluster is in using, we can
> > - * optionally discard the cluster and add it to free cluster list.
> > + * The cluster ci decreases one usage. If the usage counter becomes 0,
> > + * which means no page in the cluster is in using, we can optionally discard
> > + * the cluster and add it to free cluster list.
> >   */
> > -static void dec_cluster_info_page(struct swap_info_struct *p,
> > -     struct swap_cluster_info *cluster_info, unsigned long page_nr)
> > +static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *ci)
> >  {
> > -     unsigned long idx = page_nr / SWAPFILE_CLUSTER;
> > -
> > -     if (!cluster_info)
> > +     if (!p->cluster_info)
> >               return;
> >
> > -     VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
> > -     cluster_set_count(&cluster_info[idx],
> > -             cluster_count(&cluster_info[idx]) - 1);
> > +     VM_BUG_ON(ci->count == 0);
> > +     ci->count--;
> >
> > -     if (cluster_count(&cluster_info[idx]) == 0)
> > -             free_cluster(p, idx);
> > +     if (!ci->count)
> > +             free_cluster(p, ci);
> >  }
> >
> >  /*
> > @@ -611,10 +499,10 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
> >  {
> >       struct percpu_cluster *percpu_cluster;
> >       bool conflict;
> > -
> > +     struct swap_cluster_info *first = list_first_entry(&si->free_clusters, struct swap_cluster_info, next);
> >       offset /= SWAPFILE_CLUSTER;
> > -     conflict = !cluster_list_empty(&si->free_clusters) &&
> > -             offset != cluster_list_first(&si->free_clusters) &&
> > +     conflict = !list_empty(&si->free_clusters) &&
> > +             offset !=  first - si->cluster_info &&
> >               cluster_is_free(&si->cluster_info[offset]);
> >
> >       if (!conflict)
> > @@ -655,10 +543,14 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
> >       cluster = this_cpu_ptr(si->percpu_cluster);
> >       tmp = cluster->next[order];
> >       if (tmp == SWAP_NEXT_INVALID) {
> > -             if (!cluster_list_empty(&si->free_clusters)) {
> > -                     tmp = cluster_next(&si->free_clusters.head) *
> > -                                     SWAPFILE_CLUSTER;
> > -             } else if (!cluster_list_empty(&si->discard_clusters)) {
> > +             if (!list_empty(&si->free_clusters)) {
> > +                     ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, next);
> > +                     list_del(&ci->next);
> > +                     spin_lock(&ci->lock);
> > +                     ci->flags = 0;
> > +                     spin_unlock(&ci->lock);
> > +                     tmp = (ci - si->cluster_info) * SWAPFILE_CLUSTER;
> > +             } else if (!list_empty(&si->discard_clusters)) {
> >                       /*
> >                        * we don't have free cluster but have some clusters in
> >                        * discarding, do discard now and reclaim them, then
> > @@ -670,7 +562,8 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
> >                       goto new_cluster;
> >               } else
> >                       return false;
> > -     }
> > +     } else
> > +             ci = si->cluster_info + tmp;
> >
> >       /*
> >        * Other CPUs can use our cluster if they can't find a free cluster,
> > @@ -1062,8 +955,9 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
> >
> >       ci = lock_cluster(si, offset);
> >       memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
> > -     cluster_set_count_flag(ci, 0, 0);
> > -     free_cluster(si, idx);
> > +     ci->count = 0;
> > +     ci->flags = 0;
> > +     free_cluster(si, ci);
> >       unlock_cluster(ci);
> >       swap_range_free(si, offset, SWAPFILE_CLUSTER);
> >  }
> > @@ -1336,7 +1230,7 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
> >       count = p->swap_map[offset];
> >       VM_BUG_ON(count != SWAP_HAS_CACHE);
> >       p->swap_map[offset] = 0;
> > -     dec_cluster_info_page(p, p->cluster_info, offset);
> > +     dec_cluster_info_page(p, ci);
> >       unlock_cluster(ci);
> >
> >       mem_cgroup_uncharge_swap(entry, 1);
> > @@ -2985,8 +2879,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
> >
> >       nr_good_pages = maxpages - 1;   /* omit header page */
> >
> > -     cluster_list_init(&p->free_clusters);
> > -     cluster_list_init(&p->discard_clusters);
> > +     INIT_LIST_HEAD(&p->free_clusters);
> > +     INIT_LIST_HEAD(&p->discard_clusters);
> >
> >       for (i = 0; i < swap_header->info.nr_badpages; i++) {
> >               unsigned int page_nr = swap_header->info.badpages[i];
> > @@ -3037,14 +2931,15 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
> >       for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
> >               j = (k + col) % SWAP_CLUSTER_COLS;
> >               for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
> > +                     struct swap_cluster_info *ci;
> >                       idx = i * SWAP_CLUSTER_COLS + j;
> > +                     ci = cluster_info + idx;
> >                       if (idx >= nr_clusters)
> >                               continue;
> > -                     if (cluster_count(&cluster_info[idx]))
> > +                     if (ci->count)
> >                               continue;
> > -                     cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
> > -                     cluster_list_add_tail(&p->free_clusters, cluster_info,
> > -                                           idx);
> > +                     ci->flags = CLUSTER_FLAG_FREE;
> > +                     list_add_tail(&ci->next, &p->free_clusters);
> >               }
> >       }
> >       return nr_extents;
>
> --
> Best Regards,
> Huang, Ying
>
Huang, Ying May 31, 2024, 2:03 a.m. UTC | #6
Chris Li <chrisl@kernel.org> writes:

> On Wed, May 29, 2024 at 1:48 AM Huang, Ying <ying.huang@intel.com> wrote:
>>
>> Chris Li <chrisl@kernel.org> writes:
>>
>> > Previously, the swap cluster used a cluster index as a pointer
>> > to construct a custom single link list type "swap_cluster_list".
>> > The next cluster pointer is shared with the cluster->count.
>> > The assumption is that only the free cluster needs to be put
>> > on the list.
>> >
>> > That assumption is not true for mTHP allocators any more.
>>
>> I think that the words aren't correct here.  mTHP swap entry allocators
>> can work with current cluster definition.
>
> The current behavior is very problematic though:
>
> If we only allocate and free order 4 swap entries, nothing else. After
> a while, the free cluster will be used up, the swap entry allocation
> will fail even though there is a lot of swap space left.

The original behavior doesn't work well for order-0 allocation either.
percpu_cluster and the quick (cluster) scan path cannot be used for
fragmented swap devices.

>> > Need to track the non full cluster on the list as well.  Move the
>> > current cluster single link list into standard double link list.
>>
>> It's an optimization to track non-full cluster with a list.
>>
>> I understand that you want to change cluster list definition.  I just
>
> In my mind, I was changing the list implementation so it can be
> tracked non free cluster as well.
>
>> feel the wording isn't accurate.
>
> Help me improve it. I am happy to adjust the wording in V2, you can
> provide more feedback then.

I suggest that you focus on the improvement.  The original implementation
never assumed that it was the best or perfect.  It improved on its base,
and you can continue to improve it for more situations.  Describe the
situation where the current implementation doesn't perform well and how
you improve it, preferably together with the cost.

>>
>> > Remove the cluster getter/setter for accessing the cluster
>> > struct member.  Move the cluster locking in the caller function
>> > rather than the getter/setter function. That way the locking can
>> > protect more than one member, e.g. cluster->flag.
>>
>> Sorry, I don't understand the locking in above words.  I don't find that
>> we lock/unlock in the original getter/setter functions.  I found that
>> the cluster locking rule for cluster list is changed.  Better to make
>> this explicit.
>
> The original cluster single link list add/remove will require si->lock
> protection as well, because the list head and tail pointer are outside
> of the cluster pointer.
> In this regard, the cluster double link list locking rule is very
> similar. Yes, I move the list_del() outside of the cluster lock, is
> that what you are referring to as the locking change?

In the original implementation, ci->lock is held when changing ci->data
(which in fact holds the next index here).  Now you don't need ci->lock
for that.  This is a locking rule change; I suggest that you make it
explicit in the change log and comments.

>> > Change cluster code to use "struct swap_cluster_info *" to
>> > reference the cluster rather than by using index. That is more
>> > consistent with the list manipulation. It avoids the repeat
>> > adding index to the cluser_info. The code is easier to understand.
>> >
>> > Remove the cluster next pointer is NULL flag, the double link
>> > list can handle the empty list pretty well.
>> >
>> > The "swap_cluster_info" struct is two pointer bigger, because
>> > 512 swap entries share one swap struct, it has very little impact
>> > on the average memory usage per swap entry.  Other than the list
>> > conversion, there is no real function change in this patch.
>>
>> On 64bit platform, the size of swap_cluster_info increases from 8 bytes
>> to 24 bytes.  For a 1TB swap device, the memory usage will increase from
>> 4MB to 12MB.  This looks OK for me.
>
> Will add the size change calculation in V2 and have you review it again.
>
>>
>> Another choice is to use a customized double linked list using "unsigned
>> int" as pointer to cluster.  That will reduce the size of cluster to 16
>> bytes.  But it may be not necessary to do that.
>
> We can always do that as a follow up step to optimize the 24 byte to
> 16 byte, at the price of more code complicity.
> The trick part is the link list head, it is not part of the cluster
> array, it does not have an index, and will need a special handle for
> that.

In theory, you can define a "struct list_u32_head" and a set of
list_u32_* functions.  But I don't find that it's necessary to do that.

>>
>> Anyway, I think that it's better to add more calculation in changelog
>> for memory usage increment.
>
> Sure, I will adjust the commit message in V2.
>
> Chris
>
>>
>> > ---
>> >  include/linux/swap.h |  14 ++--
>> >  mm/swapfile.c        | 231 ++++++++++++++-------------------------------------
>> >  2 files changed, 68 insertions(+), 177 deletions(-)
>> >
>> > diff --git a/include/linux/swap.h b/include/linux/swap.h
>> > index 11c53692f65f..0d3906eff3c9 100644
>> > --- a/include/linux/swap.h
>> > +++ b/include/linux/swap.h
>> > @@ -254,11 +254,12 @@ struct swap_cluster_info {
>> >                                * elements correspond to the swap
>> >                                * cluster
>> >                                */
>> > -     unsigned int data:24;
>> > +     unsigned int count:16;
>> >       unsigned int flags:8;
>>
>> If we use 16bits and 8 bits in bit fields, why not just use u8 and u16
>> instead?
> Not sure about the

?

        u16 count;
        u8 flags;

>>
>> > +     struct list_head next;
>>
>> "next" isn't a good naming because prev pointer is in list_head too.
>> The common naming is "list".
>
> Sure, I can change it to "list".
>
>>
>> Need to revise comments for swap_cluster_info.lock and add the locking
>> rule comments for swap_cluster_info.next.
>
> Will do.
>
>>
>> >  };
>> >  #define CLUSTER_FLAG_FREE 1 /* This cluster is free */
>> > -#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
>> > +
>> >

[snip]

--
Best Regards,
Huang, Ying
diff mbox series

Patch

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 11c53692f65f..0d3906eff3c9 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -254,11 +254,12 @@  struct swap_cluster_info {
 				 * elements correspond to the swap
 				 * cluster
 				 */
-	unsigned int data:24;
+	unsigned int count:16;
 	unsigned int flags:8;
+	struct list_head next;
 };
 #define CLUSTER_FLAG_FREE 1 /* This cluster is free */
-#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
+
 
 /*
  * The first page in the swap file is the swap header, which is always marked
@@ -283,11 +284,6 @@  struct percpu_cluster {
 	unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
 };
 
-struct swap_cluster_list {
-	struct swap_cluster_info head;
-	struct swap_cluster_info tail;
-};
-
 /*
  * The in-memory structure used to track swap areas.
  */
@@ -300,7 +296,7 @@  struct swap_info_struct {
 	unsigned int	max;		/* extent of the swap_map */
 	unsigned char *swap_map;	/* vmalloc'ed array of usage counts */
 	struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
-	struct swap_cluster_list free_clusters; /* free clusters list */
+	struct list_head free_clusters; /* free clusters list */
 	unsigned int lowest_bit;	/* index of first free in swap_map */
 	unsigned int highest_bit;	/* index of last free in swap_map */
 	unsigned int pages;		/* total of usable pages of swap */
@@ -333,7 +329,7 @@  struct swap_info_struct {
 					 * list.
 					 */
 	struct work_struct discard_work; /* discard worker */
-	struct swap_cluster_list discard_clusters; /* discard clusters list */
+	struct list_head discard_clusters; /* discard clusters list */
 	struct plist_node avail_lists[]; /*
 					   * entries in swap_avail_heads, one
 					   * entry per node.
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4f0e8b2ac8aa..205a60c5f9cb 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -290,64 +290,11 @@  static void discard_swap_cluster(struct swap_info_struct *si,
 #endif
 #define LATENCY_LIMIT		256
 
-static inline void cluster_set_flag(struct swap_cluster_info *info,
-	unsigned int flag)
-{
-	info->flags = flag;
-}
-
-static inline unsigned int cluster_count(struct swap_cluster_info *info)
-{
-	return info->data;
-}
-
-static inline void cluster_set_count(struct swap_cluster_info *info,
-				     unsigned int c)
-{
-	info->data = c;
-}
-
-static inline void cluster_set_count_flag(struct swap_cluster_info *info,
-					 unsigned int c, unsigned int f)
-{
-	info->flags = f;
-	info->data = c;
-}
-
-static inline unsigned int cluster_next(struct swap_cluster_info *info)
-{
-	return info->data;
-}
-
-static inline void cluster_set_next(struct swap_cluster_info *info,
-				    unsigned int n)
-{
-	info->data = n;
-}
-
-static inline void cluster_set_next_flag(struct swap_cluster_info *info,
-					 unsigned int n, unsigned int f)
-{
-	info->flags = f;
-	info->data = n;
-}
-
 static inline bool cluster_is_free(struct swap_cluster_info *info)
 {
 	return info->flags & CLUSTER_FLAG_FREE;
 }
 
-static inline bool cluster_is_null(struct swap_cluster_info *info)
-{
-	return info->flags & CLUSTER_FLAG_NEXT_NULL;
-}
-
-static inline void cluster_set_null(struct swap_cluster_info *info)
-{
-	info->flags = CLUSTER_FLAG_NEXT_NULL;
-	info->data = 0;
-}
-
 static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
 						     unsigned long offset)
 {
@@ -394,65 +341,11 @@  static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
 		spin_unlock(&si->lock);
 }
 
-static inline bool cluster_list_empty(struct swap_cluster_list *list)
-{
-	return cluster_is_null(&list->head);
-}
-
-static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
-{
-	return cluster_next(&list->head);
-}
-
-static void cluster_list_init(struct swap_cluster_list *list)
-{
-	cluster_set_null(&list->head);
-	cluster_set_null(&list->tail);
-}
-
-static void cluster_list_add_tail(struct swap_cluster_list *list,
-				  struct swap_cluster_info *ci,
-				  unsigned int idx)
-{
-	if (cluster_list_empty(list)) {
-		cluster_set_next_flag(&list->head, idx, 0);
-		cluster_set_next_flag(&list->tail, idx, 0);
-	} else {
-		struct swap_cluster_info *ci_tail;
-		unsigned int tail = cluster_next(&list->tail);
-
-		/*
-		 * Nested cluster lock, but both cluster locks are
-		 * only acquired when we held swap_info_struct->lock
-		 */
-		ci_tail = ci + tail;
-		spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
-		cluster_set_next(ci_tail, idx);
-		spin_unlock(&ci_tail->lock);
-		cluster_set_next_flag(&list->tail, idx, 0);
-	}
-}
-
-static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
-					   struct swap_cluster_info *ci)
-{
-	unsigned int idx;
-
-	idx = cluster_next(&list->head);
-	if (cluster_next(&list->tail) == idx) {
-		cluster_set_null(&list->head);
-		cluster_set_null(&list->tail);
-	} else
-		cluster_set_next_flag(&list->head,
-				      cluster_next(&ci[idx]), 0);
-
-	return idx;
-}
-
 /* Add a cluster to discard list and schedule it to do discard */
 static void swap_cluster_schedule_discard(struct swap_info_struct *si,
-		unsigned int idx)
+		struct swap_cluster_info *ci)
 {
+	unsigned int idx = ci - si->cluster_info;
 	/*
 	 * If scan_swap_map_slots() can't find a free cluster, it will check
 	 * si->swap_map directly. To make sure the discarding cluster isn't
@@ -462,17 +355,16 @@  static void swap_cluster_schedule_discard(struct swap_info_struct *si,
 	memset(si->swap_map + idx * SWAPFILE_CLUSTER,
 			SWAP_MAP_BAD, SWAPFILE_CLUSTER);
 
-	cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
-
+	spin_lock_nested(&ci->lock, SINGLE_DEPTH_NESTING);
+	list_add_tail(&ci->next, &si->discard_clusters);
+	spin_unlock(&ci->lock);
 	schedule_work(&si->discard_work);
 }
 
-static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
+static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
 {
-	struct swap_cluster_info *ci = si->cluster_info;
-
-	cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
-	cluster_list_add_tail(&si->free_clusters, ci, idx);
+	ci->flags = CLUSTER_FLAG_FREE;
+	list_add_tail(&ci->next, &si->free_clusters);
 }
 
 /*
@@ -481,21 +373,21 @@  static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
 */
 static void swap_do_scheduled_discard(struct swap_info_struct *si)
 {
-	struct swap_cluster_info *info, *ci;
+	struct swap_cluster_info *ci;
 	unsigned int idx;
 
-	info = si->cluster_info;
-
-	while (!cluster_list_empty(&si->discard_clusters)) {
-		idx = cluster_list_del_first(&si->discard_clusters, info);
+	while (!list_empty(&si->discard_clusters)) {
+		ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, next);
+		idx = ci - si->cluster_info;
 		spin_unlock(&si->lock);
 
 		discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
 				SWAPFILE_CLUSTER);
 
 		spin_lock(&si->lock);
-		ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
-		__free_cluster(si, idx);
+
+		spin_lock(&ci->lock);
+		__free_cluster(si, ci);
 		memset(si->swap_map + idx * SWAPFILE_CLUSTER,
 				0, SWAPFILE_CLUSTER);
 		unlock_cluster(ci);
@@ -521,20 +413,20 @@  static void swap_users_ref_free(struct percpu_ref *ref)
 	complete(&si->comp);
 }
 
-static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
+static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsigned long idx)
 {
-	struct swap_cluster_info *ci = si->cluster_info;
+	struct swap_cluster_info *ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, next);
 
-	VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
-	cluster_list_del_first(&si->free_clusters, ci);
-	cluster_set_count_flag(ci + idx, 0, 0);
+	VM_BUG_ON(ci - si->cluster_info != idx);
+	list_del(&ci->next);
+	ci->count = 0;
+	ci->flags = 0;
+	return ci;
 }
 
-static void free_cluster(struct swap_info_struct *si, unsigned long idx)
+static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
 {
-	struct swap_cluster_info *ci = si->cluster_info + idx;
-
-	VM_BUG_ON(cluster_count(ci) != 0);
+	VM_BUG_ON(ci->count != 0);
 	/*
 	 * If the swap is discardable, prepare discard the cluster
 	 * instead of free it immediately. The cluster will be freed
@@ -542,11 +434,11 @@  static void free_cluster(struct swap_info_struct *si, unsigned long idx)
 	 */
 	if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
 	    (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
-		swap_cluster_schedule_discard(si, idx);
+		swap_cluster_schedule_discard(si, ci);
 		return;
 	}
 
-	__free_cluster(si, idx);
+	__free_cluster(si, ci);
 }
 
 /*
@@ -559,15 +451,15 @@  static void add_cluster_info_page(struct swap_info_struct *p,
 	unsigned long count)
 {
 	unsigned long idx = page_nr / SWAPFILE_CLUSTER;
+	struct swap_cluster_info *ci = cluster_info + idx;
 
 	if (!cluster_info)
 		return;
-	if (cluster_is_free(&cluster_info[idx]))
+	if (cluster_is_free(ci))
 		alloc_cluster(p, idx);
 
-	VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER);
-	cluster_set_count(&cluster_info[idx],
-		cluster_count(&cluster_info[idx]) + count);
+	VM_BUG_ON(ci->count + count > SWAPFILE_CLUSTER);
+	ci->count += count;
 }
 
 /*
@@ -581,24 +473,20 @@  static void inc_cluster_info_page(struct swap_info_struct *p,
 }
 
 /*
- * The cluster corresponding to page_nr decreases one usage. If the usage
- * counter becomes 0, which means no page in the cluster is in using, we can
- * optionally discard the cluster and add it to free cluster list.
+ * Drop one usage from the cluster ci. If the usage counter becomes 0,
+ * which means no page in the cluster is in use, we can optionally discard
+ * the cluster and add it to the free cluster list.
  */
-static void dec_cluster_info_page(struct swap_info_struct *p,
-	struct swap_cluster_info *cluster_info, unsigned long page_nr)
+static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *ci)
 {
-	unsigned long idx = page_nr / SWAPFILE_CLUSTER;
-
-	if (!cluster_info)
+	if (!p->cluster_info)
 		return;
 
-	VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
-	cluster_set_count(&cluster_info[idx],
-		cluster_count(&cluster_info[idx]) - 1);
+	VM_BUG_ON(ci->count == 0);
+	ci->count--;
 
-	if (cluster_count(&cluster_info[idx]) == 0)
-		free_cluster(p, idx);
+	if (!ci->count)
+		free_cluster(p, ci);
 }
 
 /*
@@ -611,10 +499,10 @@  scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
 {
 	struct percpu_cluster *percpu_cluster;
 	bool conflict;
-
+	struct swap_cluster_info *first = list_first_entry(&si->free_clusters, struct swap_cluster_info, next);
 	offset /= SWAPFILE_CLUSTER;
-	conflict = !cluster_list_empty(&si->free_clusters) &&
-		offset != cluster_list_first(&si->free_clusters) &&
+	conflict = !list_empty(&si->free_clusters) &&
+		offset !=  first - si->cluster_info &&
 		cluster_is_free(&si->cluster_info[offset]);
 
 	if (!conflict)
@@ -655,10 +543,14 @@  static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
 	cluster = this_cpu_ptr(si->percpu_cluster);
 	tmp = cluster->next[order];
 	if (tmp == SWAP_NEXT_INVALID) {
-		if (!cluster_list_empty(&si->free_clusters)) {
-			tmp = cluster_next(&si->free_clusters.head) *
-					SWAPFILE_CLUSTER;
-		} else if (!cluster_list_empty(&si->discard_clusters)) {
+		if (!list_empty(&si->free_clusters)) {
+			ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, next);
+			list_del(&ci->next);
+			spin_lock(&ci->lock);
+			ci->flags = 0;
+			spin_unlock(&ci->lock);
+			tmp = (ci - si->cluster_info) * SWAPFILE_CLUSTER;
+		} else if (!list_empty(&si->discard_clusters)) {
 			/*
 			 * we don't have free cluster but have some clusters in
 			 * discarding, do discard now and reclaim them, then
@@ -670,7 +562,8 @@  static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
 			goto new_cluster;
 		} else
 			return false;
-	}
+	} else
+		ci = si->cluster_info + tmp;
 
 	/*
 	 * Other CPUs can use our cluster if they can't find a free cluster,
@@ -1062,8 +955,9 @@  static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
 
 	ci = lock_cluster(si, offset);
 	memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
-	cluster_set_count_flag(ci, 0, 0);
-	free_cluster(si, idx);
+	ci->count = 0;
+	ci->flags = 0;
+	free_cluster(si, ci);
 	unlock_cluster(ci);
 	swap_range_free(si, offset, SWAPFILE_CLUSTER);
 }
@@ -1336,7 +1230,7 @@  static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
 	count = p->swap_map[offset];
 	VM_BUG_ON(count != SWAP_HAS_CACHE);
 	p->swap_map[offset] = 0;
-	dec_cluster_info_page(p, p->cluster_info, offset);
+	dec_cluster_info_page(p, ci);
 	unlock_cluster(ci);
 
 	mem_cgroup_uncharge_swap(entry, 1);
@@ -2985,8 +2879,8 @@  static int setup_swap_map_and_extents(struct swap_info_struct *p,
 
 	nr_good_pages = maxpages - 1;	/* omit header page */
 
-	cluster_list_init(&p->free_clusters);
-	cluster_list_init(&p->discard_clusters);
+	INIT_LIST_HEAD(&p->free_clusters);
+	INIT_LIST_HEAD(&p->discard_clusters);
 
 	for (i = 0; i < swap_header->info.nr_badpages; i++) {
 		unsigned int page_nr = swap_header->info.badpages[i];
@@ -3037,14 +2931,15 @@  static int setup_swap_map_and_extents(struct swap_info_struct *p,
 	for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
 		j = (k + col) % SWAP_CLUSTER_COLS;
 		for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
+			struct swap_cluster_info *ci;
 			idx = i * SWAP_CLUSTER_COLS + j;
+			ci = cluster_info + idx;
 			if (idx >= nr_clusters)
 				continue;
-			if (cluster_count(&cluster_info[idx]))
+			if (ci->count)
 				continue;
-			cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
-			cluster_list_add_tail(&p->free_clusters, cluster_info,
-					      idx);
+			ci->flags = CLUSTER_FLAG_FREE;
+			list_add_tail(&ci->next, &p->free_clusters);
 		}
 	}
 	return nr_extents;