diff mbox series

[2/2] mm: swap: mTHP allocate swap entries from nonfull list

Message ID 20240524-swap-allocator-v1-2-47861b423b26@kernel.org (mailing list archive)
State New
Headers show
Series mm: swap: mTHP swap allocator base on swap cluster order | expand

Commit Message

Chris Li May 24, 2024, 5:17 p.m. UTC
Track the nonfull cluster as well as the empty cluster
on lists. Each order has one nonfull cluster list.

The cluster will remember which order it was used during
new cluster allocation.

When the cluster has a free entry, add it to the nonfull[order]
list.  When the free cluster list is empty, also allocate
from the nonfull list of that order.

This improves the mTHP swap allocation success rate.

There are limitations if the distribution of numbers of
different orders of mTHP changes a lot. e.g. there are a lot
of nonfull clusters assigned to order A, while at a later time
there are a lot of order B allocations but very little allocation
in order A. Currently the clusters used by order A will not be
reused by order B unless the cluster is 100% empty.

This situation is best addressed by the longer term "swap
buddy allocator", in future patches.
---
 include/linux/swap.h |  4 ++++
 mm/swapfile.c        | 25 +++++++++++++++++++++++--
 2 files changed, 27 insertions(+), 2 deletions(-)

Comments

Ryan Roberts June 7, 2024, 10:35 a.m. UTC | #1
On 24/05/2024 18:17, Chris Li wrote:
> Track the nonfull cluster as well as the empty cluster
> on lists. Each order has one nonfull cluster list.
> 
> The cluster will remember which order it was used during
> new cluster allocation.
> 
> When the cluster has free entry, add to the nonfull[order]
> list.  When the free cluster list is empty, also allocate
> from the nonempty list of that order.
> 
> This improves the mTHP swap allocation success rate.

If I've understood correctly, the aim here is to link all the current per-cpu
clusters for a given order together so that if a cpu can't allocate a new
cluster for a given order, then it can steal another CPU's current cluster for
that order?

If that's the intent, couldn't that be done just by iterating over the per-cpu,
per-order cluster pointers? Then you don't need all the linked list churn
(although I like the linked list changes as a nice cleanup, I'm not sure the
churn is necessary for this change?). There would likely need to be some
locking considerations, but it would also allow you to get access to the next
entry within the cluster for allocation.

However, fundamentally, I don't think this change solves the problem; it just
takes a bit longer before the allocation fails. The real problem is
fragmentation due to freeing individual pages from swap entries at different times.

Wouldn't it be better to just extend scanning to support high order allocations?
Then we can steal a high order block from any cluster, even clusters that were
previously full, just like we currently do for order-0. Given we are already
falling back to this path for order-0, I don't think it would be any more
expensive; in fact it's less expensive because we only scan once for the high
order block, rather than scan for every split order-0 page.

Of course that still doesn't solve the problem entirely; if swap is so
fragmented that there is no contiguous block of the required order then you
still have to fall back to splitting. As an extra optimization, you could store
the largest contiguous free space available in each cluster to avoid scanning in
case it's too small?


> 
> There are limitations if the distribution of numbers of
> different orders of mTHP changes a lot. e.g. there are a lot
> of nonfull cluster assign to order A while later time there
> are a lot of order B allocation while very little allocation
> in order A. Currently the cluster used by order A will not
> reused by order B unless the cluster is 100% empty.
> 
> This situation is best addressed by the longer term "swap
> buddy allocator", in future patches.
> ---
>  include/linux/swap.h |  4 ++++
>  mm/swapfile.c        | 25 +++++++++++++++++++++++--
>  2 files changed, 27 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 0d3906eff3c9..1b7f0794b9bf 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -255,10 +255,12 @@ struct swap_cluster_info {
>  				 * cluster
>  				 */
>  	unsigned int count:16;
> +	unsigned int order:8;
>  	unsigned int flags:8;
>  	struct list_head next;
>  };
>  #define CLUSTER_FLAG_FREE 1 /* This cluster is free */
> +#define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */
>  
>  
>  /*
> @@ -297,6 +299,8 @@ struct swap_info_struct {
>  	unsigned char *swap_map;	/* vmalloc'ed array of usage counts */
>  	struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
>  	struct list_head free_clusters; /* free clusters list */
> +	struct list_head nonfull_clusters[SWAP_NR_ORDERS];
> +					/* list of cluster that contains at least one free slot */
>  	unsigned int lowest_bit;	/* index of first free in swap_map */
>  	unsigned int highest_bit;	/* index of last free in swap_map */
>  	unsigned int pages;		/* total of usable pages of swap */
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 205a60c5f9cb..51923aba500e 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -363,8 +363,11 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
>  
>  static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
>  {
> +	if (ci->flags & CLUSTER_FLAG_NONFULL)
> +		list_move_tail(&ci->next, &si->free_clusters);
> +	else
> +		list_add_tail(&ci->next, &si->free_clusters);
>  	ci->flags = CLUSTER_FLAG_FREE;
> -	list_add_tail(&ci->next, &si->free_clusters);
>  }
>  
>  /*
> @@ -486,7 +489,12 @@ static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluste
>  	ci->count--;
>  
>  	if (!ci->count)
> -		free_cluster(p, ci);
> +		return free_cluster(p, ci);
> +
> +	if (!(ci->flags & CLUSTER_FLAG_NONFULL)) {
> +		list_add_tail(&ci->next, &p->nonfull_clusters[ci->order]);
> +		ci->flags |= CLUSTER_FLAG_NONFULL;
> +	}
>  }
>  
>  /*
> @@ -547,6 +555,14 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
>  			ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, next);
>  			list_del(&ci->next);
>  			spin_lock(&ci->lock);
> +			ci->order = order;
> +			ci->flags = 0;
> +			spin_unlock(&ci->lock);
> +			tmp = (ci - si->cluster_info) * SWAPFILE_CLUSTER;
> +		} else if (!list_empty(&si->nonfull_clusters[order])) {
> +			ci = list_first_entry(&si->nonfull_clusters[order], struct swap_cluster_info, next);
> +			list_del(&ci->next);
> +			spin_lock(&ci->lock);
>  			ci->flags = 0;
>  			spin_unlock(&ci->lock);
>  			tmp = (ci - si->cluster_info) * SWAPFILE_CLUSTER;

This looks wrong to me; if the cluster is on the nonfull list then it will have
had some entries already allocated (by another cpu). So pointing tmp to the
first block in the cluster will never yield a free block. The cpu from which you
are stealing the cluster stores the next free block location in its per-cpu
structure. So perhaps iterating over the other cpu's `struct percpu_cluster`s is
a better approach than the nonfull list?

Additionally, this cluster will be stored back to this cpu's current cluster at
the bottom of the function. That may or may not be what you intended.

> @@ -578,6 +594,7 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
>  				break;
>  			tmp += nr_pages;
>  		}
> +		WARN_ONCE(ci->order != order, "expecting order %d got %d", order, ci->order);
>  		unlock_cluster(ci);
>  	}
>  	if (tmp >= max) {
> @@ -956,6 +973,7 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
>  	ci = lock_cluster(si, offset);
>  	memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
>  	ci->count = 0;
> +	ci->order = 0;
>  	ci->flags = 0;
>  	free_cluster(si, ci);
>  	unlock_cluster(ci);
> @@ -2882,6 +2900,9 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
>  	INIT_LIST_HEAD(&p->free_clusters);
>  	INIT_LIST_HEAD(&p->discard_clusters);
>  
> +	for (i = 0; i < SWAP_NR_ORDERS; i++)
> +		INIT_LIST_HEAD(&p->nonfull_clusters[i]);
> +
>  	for (i = 0; i < swap_header->info.nr_badpages; i++) {
>  		unsigned int page_nr = swap_header->info.badpages[i];
>  		if (page_nr == 0 || page_nr > swap_header->info.last_page)
>
Ryan Roberts June 7, 2024, 10:57 a.m. UTC | #2
On 07/06/2024 11:35, Ryan Roberts wrote:
> On 24/05/2024 18:17, Chris Li wrote:
>> Track the nonfull cluster as well as the empty cluster
>> on lists. Each order has one nonfull cluster list.
>>
>> The cluster will remember which order it was used during
>> new cluster allocation.
>>
>> When the cluster has free entry, add to the nonfull[order]
>> list.  When the free cluster list is empty, also allocate
>> from the nonempty list of that order.
>>
>> This improves the mTHP swap allocation success rate.
> 
> If I've understood correctly, the aim here is to link all the current per-cpu
> clusters for a given order together so that if a cpu can't allocate a new
> cluster for a given order, then it can steal another CPU's current cluster for
> that order?
> 
> If that's the intent, couldn't that be done just by iterating over the per-cpu,
> per-order cluster pointers? Then you don't need all the linked list churn
> (althogh I like the linked list changes as a nice cleanup, I'm not sure the
> churn is neccessary for this change?). There would likely need to be some
> locking considerations, but it would also allow you to get access to the next
> entry within the cluster for allocation.
> 
> However, fundamentally, I don't think this change solves the problem; it just
> takes a bit longer before the allocation fails. The real problem is
> fragmentation due to freeing individual pages from swap entries at different times.
> 
> Wouldn't it be better to just extend scanning to support high order allocations?
> Then we can steal a high order block from any cluster, even clusters that were
> previously full, just like we currently do for order-0. Given we are already
> falling back to this path for order-0, I don't think it would be any more
> expensive; infact its less expensive because we only scan once for the high
> order block, rather than scan for every split order-0 page.
> 
> Of course that still doesn't solve the proplem entirely; if swap is so
> fragmented that there is no contiguous block of the required order then you
> still have to fall back to splitting. As an extra optimization, you could store
> the largest contiguous free space available in each cluster to avoid scanning in
> case its too small?
> 
> 
>>
>> There are limitations if the distribution of numbers of
>> different orders of mTHP changes a lot. e.g. there are a lot
>> of nonfull cluster assign to order A while later time there
>> are a lot of order B allocation while very little allocation
>> in order A. Currently the cluster used by order A will not
>> reused by order B unless the cluster is 100% empty.
>>
>> This situation is best addressed by the longer term "swap
>> buddy allocator", in future patches.
>> ---
>>  include/linux/swap.h |  4 ++++
>>  mm/swapfile.c        | 25 +++++++++++++++++++++++--
>>  2 files changed, 27 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> index 0d3906eff3c9..1b7f0794b9bf 100644
>> --- a/include/linux/swap.h
>> +++ b/include/linux/swap.h
>> @@ -255,10 +255,12 @@ struct swap_cluster_info {
>>  				 * cluster
>>  				 */
>>  	unsigned int count:16;
>> +	unsigned int order:8;
>>  	unsigned int flags:8;
>>  	struct list_head next;
>>  };
>>  #define CLUSTER_FLAG_FREE 1 /* This cluster is free */
>> +#define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */
>>  
>>  
>>  /*
>> @@ -297,6 +299,8 @@ struct swap_info_struct {
>>  	unsigned char *swap_map;	/* vmalloc'ed array of usage counts */
>>  	struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
>>  	struct list_head free_clusters; /* free clusters list */
>> +	struct list_head nonfull_clusters[SWAP_NR_ORDERS];
>> +					/* list of cluster that contains at least one free slot */
>>  	unsigned int lowest_bit;	/* index of first free in swap_map */
>>  	unsigned int highest_bit;	/* index of last free in swap_map */
>>  	unsigned int pages;		/* total of usable pages of swap */
>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>> index 205a60c5f9cb..51923aba500e 100644
>> --- a/mm/swapfile.c
>> +++ b/mm/swapfile.c
>> @@ -363,8 +363,11 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
>>  
>>  static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
>>  {
>> +	if (ci->flags & CLUSTER_FLAG_NONFULL)
>> +		list_move_tail(&ci->next, &si->free_clusters);
>> +	else
>> +		list_add_tail(&ci->next, &si->free_clusters);
>>  	ci->flags = CLUSTER_FLAG_FREE;
>> -	list_add_tail(&ci->next, &si->free_clusters);
>>  }
>>  
>>  /*
>> @@ -486,7 +489,12 @@ static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluste
>>  	ci->count--;
>>  
>>  	if (!ci->count)
>> -		free_cluster(p, ci);
>> +		return free_cluster(p, ci);
>> +
>> +	if (!(ci->flags & CLUSTER_FLAG_NONFULL)) {
>> +		list_add_tail(&ci->next, &p->nonfull_clusters[ci->order]);
>> +		ci->flags |= CLUSTER_FLAG_NONFULL;
>> +	}
>>  }
>>  
>>  /*
>> @@ -547,6 +555,14 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
>>  			ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, next);
>>  			list_del(&ci->next);
>>  			spin_lock(&ci->lock);
>> +			ci->order = order;
>> +			ci->flags = 0;
>> +			spin_unlock(&ci->lock);
>> +			tmp = (ci - si->cluster_info) * SWAPFILE_CLUSTER;
>> +		} else if (!list_empty(&si->nonfull_clusters[order])) {
>> +			ci = list_first_entry(&si->nonfull_clusters[order], struct swap_cluster_info, next);
>> +			list_del(&ci->next);
>> +			spin_lock(&ci->lock);
>>  			ci->flags = 0;
>>  			spin_unlock(&ci->lock);
>>  			tmp = (ci - si->cluster_info) * SWAPFILE_CLUSTER;
> 
> This looks wrong to me; if the cluster is on the nonfull list then it will have
> had some entries already allocated (by another cpu). So pointing tmp to the
> first block in the cluster will never yield a free block. The cpu from which you
> are stealing the cluster stores the next free block location in its per-cpu
> structure. So perhaps iterating over the other cpu's `struct percpu_cluster`s is
> a better approach than the nonfull list?

Ahh; of course the cluster scan below will move this along to a free block.

> 
> Additionally, this cluster will be stored back to this cpu's current cluster at
> the bottom of the function. That may or may not be what you intended.
> 
>> @@ -578,6 +594,7 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
>>  				break;
>>  			tmp += nr_pages;
>>  		}
>> +		WARN_ONCE(ci->order != order, "expecting order %d got %d", order, ci->order);
>>  		unlock_cluster(ci);
>>  	}
>>  	if (tmp >= max) {
>> @@ -956,6 +973,7 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
>>  	ci = lock_cluster(si, offset);
>>  	memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
>>  	ci->count = 0;
>> +	ci->order = 0;
>>  	ci->flags = 0;
>>  	free_cluster(si, ci);
>>  	unlock_cluster(ci);
>> @@ -2882,6 +2900,9 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
>>  	INIT_LIST_HEAD(&p->free_clusters);
>>  	INIT_LIST_HEAD(&p->discard_clusters);
>>  
>> +	for (i = 0; i < SWAP_NR_ORDERS; i++)
>> +		INIT_LIST_HEAD(&p->nonfull_clusters[i]);
>> +
>>  	for (i = 0; i < swap_header->info.nr_badpages; i++) {
>>  		unsigned int page_nr = swap_header->info.badpages[i];
>>  		if (page_nr == 0 || page_nr > swap_header->info.last_page)
>>
>
Chris Li June 7, 2024, 8:52 p.m. UTC | #3
On Fri, Jun 7, 2024 at 3:35 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> On 24/05/2024 18:17, Chris Li wrote:
> > Track the nonfull cluster as well as the empty cluster
> > on lists. Each order has one nonfull cluster list.
> >
> > The cluster will remember which order it was used during
> > new cluster allocation.
> >
> > When the cluster has free entry, add to the nonfull[order]
> > list.  When the free cluster list is empty, also allocate
> > from the nonempty list of that order.
> >
> > This improves the mTHP swap allocation success rate.
>
> If I've understood correctly, the aim here is to link all the current per-cpu
> clusters for a given order together so that if a cpu can't allocate a new
> cluster for a given order, then it can steal another CPU's current cluster for
> that order?

Stealing another CPU's *current* cluster is not the intent. The intent
is: after all per-cpu allocators are done with this cluster (it is
full), those full clusters are not tracked by any per-cpu struct. When
those full clusters become non-full, track them in the global nonfull
cluster list. The per-cpu allocation can take a cluster from that
nonfull cluster list and start allocating from it.

The V1 code does not specifically check for the stealing behavior, the
V2 code will prevent that from happening. Basically each cluster has 4
states and owners:
1) empty, owned by a global free cluster list.
2) per cpu allocating. owned by per CPU current.
3) nonfull (also non empty). own global nonfull list.
4) full, currently not tracked, we can track it under global full list.

When the per cpu runs out of free cluster, it can take a cluster from
3) and move it to 2).

>
> If that's the intent, couldn't that be done just by iterating over the per-cpu,
> per-order cluster pointers? Then you don't need all the linked list churn

Again, that is not the intent.

> (althogh I like the linked list changes as a nice cleanup, I'm not sure the
> churn is neccessary for this change?). There would likely need to be some
> locking considerations, but it would also allow you to get access to the next
> entry within the cluster for allocation.
>
> However, fundamentally, I don't think this change solves the problem; it just
> takes a bit longer before the allocation fails. The real problem is
> fragmentation due to freeing individual pages from swap entries at different times.

It definitely helps to find nonfull clusters quicker. Please take a
look at my above comment and read the patch again.

>
> Wouldn't it be better to just extend scanning to support high order allocations?
> Then we can steal a high order block from any cluster, even clusters that were

Stealing from a higher order cluster makes the higher order harder to
allocate; that is the downside.
In my mind, ideally have some high order cluster reservation scheme so
the high order one doesn't mix with the low order one.

> previously full, just like we currently do for order-0. Given we are already
> falling back to this path for order-0, I don't think it would be any more
> expensive; infact its less expensive because we only scan once for the high
> order block, rather than scan for every split order-0 page.
>
> Of course that still doesn't solve the proplem entirely; if swap is so
> fragmented that there is no contiguous block of the required order then you
> still have to fall back to splitting. As an extra optimization, you could store

Exactly. That is why I think some high order cluster reservation
scheme is needed for a short term solution.
The change itself is not too complicated if we can agree on this approach.

> the largest contiguous free space available in each cluster to avoid scanning in
> case its too small?

Avoiding the scanning just gets to the "no high order available" result quicker.
Does not seem to help increase the high order allocation success rate.

>
>
> >
> > There are limitations if the distribution of numbers of
> > different orders of mTHP changes a lot. e.g. there are a lot
> > of nonfull cluster assign to order A while later time there
> > are a lot of order B allocation while very little allocation
> > in order A. Currently the cluster used by order A will not
> > reused by order B unless the cluster is 100% empty.
> >
> > This situation is best addressed by the longer term "swap
> > buddy allocator", in future patches.
> > ---
> >  include/linux/swap.h |  4 ++++
> >  mm/swapfile.c        | 25 +++++++++++++++++++++++--
> >  2 files changed, 27 insertions(+), 2 deletions(-)
> >
> > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > index 0d3906eff3c9..1b7f0794b9bf 100644
> > --- a/include/linux/swap.h
> > +++ b/include/linux/swap.h
> > @@ -255,10 +255,12 @@ struct swap_cluster_info {
> >                                * cluster
> >                                */
> >       unsigned int count:16;
> > +     unsigned int order:8;
> >       unsigned int flags:8;
> >       struct list_head next;
> >  };
> >  #define CLUSTER_FLAG_FREE 1 /* This cluster is free */
> > +#define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */
> >
> >
> >  /*
> > @@ -297,6 +299,8 @@ struct swap_info_struct {
> >       unsigned char *swap_map;        /* vmalloc'ed array of usage counts */
> >       struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
> >       struct list_head free_clusters; /* free clusters list */
> > +     struct list_head nonfull_clusters[SWAP_NR_ORDERS];
> > +                                     /* list of cluster that contains at least one free slot */
> >       unsigned int lowest_bit;        /* index of first free in swap_map */
> >       unsigned int highest_bit;       /* index of last free in swap_map */
> >       unsigned int pages;             /* total of usable pages of swap */
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index 205a60c5f9cb..51923aba500e 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -363,8 +363,11 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
> >
> >  static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
> >  {
> > +     if (ci->flags & CLUSTER_FLAG_NONFULL)
> > +             list_move_tail(&ci->next, &si->free_clusters);
> > +     else
> > +             list_add_tail(&ci->next, &si->free_clusters);
> >       ci->flags = CLUSTER_FLAG_FREE;
> > -     list_add_tail(&ci->next, &si->free_clusters);
> >  }
> >
> >  /*
> > @@ -486,7 +489,12 @@ static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluste
> >       ci->count--;
> >
> >       if (!ci->count)
> > -             free_cluster(p, ci);
> > +             return free_cluster(p, ci);
> > +
> > +     if (!(ci->flags & CLUSTER_FLAG_NONFULL)) {
> > +             list_add_tail(&ci->next, &p->nonfull_clusters[ci->order]);
> > +             ci->flags |= CLUSTER_FLAG_NONFULL;
> > +     }
> >  }
> >
> >  /*
> > @@ -547,6 +555,14 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
> >                       ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, next);
> >                       list_del(&ci->next);
> >                       spin_lock(&ci->lock);
> > +                     ci->order = order;
> > +                     ci->flags = 0;
> > +                     spin_unlock(&ci->lock);
> > +                     tmp = (ci - si->cluster_info) * SWAPFILE_CLUSTER;
> > +             } else if (!list_empty(&si->nonfull_clusters[order])) {
> > +                     ci = list_first_entry(&si->nonfull_clusters[order], struct swap_cluster_info, next);
> > +                     list_del(&ci->next);
> > +                     spin_lock(&ci->lock);
> >                       ci->flags = 0;
> >                       spin_unlock(&ci->lock);
> >                       tmp = (ci - si->cluster_info) * SWAPFILE_CLUSTER;
>
> This looks wrong to me; if the cluster is on the nonfull list then it will have
> had some entries already allocated (by another cpu). So pointing tmp to the
> first block in the cluster will never yield a free block. The cpu from which you

I believe it will scan until it finds a free block with alignment down
in the offset < max loop.

while (offset < max) {
    if (swap_range_empty(si->swap_map, offset, nr_pages))
        break;
    offset += nr_pages;
}

> are stealing the cluster stores the next free block location in its per-cpu
> structure. So perhaps iterating over the other cpu's `struct percpu_cluster`s is
> a better approach than the nonfull list?

No, stealing is not the intent. The intent is to quickly find a
nonfull cluster that is NOT in use by another per-CPU allocation.

>
> Additionally, this cluster will be stored back to this cpu's current cluster at
> the bottom of the function. That may or may not be what you intended.

That is what I intended. It remembers the current allocating cluster,
in case this cluster has more than one high order swap entries.

Chris

>
> > @@ -578,6 +594,7 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
> >                               break;
> >                       tmp += nr_pages;
> >               }
> > +             WARN_ONCE(ci->order != order, "expecting order %d got %d", order, ci->order);
> >               unlock_cluster(ci);
> >       }
> >       if (tmp >= max) {
> > @@ -956,6 +973,7 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
> >       ci = lock_cluster(si, offset);
> >       memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
> >       ci->count = 0;
> > +     ci->order = 0;
> >       ci->flags = 0;
> >       free_cluster(si, ci);
> >       unlock_cluster(ci);
> > @@ -2882,6 +2900,9 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
> >       INIT_LIST_HEAD(&p->free_clusters);
> >       INIT_LIST_HEAD(&p->discard_clusters);
> >
> > +     for (i = 0; i < SWAP_NR_ORDERS; i++)
> > +             INIT_LIST_HEAD(&p->nonfull_clusters[i]);
> > +
> >       for (i = 0; i < swap_header->info.nr_badpages; i++) {
> >               unsigned int page_nr = swap_header->info.badpages[i];
> >               if (page_nr == 0 || page_nr > swap_header->info.last_page)
> >
>
Chris Li June 7, 2024, 8:53 p.m. UTC | #4
On Fri, Jun 7, 2024 at 3:57 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> On 07/06/2024 11:35, Ryan Roberts wrote:
> > On 24/05/2024 18:17, Chris Li wrote:
> >> Track the nonfull cluster as well as the empty cluster
> >> on lists. Each order has one nonfull cluster list.
> >>
> >> The cluster will remember which order it was used during
> >> new cluster allocation.
> >>
> >> When the cluster has free entry, add to the nonfull[order]
> >> list.  When the free cluster list is empty, also allocate
> >> from the nonempty list of that order.
> >>
> >> This improves the mTHP swap allocation success rate.
> >
> > If I've understood correctly, the aim here is to link all the current per-cpu
> > clusters for a given order together so that if a cpu can't allocate a new
> > cluster for a given order, then it can steal another CPU's current cluster for
> > that order?
> >
> > If that's the intent, couldn't that be done just by iterating over the per-cpu,
> > per-order cluster pointers? Then you don't need all the linked list churn
> > (althogh I like the linked list changes as a nice cleanup, I'm not sure the
> > churn is neccessary for this change?). There would likely need to be some
> > locking considerations, but it would also allow you to get access to the next
> > entry within the cluster for allocation.
> >
> > However, fundamentally, I don't think this change solves the problem; it just
> > takes a bit longer before the allocation fails. The real problem is
> > fragmentation due to freeing individual pages from swap entries at different times.
> >
> > Wouldn't it be better to just extend scanning to support high order allocations?
> > Then we can steal a high order block from any cluster, even clusters that were
> > previously full, just like we currently do for order-0. Given we are already
> > falling back to this path for order-0, I don't think it would be any more
> > expensive; infact its less expensive because we only scan once for the high
> > order block, rather than scan for every split order-0 page.
> >
> > Of course that still doesn't solve the proplem entirely; if swap is so
> > fragmented that there is no contiguous block of the required order then you
> > still have to fall back to splitting. As an extra optimization, you could store
> > the largest contiguous free space available in each cluster to avoid scanning in
> > case its too small?
> >
> >
> >>
> >> There are limitations if the distribution of numbers of
> >> different orders of mTHP changes a lot. e.g. there are a lot
> >> of nonfull cluster assign to order A while later time there
> >> are a lot of order B allocation while very little allocation
> >> in order A. Currently the cluster used by order A will not
> >> reused by order B unless the cluster is 100% empty.
> >>
> >> This situation is best addressed by the longer term "swap
> >> buddy allocator", in future patches.
> >> ---
> >>  include/linux/swap.h |  4 ++++
> >>  mm/swapfile.c        | 25 +++++++++++++++++++++++--
> >>  2 files changed, 27 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/include/linux/swap.h b/include/linux/swap.h
> >> index 0d3906eff3c9..1b7f0794b9bf 100644
> >> --- a/include/linux/swap.h
> >> +++ b/include/linux/swap.h
> >> @@ -255,10 +255,12 @@ struct swap_cluster_info {
> >>                               * cluster
> >>                               */
> >>      unsigned int count:16;
> >> +    unsigned int order:8;
> >>      unsigned int flags:8;
> >>      struct list_head next;
> >>  };
> >>  #define CLUSTER_FLAG_FREE 1 /* This cluster is free */
> >> +#define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */
> >>
> >>
> >>  /*
> >> @@ -297,6 +299,8 @@ struct swap_info_struct {
> >>      unsigned char *swap_map;        /* vmalloc'ed array of usage counts */
> >>      struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
> >>      struct list_head free_clusters; /* free clusters list */
> >> +    struct list_head nonfull_clusters[SWAP_NR_ORDERS];
> >> +                                    /* list of cluster that contains at least one free slot */
> >>      unsigned int lowest_bit;        /* index of first free in swap_map */
> >>      unsigned int highest_bit;       /* index of last free in swap_map */
> >>      unsigned int pages;             /* total of usable pages of swap */
> >> diff --git a/mm/swapfile.c b/mm/swapfile.c
> >> index 205a60c5f9cb..51923aba500e 100644
> >> --- a/mm/swapfile.c
> >> +++ b/mm/swapfile.c
> >> @@ -363,8 +363,11 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
> >>
> >>  static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
> >>  {
> >> +    if (ci->flags & CLUSTER_FLAG_NONFULL)
> >> +            list_move_tail(&ci->next, &si->free_clusters);
> >> +    else
> >> +            list_add_tail(&ci->next, &si->free_clusters);
> >>      ci->flags = CLUSTER_FLAG_FREE;
> >> -    list_add_tail(&ci->next, &si->free_clusters);
> >>  }
> >>
> >>  /*
> >> @@ -486,7 +489,12 @@ static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluste
> >>      ci->count--;
> >>
> >>      if (!ci->count)
> >> -            free_cluster(p, ci);
> >> +            return free_cluster(p, ci);
> >> +
> >> +    if (!(ci->flags & CLUSTER_FLAG_NONFULL)) {
> >> +            list_add_tail(&ci->next, &p->nonfull_clusters[ci->order]);
> >> +            ci->flags |= CLUSTER_FLAG_NONFULL;
> >> +    }
> >>  }
> >>
> >>  /*
> >> @@ -547,6 +555,14 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
> >>                      ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, next);
> >>                      list_del(&ci->next);
> >>                      spin_lock(&ci->lock);
> >> +                    ci->order = order;
> >> +                    ci->flags = 0;
> >> +                    spin_unlock(&ci->lock);
> >> +                    tmp = (ci - si->cluster_info) * SWAPFILE_CLUSTER;
> >> +            } else if (!list_empty(&si->nonfull_clusters[order])) {
> >> +                    ci = list_first_entry(&si->nonfull_clusters[order], struct swap_cluster_info, next);
> >> +                    list_del(&ci->next);
> >> +                    spin_lock(&ci->lock);
> >>                      ci->flags = 0;
> >>                      spin_unlock(&ci->lock);
> >>                      tmp = (ci - si->cluster_info) * SWAPFILE_CLUSTER;
> >
> > This looks wrong to me; if the cluster is on the nonfull list then it will have
> > had some entries already allocated (by another cpu). So pointing tmp to the
> > first block in the cluster will never yield a free block. The cpu from which you
> > are stealing the cluster stores the next free block location in its per-cpu
> > structure. So perhaps iterating over the other cpu's `struct percpu_cluster`s is
> > a better approach than the nonfull list?
>
> Ahh; of course the cluster scan below will move this along to a free block.

You mean the (offset < max) loop, right?

Agree.

Chris

>
> >
> > Additionally, this cluster will be stored back to this cpu's current cluster at
> > the bottom of the function. That may or may not be what you intended.
> >
> >> @@ -578,6 +594,7 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
> >>                              break;
> >>                      tmp += nr_pages;
> >>              }
> >> +            WARN_ONCE(ci->order != order, "expecting order %d got %d", order, ci->order);
> >>              unlock_cluster(ci);
> >>      }
> >>      if (tmp >= max) {
> >> @@ -956,6 +973,7 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
> >>      ci = lock_cluster(si, offset);
> >>      memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
> >>      ci->count = 0;
> >> +    ci->order = 0;
> >>      ci->flags = 0;
> >>      free_cluster(si, ci);
> >>      unlock_cluster(ci);
> >> @@ -2882,6 +2900,9 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
> >>      INIT_LIST_HEAD(&p->free_clusters);
> >>      INIT_LIST_HEAD(&p->discard_clusters);
> >>
> >> +    for (i = 0; i < SWAP_NR_ORDERS; i++)
> >> +            INIT_LIST_HEAD(&p->nonfull_clusters[i]);
> >> +
> >>      for (i = 0; i < swap_header->info.nr_badpages; i++) {
> >>              unsigned int page_nr = swap_header->info.badpages[i];
> >>              if (page_nr == 0 || page_nr > swap_header->info.last_page)
> >>
> >
>
Ryan Roberts June 10, 2024, 11:18 a.m. UTC | #5
On 07/06/2024 21:52, Chris Li wrote:
> On Fri, Jun 7, 2024 at 3:35 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>
>> On 24/05/2024 18:17, Chris Li wrote:
>>> Track the nonfull cluster as well as the empty cluster
>>> on lists. Each order has one nonfull cluster list.
>>>
>>> The cluster will remember which order it was used during
>>> new cluster allocation.
>>>
>>> When the cluster has free entry, add to the nonfull[order]
>>> list.  When the free cluster list is empty, also allocate
>>> from the nonempty list of that order.
>>>
>>> This improves the mTHP swap allocation success rate.
>>
>> If I've understood correctly, the aim here is to link all the current per-cpu
>> clusters for a given order together so that if a cpu can't allocate a new
>> cluster for a given order, then it can steal another CPU's current cluster for
>> that order?
> 
> Stealing other CPU's *current* cluster is not the intent. The intent
> is after all current per-cpu done with this cluster(full), those full
> clusters are not tracked by any per-cpu struct. When those full
> clusters become non-full, track them in the global nonfull cluster list.
> The per-cpu allocation can take a cluster from that nonfull cluster
> list and start allocating from it.
> 
> The V1 code does not specifically check for the stealing behavior, the
> V2 code will prevent that from happening. Basically each cluster has 4
> states and owners:
> 1) empty, owned by a global free cluster list.
> 2) per cpu allocating. owned by per CPU current.
> 3) nonfull (also non empty). own global nonfull list.
> 4) full, currently not tracked, we can track it under global full list.
> 
> When the per cpu runs out of free cluster, it can take a cluster from
> 3) and move it to 2).

OK, sorry for my misunderstanding, and thanks for the explanation. I've taken a
proper look at the patch and with this explanation now understand the intent.

I guess in effect, this is a scanning approach, but you are limiting the
clusters that you scan to those that were originally allocated for the required
order and which are known to have some free space.

I guess this will work more effectively with Barry's series that swaps in a
whole large folio in one go, because it is more likely that holes of the
required size will appear in the non-full clusters. But previous discussions
concluded that it was not always going to be the right approach to swap-in large
folios in one go (certainly not for higher orders). So I don't think you can
(yet) rely on swap slots being freed as order-sized blocks. That said I still
think that your approach should improve the situation even without Barry's series.

In fact, why don't you put the cluster on the non-full list at cluster
allocation time? Then you can also allow a cpu to steal from another cpu's
current cluster (as I initially thought you were doing). I think this should
work with pretty much the same logic? And improve chances of allocation without
increasing chances of fragmentation? (more on this below).

> 
>>
>> If that's the intent, couldn't that be done just by iterating over the per-cpu,
>> per-order cluster pointers? Then you don't need all the linked list churn
> 
> Again, that is not the intent.
> 
>> (although I like the linked list changes as a nice cleanup, I'm not sure the
>> churn is necessary for this change?). There would likely need to be some
>> locking considerations, but it would also allow you to get access to the next
>> entry within the cluster for allocation.
>>
>> However, fundamentally, I don't think this change solves the problem; it just
>> takes a bit longer before the allocation fails. The real problem is
>> fragmentation due to freeing individual pages from swap entries at different times.
> 
> It definitely helps to find nonfull clusters quicker. Please take a
> look at my above comment and read the patch again.
> 
>>
>> Wouldn't it be better to just extend scanning to support high order allocations?
>> Then we can steal a high order block from any cluster, even clusters that were
> 
> Steal from higher order causes the higher order harder to allocate,
> that is downside.
> In my mind, ideally have some high order cluster reservation scheme so
> the high order one doesn't mix with the low order one.

Yes, that would make sense; you could limit the number of clusters allocated for
each order at any given time.

Order-0 stealing will still cause problems. You could probably just remove that
and limit order-0 scanning/stealing to clusters that were originally allocated
for order-0 too, using the same logic.

> 
>> previously full, just like we currently do for order-0. Given we are already
>> falling back to this path for order-0, I don't think it would be any more
>> expensive; in fact it's less expensive because we only scan once for the high
>> order block, rather than scan for every split order-0 page.
>>
>> Of course that still doesn't solve the problem entirely; if swap is so
>> fragmented that there is no contiguous block of the required order then you
>> still have to fall back to splitting. As an extra optimization, you could store
> 
> Exactly. That is why I think some high order cluster reservation
> scheme is needed for a short term solution.
> The change itself is not too complicated if we can agree on this approach.
> 
>> the largest contiguous free space available in each cluster to avoid scanning in
>> case its too small?
> 
> Avoid scanning does just get to the non available high order result quicker.
> Does not seem to help increase the high order allocation success rate.
> 
>>
>>
>>>
>>> There are limitations if the distribution of numbers of
>>> different orders of mTHP changes a lot. e.g. there are a lot
>>> of nonfull clusters assigned to order A while later there
>>> are a lot of order B allocations while very little allocation
>>> in order A. Currently the clusters used by order A will not be
>>> reused by order B unless the cluster is 100% empty.
>>>
>>> This situation is best addressed by the longer term "swap
>>> buddy allocator", in future patches.
>>> ---
>>>  include/linux/swap.h |  4 ++++
>>>  mm/swapfile.c        | 25 +++++++++++++++++++++++--
>>>  2 files changed, 27 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>> index 0d3906eff3c9..1b7f0794b9bf 100644
>>> --- a/include/linux/swap.h
>>> +++ b/include/linux/swap.h
>>> @@ -255,10 +255,12 @@ struct swap_cluster_info {
>>>                                * cluster
>>>                                */
>>>       unsigned int count:16;
>>> +     unsigned int order:8;
>>>       unsigned int flags:8;
>>>       struct list_head next;
>>>  };
>>>  #define CLUSTER_FLAG_FREE 1 /* This cluster is free */
>>> +#define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */
>>>
>>>
>>>  /*
>>> @@ -297,6 +299,8 @@ struct swap_info_struct {
>>>       unsigned char *swap_map;        /* vmalloc'ed array of usage counts */
>>>       struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
>>>       struct list_head free_clusters; /* free clusters list */
>>> +     struct list_head nonfull_clusters[SWAP_NR_ORDERS];
>>> +                                     /* list of cluster that contains at least one free slot */
>>>       unsigned int lowest_bit;        /* index of first free in swap_map */
>>>       unsigned int highest_bit;       /* index of last free in swap_map */
>>>       unsigned int pages;             /* total of usable pages of swap */
>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>> index 205a60c5f9cb..51923aba500e 100644
>>> --- a/mm/swapfile.c
>>> +++ b/mm/swapfile.c
>>> @@ -363,8 +363,11 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
>>>
>>>  static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
>>>  {
>>> +     if (ci->flags & CLUSTER_FLAG_NONFULL)
>>> +             list_move_tail(&ci->next, &si->free_clusters);
>>> +     else
>>> +             list_add_tail(&ci->next, &si->free_clusters);
>>>       ci->flags = CLUSTER_FLAG_FREE;
>>> -     list_add_tail(&ci->next, &si->free_clusters);
>>>  }
>>>
>>>  /*
>>> @@ -486,7 +489,12 @@ static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluste
>>>       ci->count--;
>>>
>>>       if (!ci->count)
>>> -             free_cluster(p, ci);
>>> +             return free_cluster(p, ci);
>>> +
>>> +     if (!(ci->flags & CLUSTER_FLAG_NONFULL)) {
>>> +             list_add_tail(&ci->next, &p->nonfull_clusters[ci->order]);
>>> +             ci->flags |= CLUSTER_FLAG_NONFULL;
>>> +     }
>>>  }
>>>
>>>  /*
>>> @@ -547,6 +555,14 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
>>>                       ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, next);
>>>                       list_del(&ci->next);
>>>                       spin_lock(&ci->lock);
>>> +                     ci->order = order;
>>> +                     ci->flags = 0;
>>> +                     spin_unlock(&ci->lock);
>>> +                     tmp = (ci - si->cluster_info) * SWAPFILE_CLUSTER;
>>> +             } else if (!list_empty(&si->nonfull_clusters[order])) {

You are preferring to scan the nonfull clusters over doing discard and
allocating a newly freed cluster; wouldn't it be better to prefer discard over
nonfull scanning?

>>> +                     ci = list_first_entry(&si->nonfull_clusters[order], struct swap_cluster_info, next);
>>> +                     list_del(&ci->next);

I'm struggling a bit with what the value of the nonfull_clusters linked list is.
I wonder if it would be simpler to just track the number of free slots in the
cluster and iterate over the clusters, scanning the ones with the desired order
and which have at least (1 << order) free slots? I guess this list is giving you
an ordering such that the cluster you pull off the list first had its first slot
freed longest ago so it is most likely to have most space?

>>> +                     spin_lock(&ci->lock);
>>>                       ci->flags = 0;
>>>                       spin_unlock(&ci->lock);
>>>                       tmp = (ci - si->cluster_info) * SWAPFILE_CLUSTER;
>>
>> This looks wrong to me; if the cluster is on the nonfull list then it will have
>> had some entries already allocated (by another cpu). So pointing tmp to the
>> first block in the cluster will never yield a free block. The cpu from which you
> 
> I believe it will scan until it finds a free block with alignment down
> in the offset < max loop.

Yes agreed, my bad. Sorry about that.

Thanks,
Ryan

> 
> while (offset < max) {
>     if (swap_range_empty(si->swap_map, offset, nr_pages))
>         break;
>     offset += nr_pages;
> }
> 
>> are stealing the cluster stores the next free block location in its per-cpu
>> structure. So perhaps iterating over the other cpu's `struct percpu_cluster`s is
>> a better approach than the nonfull list?
> 
> No, stealing is not the intent. The intent is  quickly finding the non
> full cluster NOT in other per cpu allocation.
> 
>>
>> Additionally, this cluster will be stored back to this cpu's current cluster at
>> the bottom of the function. That may or may not be what you intended.
> 
> That is what I intended. It remembers the current allocating cluster,
> in case this cluster has more than one high order swap entries.
> 
> Chris
> 
>>
>>> @@ -578,6 +594,7 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
>>>                               break;
>>>                       tmp += nr_pages;
>>>               }
>>> +             WARN_ONCE(ci->order != order, "expecting order %d got %d", order, ci->order);
>>>               unlock_cluster(ci);
>>>       }
>>>       if (tmp >= max) {
>>> @@ -956,6 +973,7 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
>>>       ci = lock_cluster(si, offset);
>>>       memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
>>>       ci->count = 0;
>>> +     ci->order = 0;
>>>       ci->flags = 0;
>>>       free_cluster(si, ci);
>>>       unlock_cluster(ci);
>>> @@ -2882,6 +2900,9 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
>>>       INIT_LIST_HEAD(&p->free_clusters);
>>>       INIT_LIST_HEAD(&p->discard_clusters);
>>>
>>> +     for (i = 0; i < SWAP_NR_ORDERS; i++)
>>> +             INIT_LIST_HEAD(&p->nonfull_clusters[i]);
>>> +
>>>       for (i = 0; i < swap_header->info.nr_badpages; i++) {
>>>               unsigned int page_nr = swap_header->info.badpages[i];
>>>               if (page_nr == 0 || page_nr > swap_header->info.last_page)
>>>
>>
Chris Li June 11, 2024, 6:09 a.m. UTC | #6
On Mon, Jun 10, 2024 at 4:18 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> On 07/06/2024 21:52, Chris Li wrote:
> > On Fri, Jun 7, 2024 at 3:35 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
> > Stealing other CPU's *current* cluster is not the intent. The intent
> > is after all current per-cpu done with this cluster(full), those full
> > clusters are not tracked by any per-cpu struct. When those full
> > clusters become non-full, track them in the global nonfull cluster list.
> > The per-cpu allocation can take a cluster from that nonfull cluster
> > list and start allocating from it.
> >
> > The V1 code does not specifically check for the stealing behavior, the
> > V2 code will prevent that from happening. Basically each cluster has 4
> > states and owners:
> > 1) empty, owned by a global free cluster list.
> > 2) per cpu allocating. owned by per CPU current.
> > 3) nonfull (also non empty). own global nonfull list.
> > 4) full, currently not tracked, we can track it under global full list.
> >
> > When the per cpu runs out of free cluster, it can take a cluster from
> > 3) and move it to 2).
>
> OK, sorry for my misunderstanding, and thanks for the explanation. I've taken a
> proper look at the patch and with this explanation now understand the intent.
>
> I guess in effect, this is a scanning approach, but you are limiting the
> clusters that you scan to those that were originally allocated for the required
> order and which are known to have some free space.

Yes, not only some free space. When we swap in the same size as the
swap out size, we can have the cluster dedicated to that order. Avoid
mixing order in that cluster.

>
> I guess this will work more effectively with Barry's series that swaps in a
> whole large folio in one go, because it is more likely that holes of the
> required size will appear in the non-full clusters. But previous discussions

Ack.

> concluded that it was not always going to be the right approach to swap-in large
> folios in one go (certainly not for higher orders). So I don't think you can

We need to start from somewhere. Right now it is still too early to
say which approach will win out. I think it is beneficial to try out
swapin the same size as swap out, and see how the data play out. 4K is
too small and 2M is too big. Maybe one of the mTHP size would be the
sweet spot.

> (yet) rely on swap slots being freed as order-sized blocks. That said I still
> think that your approach should improve the situation even without Barry's series.

Agree.

> In fact, why don't you put the cluster on the non-full list at cluster
> allocation time? Then you can also allow a cpu to steal from another cpu's

That would be the reservation approach. If we know how much swap space
we want for that order in advance. We can make the reservation at swap
on time, then that order will only allocate from the reserved cluster.

> current cluster (as I initially thought you were doing). I think this should

For stealing from other cpu's current order cluster, we can have a
global list of clusters for per_cpu[order] for the cluster. It's
better to track it separately as the other nonfull cluster. We only
steal from other CPU when we run out of global nonfull clusters. The
current patch allows stealing from other CPU as well. Just might
happen before it runs out of the nonfull cluster. Might not be too big
a deal.

> work with pretty much the same logic? And improve chances of allocation without
> increasing chances of fragmentation? (more on this below).
>
> >
> >>
> >> If that's the intent, couldn't that be done just by iterating over the per-cpu,
> >> per-order cluster pointers? Then you don't need all the linked list churn
> >
> > Again, that is not the intent.
> >
> >> (although I like the linked list changes as a nice cleanup, I'm not sure the
> >> churn is necessary for this change?). There would likely need to be some
> >> locking considerations, but it would also allow you to get access to the next
> >> entry within the cluster for allocation.
> >>
> >> However, fundamentally, I don't think this change solves the problem; it just
> >> takes a bit longer before the allocation fails. The real problem is
> >> fragmentation due to freeing individual pages from swap entries at different times.
> >
> > It definitely helps to find nonfull clusters quicker. Please take a
> > look at my above comment and read the patch again.
> >
> >>
> >> Wouldn't it be better to just extend scanning to support high order allocations?
> >> Then we can steal a high order block from any cluster, even clusters that were
> >
> > Steal from higher order causes the higher order harder to allocate,
> > that is downside.
> > In my mind, ideally have some high order cluster reservation scheme so
> > the high order one doesn't mix with the low order one.
>
> Yes, that would make sense; you could limit the number of clusters allocated for
> each order at any given time.
>
> Order-0 stealing will still cause problems. You could probably just remove that
> and limit order-0 scanning/stealing to clusters that were originally allocated
> for order-0 too, using the same logic.

Yes, that is the plan.

If we reserve some space for higher order, then order-0 can't steal
from the reserved higher order cluster.

>
> >
> >> previously full, just like we currently do for order-0. Given we are already
> >> falling back to this path for order-0, I don't think it would be any more
> >> expensive; in fact it's less expensive because we only scan once for the high
> >> order block, rather than scan for every split order-0 page.
> >>
> >> Of course that still doesn't solve the problem entirely; if swap is so
> >> fragmented that there is no contiguous block of the required order then you
> >> still have to fall back to splitting. As an extra optimization, you could store
> >
> > Exactly. That is why I think some high order cluster reservation
> > scheme is needed for a short term solution.
> > The change itself is not too complicated if we can agree on this approach.
> >
> >> the largest contiguous free space available in each cluster to avoid scanning in
> >> case its too small?
> >
> > Avoid scanning does just get to the non available high order result quicker.
> > Does not seem to help increase the high order allocation success rate.
> >
> >>
> >>
> >>>
> >>> There are limitations if the distribution of numbers of
> >>> different orders of mTHP changes a lot. e.g. there are a lot
> >>> of nonfull clusters assigned to order A while later there
> >>> are a lot of order B allocations while very little allocation
> >>> in order A. Currently the clusters used by order A will not be
> >>> reused by order B unless the cluster is 100% empty.
> >>>
> >>> This situation is best addressed by the longer term "swap
> >>> buddy allocator", in future patches.
> >>> ---
> >>>  include/linux/swap.h |  4 ++++
> >>>  mm/swapfile.c        | 25 +++++++++++++++++++++++--
> >>>  2 files changed, 27 insertions(+), 2 deletions(-)
> >>>
> >>> diff --git a/include/linux/swap.h b/include/linux/swap.h
> >>> index 0d3906eff3c9..1b7f0794b9bf 100644
> >>> --- a/include/linux/swap.h
> >>> +++ b/include/linux/swap.h
> >>> @@ -255,10 +255,12 @@ struct swap_cluster_info {
> >>>                                * cluster
> >>>                                */
> >>>       unsigned int count:16;
> >>> +     unsigned int order:8;
> >>>       unsigned int flags:8;
> >>>       struct list_head next;
> >>>  };
> >>>  #define CLUSTER_FLAG_FREE 1 /* This cluster is free */
> >>> +#define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */
> >>>
> >>>
> >>>  /*
> >>> @@ -297,6 +299,8 @@ struct swap_info_struct {
> >>>       unsigned char *swap_map;        /* vmalloc'ed array of usage counts */
> >>>       struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
> >>>       struct list_head free_clusters; /* free clusters list */
> >>> +     struct list_head nonfull_clusters[SWAP_NR_ORDERS];
> >>> +                                     /* list of cluster that contains at least one free slot */
> >>>       unsigned int lowest_bit;        /* index of first free in swap_map */
> >>>       unsigned int highest_bit;       /* index of last free in swap_map */
> >>>       unsigned int pages;             /* total of usable pages of swap */
> >>> diff --git a/mm/swapfile.c b/mm/swapfile.c
> >>> index 205a60c5f9cb..51923aba500e 100644
> >>> --- a/mm/swapfile.c
> >>> +++ b/mm/swapfile.c
> >>> @@ -363,8 +363,11 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
> >>>
> >>>  static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
> >>>  {
> >>> +     if (ci->flags & CLUSTER_FLAG_NONFULL)
> >>> +             list_move_tail(&ci->next, &si->free_clusters);
> >>> +     else
> >>> +             list_add_tail(&ci->next, &si->free_clusters);
> >>>       ci->flags = CLUSTER_FLAG_FREE;
> >>> -     list_add_tail(&ci->next, &si->free_clusters);
> >>>  }
> >>>
> >>>  /*
> >>> @@ -486,7 +489,12 @@ static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluste
> >>>       ci->count--;
> >>>
> >>>       if (!ci->count)
> >>> -             free_cluster(p, ci);
> >>> +             return free_cluster(p, ci);
> >>> +
> >>> +     if (!(ci->flags & CLUSTER_FLAG_NONFULL)) {
> >>> +             list_add_tail(&ci->next, &p->nonfull_clusters[ci->order]);
> >>> +             ci->flags |= CLUSTER_FLAG_NONFULL;
> >>> +     }
> >>>  }
> >>>
> >>>  /*
> >>> @@ -547,6 +555,14 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
> >>>                       ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, next);
> >>>                       list_del(&ci->next);
> >>>                       spin_lock(&ci->lock);
> >>> +                     ci->order = order;
> >>> +                     ci->flags = 0;
> >>> +                     spin_unlock(&ci->lock);
> >>> +                     tmp = (ci - si->cluster_info) * SWAPFILE_CLUSTER;
> >>> +             } else if (!list_empty(&si->nonfull_clusters[order])) {
>
> You are preferring to scan the nonfull clusters over doing discard and
> allocating a newly freed cluster; wouldn't it be better to prefer discard over
> nonfull scanning?

My consideration is that issuing a discard command takes some IO time.
If we have an alternative path to move forward, e.g. using another
nonfull cluster, we should do that to avoid latency penalty.

>
> >>> +                     ci = list_first_entry(&si->nonfull_clusters[order], struct swap_cluster_info, next);
> >>> +                     list_del(&ci->next);
>
> I'm struggling a bit with what the value of the nonfull_clusters linked list is.

Assume we have the swap in size the same as swap out size. The
nonfull_clusters will get to the cluster in the desired order quicker.

> I wonder if it would be simpler to just track the number of free slots in the
> cluster and iterate over the clusters, scanning the ones with the desired order
> and which have at least (1 << order) free slots? I guess this list is giving you

That would be slower than this approach. We would likely repeatedly
scan over a lot of lower order clusters which can't find a big enough
free range. We can cache it and speed it up by avoiding repeat
scanning the unfitting cluster over and over again, that effectively
turns into a buddy allocator approach then. It can work but is more
complex.

> an ordering such that the cluster you pull off the list first had its first slot
> freed longest ago so it is most likely to have most space?

Yes, that is one of the considerations as well.

Chris
diff mbox series

Patch

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0d3906eff3c9..1b7f0794b9bf 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -255,10 +255,12 @@  struct swap_cluster_info {
 				 * cluster
 				 */
 	unsigned int count:16;
+	unsigned int order:8;
 	unsigned int flags:8;
 	struct list_head next;
 };
 #define CLUSTER_FLAG_FREE 1 /* This cluster is free */
+#define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */
 
 
 /*
@@ -297,6 +299,8 @@  struct swap_info_struct {
 	unsigned char *swap_map;	/* vmalloc'ed array of usage counts */
 	struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
 	struct list_head free_clusters; /* free clusters list */
+	struct list_head nonfull_clusters[SWAP_NR_ORDERS];
+					/* list of cluster that contains at least one free slot */
 	unsigned int lowest_bit;	/* index of first free in swap_map */
 	unsigned int highest_bit;	/* index of last free in swap_map */
 	unsigned int pages;		/* total of usable pages of swap */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 205a60c5f9cb..51923aba500e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -363,8 +363,11 @@  static void swap_cluster_schedule_discard(struct swap_info_struct *si,
 
 static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
 {
+	if (ci->flags & CLUSTER_FLAG_NONFULL)
+		list_move_tail(&ci->next, &si->free_clusters);
+	else
+		list_add_tail(&ci->next, &si->free_clusters);
 	ci->flags = CLUSTER_FLAG_FREE;
-	list_add_tail(&ci->next, &si->free_clusters);
 }
 
 /*
@@ -486,7 +489,12 @@  static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluste
 	ci->count--;
 
 	if (!ci->count)
-		free_cluster(p, ci);
+		return free_cluster(p, ci);
+
+	if (!(ci->flags & CLUSTER_FLAG_NONFULL)) {
+		list_add_tail(&ci->next, &p->nonfull_clusters[ci->order]);
+		ci->flags |= CLUSTER_FLAG_NONFULL;
+	}
 }
 
 /*
@@ -547,6 +555,14 @@  static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
 			ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, next);
 			list_del(&ci->next);
 			spin_lock(&ci->lock);
+			ci->order = order;
+			ci->flags = 0;
+			spin_unlock(&ci->lock);
+			tmp = (ci - si->cluster_info) * SWAPFILE_CLUSTER;
+		} else if (!list_empty(&si->nonfull_clusters[order])) {
+			ci = list_first_entry(&si->nonfull_clusters[order], struct swap_cluster_info, next);
+			list_del(&ci->next);
+			spin_lock(&ci->lock);
 			ci->flags = 0;
 			spin_unlock(&ci->lock);
 			tmp = (ci - si->cluster_info) * SWAPFILE_CLUSTER;
@@ -578,6 +594,7 @@  static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
 				break;
 			tmp += nr_pages;
 		}
+		WARN_ONCE(ci->order != order, "expecting order %d got %d", order, ci->order);
 		unlock_cluster(ci);
 	}
 	if (tmp >= max) {
@@ -956,6 +973,7 @@  static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
 	ci = lock_cluster(si, offset);
 	memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
 	ci->count = 0;
+	ci->order = 0;
 	ci->flags = 0;
 	free_cluster(si, ci);
 	unlock_cluster(ci);
@@ -2882,6 +2900,9 @@  static int setup_swap_map_and_extents(struct swap_info_struct *p,
 	INIT_LIST_HEAD(&p->free_clusters);
 	INIT_LIST_HEAD(&p->discard_clusters);
 
+	for (i = 0; i < SWAP_NR_ORDERS; i++)
+		INIT_LIST_HEAD(&p->nonfull_clusters[i]);
+
 	for (i = 0; i < swap_header->info.nr_badpages; i++) {
 		unsigned int page_nr = swap_header->info.badpages[i];
 		if (page_nr == 0 || page_nr > swap_header->info.last_page)