
[5/7] mm, swap: use percpu cluster as allocation fast path

Message ID 20250214175709.76029-6-ryncsn@gmail.com (mailing list archive)
State New
Series: mm, swap: remove swap slot cache

Commit Message

Kairui Song Feb. 14, 2025, 5:57 p.m. UTC
From: Kairui Song <kasong@tencent.com>

The current allocation workflow first traverses the plist with a global
lock held; after choosing a device, it uses the percpu cluster on that
swap device. This commit moves the percpu cluster variable out of being
tied to individual swap devices, making it a global percpu variable that
is used directly for allocation as a fast path.

The global percpu cluster variable will never point to a HDD device, and
allocation on HDD devices is still globally serialized.
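
For reference, the per-CPU structure this patch introduces (the full
definition appears in the diff below); comments here are added only for
illustration:

struct percpu_swap_cluster {
	struct swap_info_struct *si;		/* device the cached cluster belongs to, never an HDD */
	unsigned long offset[SWAP_NR_ORDERS];	/* likely next allocation offset, one per order */
	local_lock_t lock;			/* protects this CPU's copy */
};

static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = {
	.si = NULL,
	.offset = { SWAP_ENTRY_INVALID },
	.lock = INIT_LOCAL_LOCK(),
};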

This improves the allocator performance and prepares for removal of the
slot cache in later commits. There shouldn't be much observable behavior
change, except one thing: this changes how swap device allocation
rotation works.

Currently, each allocation rotates the plist, and because of the
existence of the slot cache (64 entries), swap devices of the same
priority are rotated for every 64 entries consumed. High-order
allocations are different: they bypass the slot cache, so the swap
device is rotated for every 16K, 32K, or up to 2M allocated.

The rotation rule was never clearly defined or documented, and it has
been changed several times without being mentioned.

After this commit, once the slot cache is gone in later commits, swap
device rotation will happen for every consumed cluster. Ideally,
non-HDD devices will be rotated once 2M of space has been consumed for
each order, which seems reasonable. HDD devices are rotated on every
allocation regardless of the allocation order, which should be OK and
is a trivial change.
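
As a rough back-of-the-envelope comparison of the rotation cadence
(assuming 4 KiB pages and 2 MiB clusters; the numbers are only
illustrative):

  Before (with slot cache):
    order-0: rotated every 64 slots = 64 * 4 KiB = 256 KiB consumed
    order-2..9: rotated per folio   = 16 KiB, 32 KiB, ... up to 2 MiB
  After (slot cache removed):
    non-HDD: rotated roughly once 2 MiB has been consumed for an order
    HDD:     rotated on every allocation, regardless of order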

Signed-off-by: Kairui Song <kasong@tencent.com>
---
 include/linux/swap.h |  11 ++--
 mm/swapfile.c        | 120 +++++++++++++++++++++++++++----------------
 2 files changed, 79 insertions(+), 52 deletions(-)

Comments

Baoquan He Feb. 19, 2025, 7:53 a.m. UTC | #1
On 02/15/25 at 01:57am, Kairui Song wrote:
> From: Kairui Song <kasong@tencent.com>
> 
> Current allocation workflow first traverses the plist with a global lock
> held, after choosing a device, it uses the percpu cluster on that swap
> device. This commit moves the percpu cluster variable out of being tied
> to individual swap devices, making it a global percpu variable, and will
> be used directly for allocation as a fast path.
> 
> The global percpu cluster variable will never point to a HDD device, and
> allocation on HDD devices is still globally serialized.
> 
> This improves the allocator performance and prepares for removal of the
> slot cache in later commits. There shouldn't be much observable behavior
> change, except one thing: this changes how swap device allocation
> rotation works.
> 
> Currently, each allocation will rotate the plist, and because of the
> existence of slot cache (64 entries), swap devices of the same priority
> are rotated for every 64 entries consumed. And, high order allocations
> are different, they will bypass the slot cache, and so swap device is
> rotated for every 16K, 32K, or up to 2M allocation.
> 
> The rotation rule was never clearly defined or documented, it was changed
> several times without mentioning too.
> 
> After this commit, once slot cache is gone in later commits, swap device
> rotation will happen for every consumed cluster. Ideally non-HDD devices
> will be rotated if 2M space has been consumed for each order, this seems

This breaks the rule where the high priority swap device is always
taken for allocation as long as there's free space in the device.
After this patch, it will try the percpu cluster first, which may
point to a lower priority device, even though a higher priority device
has free space. However, this only happens when the higher priority
device is exhausted, not in the generic case. If this is expected, it
may need to be mentioned in the log or documentation somewhere at
least.

> reasonable. HDD devices is rotated for every allocation regardless of the
> allocation order, which should be OK and trivial.
> 
> Signed-off-by: Kairui Song <kasong@tencent.com>
> ---
>  include/linux/swap.h |  11 ++--
>  mm/swapfile.c        | 120 +++++++++++++++++++++++++++----------------
>  2 files changed, 79 insertions(+), 52 deletions(-)
......
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index ae3bd0a862fc..791cd7ed5bdf 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -116,6 +116,18 @@ static atomic_t proc_poll_event = ATOMIC_INIT(0);
>  
......snip....
>  int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
>  {
>  	int order = swap_entry_order(entry_order);
> @@ -1211,19 +1251,28 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
>  	int n_ret = 0;
>  	int node;
>  
> +	/* Fast path using percpu cluster */
> +	local_lock(&percpu_swap_cluster.lock);
> +	n_ret = swap_alloc_fast(swp_entries,
> +				SWAP_HAS_CACHE,
> +				order, n_goal);
> +	if (n_ret == n_goal)
> +		goto out;
> +
> +	n_goal = min_t(int, n_goal - n_ret, SWAP_BATCH);

Here, the behaviour is changed too. In the old allocation path, a
partial allocation jumps out and returns. In this patch, you try the
percpu cluster first, then call scan_swap_map_slots() to try its best,
and jump out even though only a partial allocation succeeded. But the
allocation from scan_swap_map_slots() could happen on a different si
device, which looks bizarre. Do you think we need to reconsider the
design?

> +	/* Rotate the device and switch to a new cluster */
>  	spin_lock(&swap_avail_lock);
>  start_over:
>  	node = numa_node_id();
>  	plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
> -		/* requeue si to after same-priority siblings */
>  		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
>  		spin_unlock(&swap_avail_lock);
>  		if (get_swap_device_info(si)) {
> -			n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
> -					n_goal, swp_entries, order);
> +			n_ret += scan_swap_map_slots(si, SWAP_HAS_CACHE, n_goal,
> +					swp_entries + n_ret, order);
>  			put_swap_device(si);
>  			if (n_ret || size > 1)
> -				goto check_out;
> +				goto out;
>  		}
>  
>  		spin_lock(&swap_avail_lock);
> @@ -1241,12 +1290,10 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
>  		if (plist_node_empty(&next->avail_lists[node]))
>  			goto start_over;
>  	}
> -
>  	spin_unlock(&swap_avail_lock);
> -
> -check_out:
> +out:
> +	local_unlock(&percpu_swap_cluster.lock);
>  	atomic_long_sub(n_ret * size, &nr_swap_pages);
> -
>  	return n_ret;
>  }
>  
......snip...
Kairui Song Feb. 19, 2025, 8:34 a.m. UTC | #2
On Wed, Feb 19, 2025 at 3:54 PM Baoquan He <bhe@redhat.com> wrote:

Hi Baoquan,

Thanks for the review!

>
> On 02/15/25 at 01:57am, Kairui Song wrote:
> > From: Kairui Song <kasong@tencent.com>
> >
> > Current allocation workflow first traverses the plist with a global lock
> > held, after choosing a device, it uses the percpu cluster on that swap
> > device. This commit moves the percpu cluster variable out of being tied
> > to individual swap devices, making it a global percpu variable, and will
> > be used directly for allocation as a fast path.
> >
> > The global percpu cluster variable will never point to a HDD device, and
> > allocation on HDD devices is still globally serialized.
> >
> > This improves the allocator performance and prepares for removal of the
> > slot cache in later commits. There shouldn't be much observable behavior
> > change, except one thing: this changes how swap device allocation
> > rotation works.
> >
> > Currently, each allocation will rotate the plist, and because of the
> > existence of slot cache (64 entries), swap devices of the same priority
> > are rotated for every 64 entries consumed. And, high order allocations
> > are different, they will bypass the slot cache, and so swap device is
> > rotated for every 16K, 32K, or up to 2M allocation.
> >
> > The rotation rule was never clearly defined or documented, it was changed
> > several times without mentioning too.
> >
> > After this commit, once slot cache is gone in later commits, swap device
> > rotation will happen for every consumed cluster. Ideally non-HDD devices
> > will be rotated if 2M space has been consumed for each order, this seems
>
> This breaks the rule where the high priority swap device is always taken
> to allocate as long as there's free space in the device. After this patch,
> it will try the percpu cluster firstly which is lower priority even though
> the higher priority device has free space. However, this only happens when
> the higher priority device is exhausted, not a generic case. If this is
> expected, it may need be mentioned in log or doc somewhere at least.

Hmm, actually this rule was already broken if you are very strict
about it. The current percpu slot cache does a pre-allocation, so the
high priority device will be removed from the plist while some CPU's
slot cache is still holding usable entries.

If the high priority device is exhausted, some CPUs' percpu clusters
will indeed point to a low priority device, and they keep using it
until the percpu cluster is drained. I think this should be OK. The
high priority device is already full, so the amount of swapout that
falls back to the low priority device is only a performance issue; I
think it's a tiny change for a rare case.

>
> > reasonable. HDD devices is rotated for every allocation regardless of the
> > allocation order, which should be OK and trivial.
> >
> > Signed-off-by: Kairui Song <kasong@tencent.com>
> > ---
> >  include/linux/swap.h |  11 ++--
> >  mm/swapfile.c        | 120 +++++++++++++++++++++++++++----------------
> >  2 files changed, 79 insertions(+), 52 deletions(-)
> ......
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index ae3bd0a862fc..791cd7ed5bdf 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -116,6 +116,18 @@ static atomic_t proc_poll_event = ATOMIC_INIT(0);
> >
> ......snip....
> >  int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
> >  {
> >       int order = swap_entry_order(entry_order);
> > @@ -1211,19 +1251,28 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
> >       int n_ret = 0;
> >       int node;
> >
> > +     /* Fast path using percpu cluster */
> > +     local_lock(&percpu_swap_cluster.lock);
> > +     n_ret = swap_alloc_fast(swp_entries,
> > +                             SWAP_HAS_CACHE,
> > +                             order, n_goal);
> > +     if (n_ret == n_goal)
> > +             goto out;
> > +
> > +     n_goal = min_t(int, n_goal - n_ret, SWAP_BATCH);
>
> Here, the behaviour is changed too. In old allocation, partial
> allocation will jump out to return. In this patch, you try the percpu
> cluster firstly, then call scan_swap_map_slots() to try best and will
> jump out even though partial allocation succeed. But the allocation from
> scan_swap_map_slots() could happen on different si device, this looks
> bizarre. Do you think we need reconsider the design?

Right, that's a behavior change, but it only temporarily affects the
slot cache. get_swap_pages will only be called with size > 1 when
order == 0, and only by the slot cache (large order allocations always
use size == 1, and other users only use order == 0 && size == 1). So I
didn't notice it, as this series is removing the slot cache.

The partial side effects would be "returned slots will be from
different devices" and "slot_cache may get drained faster, as
get_swap_pages may return fewer slots when the percpu cluster is
drained". That might be a performance issue, but it seems slight and
trivial, and the slot cache can still work. The next commit will just
remove the slot cache, and the problem will be gone. I think I can add
a comment about it here?
Baoquan He Feb. 19, 2025, 9:26 a.m. UTC | #3
On 02/19/25 at 04:34pm, Kairui Song wrote:
> On Wed, Feb 19, 2025 at 3:54 PM Baoquan He <bhe@redhat.com> wrote:
> 
> Hi Baoquan,
> 
> Thanks for the review!
> 
> >
> > On 02/15/25 at 01:57am, Kairui Song wrote:
> > > From: Kairui Song <kasong@tencent.com>
> > >
> > > Current allocation workflow first traverses the plist with a global lock
> > > held, after choosing a device, it uses the percpu cluster on that swap
> > > device. This commit moves the percpu cluster variable out of being tied
> > > to individual swap devices, making it a global percpu variable, and will
> > > be used directly for allocation as a fast path.
> > >
> > > The global percpu cluster variable will never point to a HDD device, and
> > > allocation on HDD devices is still globally serialized.
> > >
> > > This improves the allocator performance and prepares for removal of the
> > > slot cache in later commits. There shouldn't be much observable behavior
> > > change, except one thing: this changes how swap device allocation
> > > rotation works.
> > >
> > > Currently, each allocation will rotate the plist, and because of the
> > > existence of slot cache (64 entries), swap devices of the same priority
> > > are rotated for every 64 entries consumed. And, high order allocations
> > > are different, they will bypass the slot cache, and so swap device is
> > > rotated for every 16K, 32K, or up to 2M allocation.
> > >
> > > The rotation rule was never clearly defined or documented, it was changed
> > > several times without mentioning too.
> > >
> > > After this commit, once slot cache is gone in later commits, swap device
> > > rotation will happen for every consumed cluster. Ideally non-HDD devices
> > > will be rotated if 2M space has been consumed for each order, this seems
> >
> > This breaks the rule where the high priority swap device is always taken
> > to allocate as long as there's free space in the device. After this patch,
> > it will try the percpu cluster firstly which is lower priority even though
> > the higher priority device has free space. However, this only happens when
> > the higher priority device is exhausted, not a generic case. If this is
> > expected, it may need be mentioned in log or doc somewhere at least.
> 
> Hmm, actually this rule was already broken if you are very strict
> about it. The current percpu slot cache does a pre-allocation, so the
> high priority device will be removed from the plist while some CPU's
> slot cache holding usable entries.

Ah, right, I didn't think about this point.

> 
> If the high priority device is exhausted, some CPU's percpu cluster
> will point to a low priority device indeed, and keep using it until
> the percpu cluster is drained. I think this should be
> OK. The high priority device is already full, so the amount of
> swapouts falls back to low priority device is only a performance
> issue, I think it's a tiny change for a rare case.

Agree, thanks for explanation.

> 
> >
> > > reasonable. HDD devices is rotated for every allocation regardless of the
> > > allocation order, which should be OK and trivial.
> > >
> > > Signed-off-by: Kairui Song <kasong@tencent.com>
> > > ---
> > >  include/linux/swap.h |  11 ++--
> > >  mm/swapfile.c        | 120 +++++++++++++++++++++++++++----------------
> > >  2 files changed, 79 insertions(+), 52 deletions(-)
> > ......
> > > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > > index ae3bd0a862fc..791cd7ed5bdf 100644
> > > --- a/mm/swapfile.c
> > > +++ b/mm/swapfile.c
> > > @@ -116,6 +116,18 @@ static atomic_t proc_poll_event = ATOMIC_INIT(0);
> > >
> > ......snip....
> > >  int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
> > >  {
> > >       int order = swap_entry_order(entry_order);
> > > @@ -1211,19 +1251,28 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
> > >       int n_ret = 0;
> > >       int node;
> > >
> > > +     /* Fast path using percpu cluster */
> > > +     local_lock(&percpu_swap_cluster.lock);
> > > +     n_ret = swap_alloc_fast(swp_entries,
> > > +                             SWAP_HAS_CACHE,
> > > +                             order, n_goal);
> > > +     if (n_ret == n_goal)
> > > +             goto out;
> > > +
> > > +     n_goal = min_t(int, n_goal - n_ret, SWAP_BATCH);
> >
> > Here, the behaviour is changed too. In old allocation, partial
> > allocation will jump out to return. In this patch, you try the percpu
> > cluster firstly, then call scan_swap_map_slots() to try best and will
> > jump out even though partial allocation succeed. But the allocation from
> > scan_swap_map_slots() could happen on different si device, this looks
> > bizarre. Do you think we need reconsider the design?
> 
> Right, that's a behavior change, but only temporarily affects slot cache.
> get_swap_pages will only be called with size > 1 when order == 0, and
> only by slot cache. (Large order allocation always use size == 1,
> other users only uses order == 0 && size == 1). So I didn't' notice
> it, as this series is removing slot cache.

Right, I am reviewing patch 6 and noticed this is temporary.

> 
> The partial side effect would be "returned slots will be from
> different devices" and "slot_cache may get drained faster as
> get_swap_pages may return less slots when percpu cluster is drained".
> Might be a performance issue but seems slight and trivial, slot cache
> can still work. And the next commit will just remove the slot cache,
> and the problem will be gone. I think I can add a comment about it
> here?

Sounds good. Adding a comment can avoid other people being confused
when checking patch 5. Thanks.
Baoquan He Feb. 19, 2025, 10:55 a.m. UTC | #4
On 02/19/25 at 04:34pm, Kairui Song wrote:
> On Wed, Feb 19, 2025 at 3:54 PM Baoquan He <bhe@redhat.com> wrote:
> 
> Hi Baoquan,
> 
> Thanks for the review!
> 
> >
> > On 02/15/25 at 01:57am, Kairui Song wrote:
> > > From: Kairui Song <kasong@tencent.com>
> > >
> > > Current allocation workflow first traverses the plist with a global lock
> > > held, after choosing a device, it uses the percpu cluster on that swap
> > > device. This commit moves the percpu cluster variable out of being tied
> > > to individual swap devices, making it a global percpu variable, and will
> > > be used directly for allocation as a fast path.
> > >
> > > The global percpu cluster variable will never point to a HDD device, and
> > > allocation on HDD devices is still globally serialized.
> > >
> > > This improves the allocator performance and prepares for removal of the
> > > slot cache in later commits. There shouldn't be much observable behavior
> > > change, except one thing: this changes how swap device allocation
> > > rotation works.
> > >
> > > Currently, each allocation will rotate the plist, and because of the
> > > existence of slot cache (64 entries), swap devices of the same priority
> > > are rotated for every 64 entries consumed. And, high order allocations
> > > are different, they will bypass the slot cache, and so swap device is
> > > rotated for every 16K, 32K, or up to 2M allocation.
> > >
> > > The rotation rule was never clearly defined or documented, it was changed
> > > several times without mentioning too.
> > >
> > > After this commit, once slot cache is gone in later commits, swap device
> > > rotation will happen for every consumed cluster. Ideally non-HDD devices
> > > will be rotated if 2M space has been consumed for each order, this seems
> >
> > This breaks the rule where the high priority swap device is always taken
> > to allocate as long as there's free space in the device. After this patch,
> > it will try the percpu cluster firstly which is lower priority even though
> > the higher priority device has free space. However, this only happens when
> > the higher priority device is exhausted, not a generic case. If this is
> > expected, it may need be mentioned in log or doc somewhere at least.
> 
> Hmm, actually this rule was already broken if you are very strict
> about it. The current percpu slot cache does a pre-allocation, so the
> high priority device will be removed from the plist while some CPU's
> slot cache holding usable entries.
> 
> If the high priority device is exhausted, some CPU's percpu cluster
> will point to a low priority device indeed, and keep using it until
> the percpu cluster is drained. I think this should be
> OK. The high priority device is already full, so the amount of
> swapouts falls back to low priority device is only a performance
> issue, I think it's a tiny change for a rare case.
> 
> >
> > > reasonable. HDD devices is rotated for every allocation regardless of the
> > > allocation order, which should be OK and trivial.
> > >
> > > Signed-off-by: Kairui Song <kasong@tencent.com>
> > > ---
> > >  include/linux/swap.h |  11 ++--
> > >  mm/swapfile.c        | 120 +++++++++++++++++++++++++++----------------
> > >  2 files changed, 79 insertions(+), 52 deletions(-)
> > ......
> > > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > > index ae3bd0a862fc..791cd7ed5bdf 100644
> > > --- a/mm/swapfile.c
> > > +++ b/mm/swapfile.c
> > > @@ -116,6 +116,18 @@ static atomic_t proc_poll_event = ATOMIC_INIT(0);
> > >
> > ......snip....
> > >  int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
> > >  {
> > >       int order = swap_entry_order(entry_order);
> > > @@ -1211,19 +1251,28 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
> > >       int n_ret = 0;
> > >       int node;
> > >
> > > +     /* Fast path using percpu cluster */
> > > +     local_lock(&percpu_swap_cluster.lock);
> > > +     n_ret = swap_alloc_fast(swp_entries,
> > > +                             SWAP_HAS_CACHE,
> > > +                             order, n_goal);
> > > +     if (n_ret == n_goal)
> > > +             goto out;
> > > +
> > > +     n_goal = min_t(int, n_goal - n_ret, SWAP_BATCH);
> >
> > Here, the behaviour is changed too. In old allocation, partial
> > allocation will jump out to return. In this patch, you try the percpu
> > cluster firstly, then call scan_swap_map_slots() to try best and will
> > jump out even though partial allocation succeed. But the allocation from
> > scan_swap_map_slots() could happen on different si device, this looks
> > bizarre. Do you think we need reconsider the design?
> 
> Right, that's a behavior change, but only temporarily affects slot cache.
> get_swap_pages will only be called with size > 1 when order == 0, and
> only by slot cache. (Large order allocation always use size == 1,
> other users only uses order == 0 && size == 1). So I didn't' notice
> it, as this series is removing slot cache.
> 
> The partial side effect would be "returned slots will be from
> different devices" and "slot_cache may get drained faster as
> get_swap_pages may return less slots when percpu cluster is drained".
> Might be a performance issue but seems slight and trivial, slot cache
> can still work. And the next commit will just remove the slot cache,
> and the problem will be gone. I think I can add a comment about it
> here?

By the way, another thing I suddenly thought of is the percpu cluster
becoming global over all devices. If one order on the stored si of the
percpu cluster is used up, then the whole percpu cluster is drained
and rewritten. Won't this impact performance compared with the old
percpu cluster embedded in each si? In the log you said "Ideally
non-HDD devices will be rotated if 2M space has been consumed for each
order, this seems reasonable.", while in reality it may be very
difficult to achieve 'each 2M space has been consumed for each order';
it more often happens that very little of an order's space has been
consumed before the percpu cluster is rewritten. I wonder what I have
missed about this point.

>
Kairui Song Feb. 19, 2025, 11:12 a.m. UTC | #5
On Wed, Feb 19, 2025 at 6:55 PM Baoquan He <bhe@redhat.com> wrote:
>
> On 02/19/25 at 04:34pm, Kairui Song wrote:
> > On Wed, Feb 19, 2025 at 3:54 PM Baoquan He <bhe@redhat.com> wrote:
> >
> > Hi Baoquan,
> >
> > Thanks for the review!
> >
> > >
> > > On 02/15/25 at 01:57am, Kairui Song wrote:
> > > > From: Kairui Song <kasong@tencent.com>
> > > >
> > > > Current allocation workflow first traverses the plist with a global lock
> > > > held, after choosing a device, it uses the percpu cluster on that swap
> > > > device. This commit moves the percpu cluster variable out of being tied
> > > > to individual swap devices, making it a global percpu variable, and will
> > > > be used directly for allocation as a fast path.
> > > >
> > > > The global percpu cluster variable will never point to a HDD device, and
> > > > allocation on HDD devices is still globally serialized.
> > > >
> > > > This improves the allocator performance and prepares for removal of the
> > > > slot cache in later commits. There shouldn't be much observable behavior
> > > > change, except one thing: this changes how swap device allocation
> > > > rotation works.
> > > >
> > > > Currently, each allocation will rotate the plist, and because of the
> > > > existence of slot cache (64 entries), swap devices of the same priority
> > > > are rotated for every 64 entries consumed. And, high order allocations
> > > > are different, they will bypass the slot cache, and so swap device is
> > > > rotated for every 16K, 32K, or up to 2M allocation.
> > > >
> > > > The rotation rule was never clearly defined or documented, it was changed
> > > > several times without mentioning too.
> > > >
> > > > After this commit, once slot cache is gone in later commits, swap device
> > > > rotation will happen for every consumed cluster. Ideally non-HDD devices
> > > > will be rotated if 2M space has been consumed for each order, this seems
> > >
> > > This breaks the rule where the high priority swap device is always taken
> > > to allocate as long as there's free space in the device. After this patch,
> > > it will try the percpu cluster firstly which is lower priority even though
> > > the higher priority device has free space. However, this only happens when
> > > the higher priority device is exhausted, not a generic case. If this is
> > > expected, it may need be mentioned in log or doc somewhere at least.
> >
> > Hmm, actually this rule was already broken if you are very strict
> > about it. The current percpu slot cache does a pre-allocation, so the
> > high priority device will be removed from the plist while some CPU's
> > slot cache holding usable entries.
> >
> > If the high priority device is exhausted, some CPU's percpu cluster
> > will point to a low priority device indeed, and keep using it until
> > the percpu cluster is drained. I think this should be
> > OK. The high priority device is already full, so the amount of
> > swapouts falls back to low priority device is only a performance
> > issue, I think it's a tiny change for a rare case.
> >
> > >
> > > > reasonable. HDD devices is rotated for every allocation regardless of the
> > > > allocation order, which should be OK and trivial.
> > > >
> > > > Signed-off-by: Kairui Song <kasong@tencent.com>
> > > > ---
> > > >  include/linux/swap.h |  11 ++--
> > > >  mm/swapfile.c        | 120 +++++++++++++++++++++++++++----------------
> > > >  2 files changed, 79 insertions(+), 52 deletions(-)
> > > ......
> > > > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > > > index ae3bd0a862fc..791cd7ed5bdf 100644
> > > > --- a/mm/swapfile.c
> > > > +++ b/mm/swapfile.c
> > > > @@ -116,6 +116,18 @@ static atomic_t proc_poll_event = ATOMIC_INIT(0);
> > > >
> > > ......snip....
> > > >  int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
> > > >  {
> > > >       int order = swap_entry_order(entry_order);
> > > > @@ -1211,19 +1251,28 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
> > > >       int n_ret = 0;
> > > >       int node;
> > > >
> > > > +     /* Fast path using percpu cluster */
> > > > +     local_lock(&percpu_swap_cluster.lock);
> > > > +     n_ret = swap_alloc_fast(swp_entries,
> > > > +                             SWAP_HAS_CACHE,
> > > > +                             order, n_goal);
> > > > +     if (n_ret == n_goal)
> > > > +             goto out;
> > > > +
> > > > +     n_goal = min_t(int, n_goal - n_ret, SWAP_BATCH);
> > >
> > > Here, the behaviour is changed too. In old allocation, partial
> > > allocation will jump out to return. In this patch, you try the percpu
> > > cluster firstly, then call scan_swap_map_slots() to try best and will
> > > jump out even though partial allocation succeed. But the allocation from
> > > scan_swap_map_slots() could happen on different si device, this looks
> > > bizarre. Do you think we need reconsider the design?
> >
> > Right, that's a behavior change, but only temporarily affects slot cache.
> > get_swap_pages will only be called with size > 1 when order == 0, and
> > only by slot cache. (Large order allocation always use size == 1,
> > other users only uses order == 0 && size == 1). So I didn't' notice
> > it, as this series is removing slot cache.
> >
> > The partial side effect would be "returned slots will be from
> > different devices" and "slot_cache may get drained faster as
> > get_swap_pages may return less slots when percpu cluster is drained".
> > Might be a performance issue but seems slight and trivial, slot cache
> > can still work. And the next commit will just remove the slot cache,
> > and the problem will be gone. I think I can add a comment about it
> > here?
>
> By the way, another thing I suddenly think of is the percpu cluster
> becoming glober over all devices. If one order on the stored si of
> percpu cluster is used up, then the whole percpu cluster is drained and
> rewritten. Won't this impact performance compared with the old embedding
> percpu cluster in each si? In log you said "Ideally non-HDD devices
> will be rotated if 2M space has been consumed for each order, this seems
> reasonable.", while in reality it may be very difficult to achieve the
> 'each 2M space has been consumed for each order', but more often happen
> when very few of order's space has been consumed, then rewrite percpu.
> Wonder what I have missed about this point.

Hi Baoquan,

> then the whole percpu cluster is drained and rewritten

Not sure what you mean. Swap I/O doesn't happen in units of clusters;
a cluster is only a management unit for slots, so only allocated /
freed slots will be written. Discard is a different thing, and this
should have very little effect on that.

Or do you mean the percpu struct array getting updated? It should be
even faster than before: for a global percpu variable it is easier to
calculate the offset at runtime, using GS.
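
For illustration, these are the two access patterns being compared; the
first is removed by this patch, the second is the new fast path (both
lines are from the diff, the comments and the x86 detail are mine):

	/* Old: si->percpu_cluster is a dynamically allocated percpu
	 * pointer, so it must first be loaded from the swap_info_struct. */
	offset = __this_cpu_read(si->percpu_cluster->next[order]);

	/* New: percpu_swap_cluster is a statically defined percpu
	 * variable, so its per-CPU address is a constant offset,
	 * accessed GS-relative on x86. */
	si = __this_cpu_read(percpu_swap_cluster.si);
	offset = __this_cpu_read(percpu_swap_cluster.offset[order]);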

> in reality it may be very difficult to achieve the 'each 2M space has been consumed for each order',

Very true, but notice that for order >= 1, the slot cache never worked
before. And for order == 0, it's very likely that a cluster will have
more than 64 usable slots. The test results I posted should be a good
example: the device was very full during the test, and performance is
basically identical to before. My only concern was about device
rotation: as the slot cache never worked for order >= 1, devices were
rotated very frequently. But still, it seems no one really cared about
it; mTHP swapout is a new thing, and the previous rotation rule seems
even more confusing than this new idea.

Patch

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 2fe91c293636..a8d84f22357e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -284,12 +284,10 @@  enum swap_cluster_flags {
 #endif
 
 /*
- * We assign a cluster to each CPU, so each CPU can allocate swap entry from
- * its own cluster and swapout sequentially. The purpose is to optimize swapout
- * throughput.
+ * We keep using same cluster for rotating device so swapout will be sequential.
+ * The purpose is to optimize swapout throughput on rotating device.
  */
-struct percpu_cluster {
-	local_lock_t lock; /* Protect the percpu_cluster above */
+struct swap_sequential_cluster {
 	unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
 };
 
@@ -315,8 +313,7 @@  struct swap_info_struct {
 	atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS];
 	unsigned int pages;		/* total of usable pages of swap */
 	atomic_long_t inuse_pages;	/* number of those currently in use */
-	struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
-	struct percpu_cluster *global_cluster; /* Use one global cluster for rotating device */
+	struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */
 	spinlock_t global_cluster_lock;	/* Serialize usage of global cluster */
 	struct rb_root swap_extent_root;/* root of the swap extent rbtree */
 	struct block_device *bdev;	/* swap device or bdev of swap file */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index ae3bd0a862fc..791cd7ed5bdf 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -116,6 +116,18 @@  static atomic_t proc_poll_event = ATOMIC_INIT(0);
 
 atomic_t nr_rotate_swap = ATOMIC_INIT(0);
 
+struct percpu_swap_cluster {
+	struct swap_info_struct *si;
+	unsigned long offset[SWAP_NR_ORDERS];
+	local_lock_t lock;
+};
+
+static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = {
+	.si = NULL,
+	.offset = { SWAP_ENTRY_INVALID },
+	.lock = INIT_LOCAL_LOCK(),
+};
+
 static struct swap_info_struct *swap_type_to_swap_info(int type)
 {
 	if (type >= MAX_SWAPFILES)
@@ -548,7 +560,7 @@  static bool swap_do_scheduled_discard(struct swap_info_struct *si)
 		ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list);
 		/*
 		 * Delete the cluster from list to prepare for discard, but keep
-		 * the CLUSTER_FLAG_DISCARD flag, there could be percpu_cluster
+		 * the CLUSTER_FLAG_DISCARD flag, percpu_swap_cluster could be
 		 * pointing to it, or ran into by relocate_cluster.
 		 */
 		list_del(&ci->list);
@@ -815,10 +827,12 @@  static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
 out:
 	relocate_cluster(si, ci);
 	unlock_cluster(ci);
-	if (si->flags & SWP_SOLIDSTATE)
-		__this_cpu_write(si->percpu_cluster->next[order], next);
-	else
+	if (si->flags & SWP_SOLIDSTATE) {
+		__this_cpu_write(percpu_swap_cluster.si, si);
+		__this_cpu_write(percpu_swap_cluster.offset[order], next);
+	} else {
 		si->global_cluster->next[order] = next;
+	}
 	return found;
 }
 
@@ -869,9 +883,8 @@  static void swap_reclaim_work(struct work_struct *work)
 }
 
 /*
- * Try to get swap entries with specified order from current cpu's swap entry
- * pool (a cluster). This might involve allocating a new cluster for current CPU
- * too.
+ * Try to allocate swap entries with specified order and try set a new
+ * cluster for current CPU too.
  */
 static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
 					      unsigned char usage)
@@ -879,18 +892,12 @@  static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
 	struct swap_cluster_info *ci;
 	unsigned int offset, found = 0;
 
-	if (si->flags & SWP_SOLIDSTATE) {
-		/* Fast path using per CPU cluster */
-		local_lock(&si->percpu_cluster->lock);
-		offset = __this_cpu_read(si->percpu_cluster->next[order]);
-	} else {
+	if (!(si->flags & SWP_SOLIDSTATE)) {
 		/* Serialize HDD SWAP allocation for each device. */
 		spin_lock(&si->global_cluster_lock);
 		offset = si->global_cluster->next[order];
-	}
-
-	if (offset) {
 		ci = lock_cluster(si, offset);
+
 		/* Cluster could have been used by another order */
 		if (cluster_is_usable(ci, order)) {
 			if (cluster_is_empty(ci))
@@ -980,9 +987,7 @@  static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
 		}
 	}
 done:
-	if (si->flags & SWP_SOLIDSTATE)
-		local_unlock(&si->percpu_cluster->lock);
-	else
+	if (!(si->flags & SWP_SOLIDSTATE))
 		spin_unlock(&si->global_cluster_lock);
 	return found;
 }
@@ -1203,6 +1208,41 @@  static bool get_swap_device_info(struct swap_info_struct *si)
 	return true;
 }
 
+/*
+ * Fast path try to get swap entries with specified order from current
+ * CPU's swap entry pool (a cluster).
+ */
+static int swap_alloc_fast(swp_entry_t entries[],
+			   unsigned char usage,
+			   int order, int n_goal)
+{
+	struct swap_cluster_info *ci;
+	struct swap_info_struct *si;
+	unsigned int offset, found;
+	int n_ret = 0;
+
+	n_goal = min(n_goal, SWAP_BATCH);
+
+	si = __this_cpu_read(percpu_swap_cluster.si);
+	offset = __this_cpu_read(percpu_swap_cluster.offset[order]);
+	if (!si || !offset || !get_swap_device_info(si))
+		return 0;
+
+	while (offset) {
+		ci = lock_cluster(si, offset);
+		found = alloc_swap_scan_cluster(si, ci, offset, order, usage);
+		if (!found)
+			break;
+		entries[n_ret++] = swp_entry(si->type, found);
+		if (n_ret == n_goal)
+			break;
+		offset = __this_cpu_read(percpu_swap_cluster.offset[order]);
+	}
+
+	put_swap_device(si);
+	return n_ret;
+}
+
 int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
 {
 	int order = swap_entry_order(entry_order);
@@ -1211,19 +1251,28 @@  int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
 	int n_ret = 0;
 	int node;
 
+	/* Fast path using percpu cluster */
+	local_lock(&percpu_swap_cluster.lock);
+	n_ret = swap_alloc_fast(swp_entries,
+				SWAP_HAS_CACHE,
+				order, n_goal);
+	if (n_ret == n_goal)
+		goto out;
+
+	n_goal = min_t(int, n_goal - n_ret, SWAP_BATCH);
+	/* Rotate the device and switch to a new cluster */
 	spin_lock(&swap_avail_lock);
 start_over:
 	node = numa_node_id();
 	plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
-		/* requeue si to after same-priority siblings */
 		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
 		spin_unlock(&swap_avail_lock);
 		if (get_swap_device_info(si)) {
-			n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
-					n_goal, swp_entries, order);
+			n_ret += scan_swap_map_slots(si, SWAP_HAS_CACHE, n_goal,
+					swp_entries + n_ret, order);
 			put_swap_device(si);
 			if (n_ret || size > 1)
-				goto check_out;
+				goto out;
 		}
 
 		spin_lock(&swap_avail_lock);
@@ -1241,12 +1290,10 @@  int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
 		if (plist_node_empty(&next->avail_lists[node]))
 			goto start_over;
 	}
-
 	spin_unlock(&swap_avail_lock);
-
-check_out:
+out:
+	local_unlock(&percpu_swap_cluster.lock);
 	atomic_long_sub(n_ret * size, &nr_swap_pages);
-
 	return n_ret;
 }
 
@@ -2733,8 +2780,6 @@  SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	arch_swap_invalidate_area(p->type);
 	zswap_swapoff(p->type);
 	mutex_unlock(&swapon_mutex);
-	free_percpu(p->percpu_cluster);
-	p->percpu_cluster = NULL;
 	kfree(p->global_cluster);
 	p->global_cluster = NULL;
 	vfree(swap_map);
@@ -3133,7 +3178,7 @@  static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
 	unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
 	struct swap_cluster_info *cluster_info;
 	unsigned long i, j, idx;
-	int cpu, err = -ENOMEM;
+	int err = -ENOMEM;
 
 	cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL);
 	if (!cluster_info)
@@ -3142,20 +3187,7 @@  static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
 	for (i = 0; i < nr_clusters; i++)
 		spin_lock_init(&cluster_info[i].lock);
 
-	if (si->flags & SWP_SOLIDSTATE) {
-		si->percpu_cluster = alloc_percpu(struct percpu_cluster);
-		if (!si->percpu_cluster)
-			goto err_free;
-
-		for_each_possible_cpu(cpu) {
-			struct percpu_cluster *cluster;
-
-			cluster = per_cpu_ptr(si->percpu_cluster, cpu);
-			for (i = 0; i < SWAP_NR_ORDERS; i++)
-				cluster->next[i] = SWAP_ENTRY_INVALID;
-			local_lock_init(&cluster->lock);
-		}
-	} else {
+	if (!(si->flags & SWP_SOLIDSTATE)) {
 		si->global_cluster = kmalloc(sizeof(*si->global_cluster),
 				     GFP_KERNEL);
 		if (!si->global_cluster)
@@ -3432,8 +3464,6 @@  SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 bad_swap_unlock_inode:
 	inode_unlock(inode);
 bad_swap:
-	free_percpu(si->percpu_cluster);
-	si->percpu_cluster = NULL;
 	kfree(si->global_cluster);
 	si->global_cluster = NULL;
 	inode = NULL;