[1/2] mm: store zero pages to be swapped out in a bitmap

Message ID 20240530102126.357438-2-usamaarif642@gmail.com (mailing list archive)
State New
Series mm: store zero pages to be swapped out in a bitmap

Commit Message

Usama Arif May 30, 2024, 10:19 a.m. UTC
Approximately 10-20% of pages to be swapped out are zero pages [1].
Rather than reading/writing these pages to flash resulting
in increased I/O and flash wear, a bitmap can be used to mark these
pages as zero at write time, and the pages can be filled at
read time if the bit corresponding to the page is set.
With this patch, NVMe writes in Meta server fleet decreased
by almost 10% with conventional swap setup (zswap disabled).

[1]https://lore.kernel.org/all/20171018104832epcms5p1b2232e2236258de3d03d1344dde9fce0@epcms5p1/

Signed-off-by: Usama Arif <usamaarif642@gmail.com>
---
 include/linux/swap.h |  1 +
 mm/page_io.c         | 86 ++++++++++++++++++++++++++++++++++++++++++--
 mm/swapfile.c        | 10 ++++++
 3 files changed, 95 insertions(+), 2 deletions(-)

Comments

Johannes Weiner May 30, 2024, 12:27 p.m. UTC | #1
On Thu, May 30, 2024 at 11:19:07AM +0100, Usama Arif wrote:
> Approximately 10-20% of pages to be swapped out are zero pages [1].
> Rather than reading/writing these pages to flash resulting
> in increased I/O and flash wear, a bitmap can be used to mark these
> pages as zero at write time, and the pages can be filled at
> read time if the bit corresponding to the page is set.
> With this patch, NVMe writes in Meta server fleet decreased
> by almost 10% with conventional swap setup (zswap disabled).
> 
> [1]https://lore.kernel.org/all/20171018104832epcms5p1b2232e2236258de3d03d1344dde9fce0@epcms5p1/
> 
> Signed-off-by: Usama Arif <usamaarif642@gmail.com>

This is awesome.

> ---
>  include/linux/swap.h |  1 +
>  mm/page_io.c         | 86 ++++++++++++++++++++++++++++++++++++++++++--
>  mm/swapfile.c        | 10 ++++++
>  3 files changed, 95 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index a11c75e897ec..e88563978441 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -299,6 +299,7 @@ struct swap_info_struct {
>  	signed char	type;		/* strange name for an index */
>  	unsigned int	max;		/* extent of the swap_map */
>  	unsigned char *swap_map;	/* vmalloc'ed array of usage counts */
> +	unsigned long *zeromap;		/* vmalloc'ed bitmap to track zero pages */

One bit per swap slot, so 1 / (4096 * 8) = 0.003% static memory
overhead for configured swap space. That seems reasonable for what
appears to be a fairly universal 10% reduction in swap IO.
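(For example, 64G of swap is 16M slots, so the zeromap comes to 2M.)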

An alternative implementation would be to reserve a bit in
swap_map. This would be no overhead at idle, but would force
continuation counts earlier on heavily shared page tables, and AFAICS
would get complicated in terms of locking, whereas this one is pretty
simple (atomic ops protect the map, swapcache lock protects the bit).

So I prefer this version. But a few comments below:

>  	struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
>  	struct swap_cluster_list free_clusters; /* free clusters list */
>  	unsigned int lowest_bit;	/* index of first free in swap_map */
> diff --git a/mm/page_io.c b/mm/page_io.c
> index a360857cf75d..ab043b4ad577 100644
> --- a/mm/page_io.c
> +++ b/mm/page_io.c
> @@ -172,6 +172,77 @@ int generic_swapfile_activate(struct swap_info_struct *sis,
>  	goto out;
>  }
>  
> +static bool is_folio_page_zero_filled(struct folio *folio, int i)
> +{
> +	unsigned long *page;
> +	unsigned int pos;
> +	bool ret = false;
> +
> +	page = kmap_local_folio(folio, i * PAGE_SIZE);
> +	for (pos = 0; pos < PAGE_SIZE / sizeof(*page); pos++) {
> +		if (page[pos] != 0)
> +			goto out;
> +	}
> +	ret = true;
> +out:
> +	kunmap_local(page);
> +	return ret;
> +}
> +
> +static bool is_folio_zero_filled(struct folio *folio)
> +{
> +	unsigned int i;
> +
> +	for (i = 0; i < folio_nr_pages(folio); i++) {
> +		if (!is_folio_page_zero_filled(folio, i))
> +			return false;
> +	}
> +	return true;
> +}
> +
> +static void folio_page_zero_fill(struct folio *folio, int i)
> +{
> +	unsigned long *page;
> +
> +	page = kmap_local_folio(folio, i * PAGE_SIZE);
> +	memset_l(page, 0, PAGE_SIZE / sizeof(unsigned long));
> +	kunmap_local(page);
> +}
> +
> +static void folio_zero_fill(struct folio *folio)
> +{
> +	unsigned int i;
> +
> +	for (i = 0; i < folio_nr_pages(folio); i++)
> +		folio_page_zero_fill(folio, i);
> +}
> +
> +static void swap_zeromap_folio_set(struct folio *folio)
> +{
> +	struct swap_info_struct *sis = swp_swap_info(folio->swap);
> +	swp_entry_t entry;
> +	unsigned int i;
> +
> +	for (i = 0; i < folio_nr_pages(folio); i++) {
> +		entry = page_swap_entry(folio_page(folio, i));
> +		bitmap_set(sis->zeromap, swp_offset(entry), 1);

This should be set_bit(). bitmap_set() isn't atomic, so it would
corrupt the map on concurrent swapping of other zero pages. And you
don't need a range op here anyway.
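
Something along these lines (untested):

	for (i = 0; i < folio_nr_pages(folio); i++) {
		entry = page_swap_entry(folio_page(folio, i));
		/* set_bit() is atomic, so concurrent writers can't clobber the word */
		set_bit(swp_offset(entry), sis->zeromap);
	}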

> +	}
> +}
> +
> +static bool swap_zeromap_folio_test(struct folio *folio)
> +{
> +	struct swap_info_struct *sis = swp_swap_info(folio->swap);
> +	swp_entry_t entry;
> +	unsigned int i;
> +
> +	for (i = 0; i < folio_nr_pages(folio); i++) {
> +		entry = page_swap_entry(folio_page(folio, i));
> +		if (!test_bit(swp_offset(entry), sis->zeromap))
> +			return false;
> +	}
> +	return true;
> +}
> +
>  /*
>   * We may have stale swap cache pages in memory: notice
>   * them here and get rid of the unnecessary final write.
> @@ -195,6 +266,14 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
>  		folio_unlock(folio);
>  		return ret;
>  	}
> +
> +	if (is_folio_zero_filled(folio)) {
> +		swap_zeromap_folio_set(folio);
> +		folio_start_writeback(folio);
> +		folio_unlock(folio);
> +		folio_end_writeback(folio);
> +		return 0;
> +	}

You need to clear the zeromap bit in the else branch.

Userspace can change the contents of a swapcached page, which
redirties the page and forces an overwrite of the slot when the page
is reclaimed again.

So if the page goes from zeroes to something else and then gets
reclaimed again, a subsequent swapin would read the stale zeroes.
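
E.g. with an (untested) swap_zeromap_folio_clear() helper (not in this
patch) mirroring swap_zeromap_folio_set(), called for folios that are
not zero-filled before falling through to zswap_store() or the bio path:

/* suggested helper, not in this patch: mirrors swap_zeromap_folio_set() */
static void swap_zeromap_folio_clear(struct folio *folio)
{
	struct swap_info_struct *sis = swp_swap_info(folio->swap);
	swp_entry_t entry;
	unsigned int i;

	for (i = 0; i < folio_nr_pages(folio); i++) {
		entry = page_swap_entry(folio_page(folio, i));
		/* drop a stale zero marker before the slot is rewritten */
		clear_bit(swp_offset(entry), sis->zeromap);
	}
}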

>  	if (zswap_store(folio)) {
>  		folio_start_writeback(folio);
>  		folio_unlock(folio);
> @@ -515,8 +594,11 @@ void swap_read_folio(struct folio *folio, bool synchronous,
>  		psi_memstall_enter(&pflags);
>  	}
>  	delayacct_swapin_start();
> -
> -	if (zswap_load(folio)) {
> +	if (swap_zeromap_folio_test(folio)) {
> +		folio_zero_fill(folio);
> +		folio_mark_uptodate(folio);
> +		folio_unlock(folio);
> +	} else if (zswap_load(folio)) {
>  		folio_mark_uptodate(folio);
>  		folio_unlock(folio);
>  	} else if (data_race(sis->flags & SWP_FS_OPS)) {
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index f1e559e216bd..3f00a1cce5ba 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -461,6 +461,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
>  	 */
>  	memset(si->swap_map + idx * SWAPFILE_CLUSTER,
>  			SWAP_MAP_BAD, SWAPFILE_CLUSTER);
> +	bitmap_clear(si->zeromap, idx * SWAPFILE_CLUSTER, SWAPFILE_CLUSTER);

AFAICS this needs to be atomic as well. The swap_info and cluster are
locked, which stabilizes si->swap_map, but zeromap can see updates
from concurrent swap_writepage() and swap_read_folio() on other slots.

I think you need to use a loop over clear_bit(). Please also add a
comment with the above.
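
Something like (untested):

	unsigned int i;

	/*
	 * si->zeromap sees atomic updates from concurrent swap_writepage()
	 * on other slots in the same words, so the non-atomic bitmap_clear()
	 * could lose those updates; clear the bits one by one instead.
	 */
	for (i = 0; i < SWAPFILE_CLUSTER; i++)
		clear_bit(idx * SWAPFILE_CLUSTER + i, si->zeromap);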

>  
>  	cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
>  
> @@ -498,6 +499,7 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si)
>  		__free_cluster(si, idx);
>  		memset(si->swap_map + idx * SWAPFILE_CLUSTER,
>  				0, SWAPFILE_CLUSTER);
> +		bitmap_clear(si->zeromap, idx * SWAPFILE_CLUSTER, SWAPFILE_CLUSTER);

Same.

>  		unlock_cluster(ci);
>  	}
>  }
> @@ -1336,6 +1338,7 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
>  	count = p->swap_map[offset];
>  	VM_BUG_ON(count != SWAP_HAS_CACHE);
>  	p->swap_map[offset] = 0;
> +	bitmap_clear(p->zeromap, offset, 1);

This too needs to be atomic, IOW clear_bit().

Otherwise this looks good to me.

>  	dec_cluster_info_page(p, p->cluster_info, offset);
>  	unlock_cluster(ci);
>  
> @@ -2597,6 +2600,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>  	free_percpu(p->cluster_next_cpu);
>  	p->cluster_next_cpu = NULL;
>  	vfree(swap_map);
> +	bitmap_free(p->zeromap);
>  	kvfree(cluster_info);
>  	/* Destroy swap account information */
>  	swap_cgroup_swapoff(p->type);
> @@ -3123,6 +3127,12 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>  		goto bad_swap_unlock_inode;
>  	}
>  
> +	p->zeromap = bitmap_zalloc(maxpages, GFP_KERNEL);
> +	if (!p->zeromap) {
> +		error = -ENOMEM;
> +		goto bad_swap_unlock_inode;
> +	}
> +
>  	if (p->bdev && bdev_stable_writes(p->bdev))
>  		p->flags |= SWP_STABLE_WRITES;
>  
> -- 
> 2.43.0
Yosry Ahmed May 30, 2024, 4:20 p.m. UTC | #2
On Thu, May 30, 2024 at 3:21 AM Usama Arif <usamaarif642@gmail.com> wrote:
>
> Approximately 10-20% of pages to be swapped out are zero pages [1].
> Rather than reading/writing these pages to flash resulting
> in increased I/O and flash wear, a bitmap can be used to mark these
> pages as zero at write time, and the pages can be filled at
> read time if the bit corresponding to the page is set.
> With this patch, NVMe writes in Meta server fleet decreased
> by almost 10% with conventional swap setup (zswap disabled).

Great work. I thought about doing this after my attempt to drop some
of the same-filled pages handling in zswap; now we can drop all of it
:)

Make sure to CC other non-zswap folks, like Ying, on the next
iterations. I see Johannes already did that in his response. I suspect
get_maintainers will give you a few extra names.

>
> [1]https://lore.kernel.org/all/20171018104832epcms5p1b2232e2236258de3d03d1344dde9fce0@epcms5p1/
>
> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> ---
>  include/linux/swap.h |  1 +
>  mm/page_io.c         | 86 ++++++++++++++++++++++++++++++++++++++++++--
>  mm/swapfile.c        | 10 ++++++
>  3 files changed, 95 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index a11c75e897ec..e88563978441 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -299,6 +299,7 @@ struct swap_info_struct {
>         signed char     type;           /* strange name for an index */
>         unsigned int    max;            /* extent of the swap_map */
>         unsigned char *swap_map;        /* vmalloc'ed array of usage counts */
> +       unsigned long *zeromap;         /* vmalloc'ed bitmap to track zero pages */
>         struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
>         struct swap_cluster_list free_clusters; /* free clusters list */
>         unsigned int lowest_bit;        /* index of first free in swap_map */
> diff --git a/mm/page_io.c b/mm/page_io.c
> index a360857cf75d..ab043b4ad577 100644
> --- a/mm/page_io.c
> +++ b/mm/page_io.c
> @@ -172,6 +172,77 @@ int generic_swapfile_activate(struct swap_info_struct *sis,
>         goto out;
>  }
>
> +static bool is_folio_page_zero_filled(struct folio *folio, int i)
> +{
> +       unsigned long *page;

I just recently renamed this variable in the zswap version of this
function because it is very confusing, especially when looking for
struct page references. 'page' is usually used for struct page. Let's
use a different name here.

> +       unsigned int pos;
> +       bool ret = false;
> +
> +       page = kmap_local_folio(folio, i * PAGE_SIZE);

In the zswap version, we check the end of the page first, before the
loop, in case the page just has a bunch of zeros at its beginning. The
patch that added it to zswap reported better performance in some
cases [1].

You can also use memchr_inv() to compare the range against 0 instead
of the loop.

[1]https://lore.kernel.org/all/20230205190036.1730134-1-taejoon.song@lge.com/
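
Roughly (untested), with the tail check from [1] and the 'page' rename
(to 'data' here, pick whatever reads best):

static bool is_folio_page_zero_filled(struct folio *folio, int i)
{
	unsigned long *data;
	bool ret;

	data = kmap_local_folio(folio, i * PAGE_SIZE);
	/* cheap last-word check first, then scan the whole page */
	ret = !data[PAGE_SIZE / sizeof(*data) - 1] &&
	      !memchr_inv(data, 0, PAGE_SIZE);
	kunmap_local(data);
	return ret;
}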

> +       for (pos = 0; pos < PAGE_SIZE / sizeof(*page); pos++) {
> +               if (page[pos] != 0)
> +                       goto out;
> +       }
> +       ret = true;
> +out:
> +       kunmap_local(page);
> +       return ret;
> +}
> +
> +static bool is_folio_zero_filled(struct folio *folio)
> +{
> +       unsigned int i;
> +
> +       for (i = 0; i < folio_nr_pages(folio); i++) {
> +               if (!is_folio_page_zero_filled(folio, i))
> +                       return false;
> +       }
> +       return true;
> +}

Is there value in splitting this into two functions and having a
separate loop here? Can we just have a single function that kmaps the
entire folio and loops over it in one go?

> +
> +static void folio_page_zero_fill(struct folio *folio, int i)
> +{
> +       unsigned long *page;
> +
> +       page = kmap_local_folio(folio, i * PAGE_SIZE);
> +       memset_l(page, 0, PAGE_SIZE / sizeof(unsigned long));
> +       kunmap_local(page);
> +}
> +
> +static void folio_zero_fill(struct folio *folio)
> +{
> +       unsigned int i;
> +
> +       for (i = 0; i < folio_nr_pages(folio); i++)
> +               folio_page_zero_fill(folio, i);

I think you can just use clear_highpage() here and drop
folio_page_zero_fill(). It should be more optimized as well in some
cases.
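
i.e. (untested):

static void folio_zero_fill(struct folio *folio)
{
	unsigned int i;

	for (i = 0; i < folio_nr_pages(folio); i++)
		clear_highpage(folio_page(folio, i));
}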

I don't have further comments about the rest beyond Johannes's comments.
Yosry Ahmed May 30, 2024, 4:24 p.m. UTC | #3
On Thu, May 30, 2024 at 5:27 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Thu, May 30, 2024 at 11:19:07AM +0100, Usama Arif wrote:
> > Approximately 10-20% of pages to be swapped out are zero pages [1].
> > Rather than reading/writing these pages to flash resulting
> > in increased I/O and flash wear, a bitmap can be used to mark these
> > pages as zero at write time, and the pages can be filled at
> > read time if the bit corresponding to the page is set.
> > With this patch, NVMe writes in Meta server fleet decreased
> > by almost 10% with conventional swap setup (zswap disabled).
> >
> > [1]https://lore.kernel.org/all/20171018104832epcms5p1b2232e2236258de3d03d1344dde9fce0@epcms5p1/
> >
> > Signed-off-by: Usama Arif <usamaarif642@gmail.com>
>
> This is awesome.
>
> > ---
> >  include/linux/swap.h |  1 +
> >  mm/page_io.c         | 86 ++++++++++++++++++++++++++++++++++++++++++--
> >  mm/swapfile.c        | 10 ++++++
> >  3 files changed, 95 insertions(+), 2 deletions(-)
> >
> > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > index a11c75e897ec..e88563978441 100644
> > --- a/include/linux/swap.h
> > +++ b/include/linux/swap.h
> > @@ -299,6 +299,7 @@ struct swap_info_struct {
> >       signed char     type;           /* strange name for an index */
> >       unsigned int    max;            /* extent of the swap_map */
> >       unsigned char *swap_map;        /* vmalloc'ed array of usage counts */
> > +     unsigned long *zeromap;         /* vmalloc'ed bitmap to track zero pages */
>
> One bit per swap slot, so 1 / (4096 * 8) = 0.003% static memory
> overhead for configured swap space. That seems reasonable for what
> appears to be a fairly universal 10% reduction in swap IO.
>
> An alternative implementation would be to reserve a bit in
> swap_map. This would be no overhead at idle, but would force
> continuation counts earlier on heavily shared page tables, and AFAICS
> would get complicated in terms of locking, whereas this one is pretty
> simple (atomic ops protect the map, swapcache lock protects the bit).
>
> So I prefer this version. But a few comments below:

I am wondering if it's even possible to take this one step further and
avoid reclaiming zero-filled pages in the first place. Can we just
unmap them and let the first read fault allocate a zero'd page like
uninitialized memory, or point them at the zero page and make them
read-only, or something? Then we could free them directly without
going into the swap code to begin with.

That's how I thought about it initially when I attempted to support
only zero-filled pages in zswap. It could be a more complex
implementation though.

[..]
> > +
> > +static void swap_zeromap_folio_set(struct folio *folio)
> > +{
> > +     struct swap_info_struct *sis = swp_swap_info(folio->swap);
> > +     swp_entry_t entry;
> > +     unsigned int i;
> > +
> > +     for (i = 0; i < folio_nr_pages(folio); i++) {
> > +             entry = page_swap_entry(folio_page(folio, i));
> > +             bitmap_set(sis->zeromap, swp_offset(entry), 1);
>
> This should be set_bit(). bitmap_set() isn't atomic, so it would
> corrupt the map on concurrent swapping of other zero pages. And you
> don't need a range op here anyway.

It's a shame there is no range version of set_bit(). I suspect we can
save a few atomic operations on large folios if we write them in
chunks rather than one by one.
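
Large folios get contiguous swap offsets, so an (untested, purely
illustrative) helper could OR whole words in at a time with
atomic_long_or(), e.g.:

/* illustrative only: set nr contiguous bits, one atomic op per word */
static void zeromap_set_range(unsigned long *map, unsigned long start,
			      unsigned long nr)
{
	while (nr) {
		unsigned long bit = start % BITS_PER_LONG;
		unsigned long span = min(nr, BITS_PER_LONG - bit);
		unsigned long mask = (span == BITS_PER_LONG) ?
				~0UL : (((1UL << span) - 1) << bit);

		/* one atomic op covers up to BITS_PER_LONG slots */
		atomic_long_or(mask, (atomic_long_t *)(map + BIT_WORD(start)));
		start += span;
		nr -= span;
	}
}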
Nhat Pham May 30, 2024, 7:18 p.m. UTC | #4
On Thu, May 30, 2024 at 9:24 AM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> On Thu, May 30, 2024 at 5:27 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> >
> > On Thu, May 30, 2024 at 11:19:07AM +0100, Usama Arif wrote:
> > > Approximately 10-20% of pages to be swapped out are zero pages [1].
> > > Rather than reading/writing these pages to flash resulting
> > > in increased I/O and flash wear, a bitmap can be used to mark these
> > > pages as zero at write time, and the pages can be filled at
> > > read time if the bit corresponding to the page is set.
> > > With this patch, NVMe writes in Meta server fleet decreased
> > > by almost 10% with conventional swap setup (zswap disabled).
> > >
> > > [1]https://lore.kernel.org/all/20171018104832epcms5p1b2232e2236258de3d03d1344dde9fce0@epcms5p1/
> > >
> > > Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> >
> > This is awesome.
> >
> > > ---
> > >  include/linux/swap.h |  1 +
> > >  mm/page_io.c         | 86 ++++++++++++++++++++++++++++++++++++++++++--
> > >  mm/swapfile.c        | 10 ++++++
> > >  3 files changed, 95 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > > index a11c75e897ec..e88563978441 100644
> > > --- a/include/linux/swap.h
> > > +++ b/include/linux/swap.h
> > > @@ -299,6 +299,7 @@ struct swap_info_struct {
> > >       signed char     type;           /* strange name for an index */
> > >       unsigned int    max;            /* extent of the swap_map */
> > >       unsigned char *swap_map;        /* vmalloc'ed array of usage counts */
> > > +     unsigned long *zeromap;         /* vmalloc'ed bitmap to track zero pages */
> >
> > One bit per swap slot, so 1 / (4096 * 8) = 0.003% static memory
> > overhead for configured swap space. That seems reasonable for what
> > appears to be a fairly universal 10% reduction in swap IO.
> >
> > An alternative implementation would be to reserve a bit in
> > swap_map. This would be no overhead at idle, but would force
> > continuation counts earlier on heavily shared page tables, and AFAICS
> > would get complicated in terms of locking, whereas this one is pretty
> > simple (atomic ops protect the map, swapcache lock protects the bit).
> >
> > So I prefer this version. But a few comments below:
>
> I am wondering if it's even possible to take this one step further and
> avoid reclaiming zero-filled pages in the first place. Can we just
> unmap them and let the first read fault allocate a zero'd page like
> uninitialized memory, or point them at the zero page and make them
> read-only, or something? Then we could free them directly without
> going into the swap code to begin with.
>
> That's how I thought about it initially when I attempted to support
> only zero-filled pages in zswap. It could be a more complex
> implementation though.

We can aim for this eventually, but yeah the implementation will be
more complex. We'll need to be careful in handling shared zero pages,
synchronizing accesses and maintaining reference counts. I think we
will need to special-case swap cache and swap map for these zero pages
(a ghost zero swap device perhaps), or reinvent the wheel to manage
these pieces of information.

Not impossible, but annoying :) For now, I think Usama's approach is
clean enough and does the job.
Yosry Ahmed May 30, 2024, 7:49 p.m. UTC | #5
On Thu, May 30, 2024 at 12:18 PM Nhat Pham <nphamcs@gmail.com> wrote:
>
> On Thu, May 30, 2024 at 9:24 AM Yosry Ahmed <yosryahmed@google.com> wrote:
> >
> > On Thu, May 30, 2024 at 5:27 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > >
> > > On Thu, May 30, 2024 at 11:19:07AM +0100, Usama Arif wrote:
> > > > Approximately 10-20% of pages to be swapped out are zero pages [1].
> > > > Rather than reading/writing these pages to flash resulting
> > > > in increased I/O and flash wear, a bitmap can be used to mark these
> > > > pages as zero at write time, and the pages can be filled at
> > > > read time if the bit corresponding to the page is set.
> > > > With this patch, NVMe writes in Meta server fleet decreased
> > > > by almost 10% with conventional swap setup (zswap disabled).
> > > >
> > > > [1]https://lore.kernel.org/all/20171018104832epcms5p1b2232e2236258de3d03d1344dde9fce0@epcms5p1/
> > > >
> > > > Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> > >
> > > This is awesome.
> > >
> > > > ---
> > > >  include/linux/swap.h |  1 +
> > > >  mm/page_io.c         | 86 ++++++++++++++++++++++++++++++++++++++++++--
> > > >  mm/swapfile.c        | 10 ++++++
> > > >  3 files changed, 95 insertions(+), 2 deletions(-)
> > > >
> > > > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > > > index a11c75e897ec..e88563978441 100644
> > > > --- a/include/linux/swap.h
> > > > +++ b/include/linux/swap.h
> > > > @@ -299,6 +299,7 @@ struct swap_info_struct {
> > > >       signed char     type;           /* strange name for an index */
> > > >       unsigned int    max;            /* extent of the swap_map */
> > > >       unsigned char *swap_map;        /* vmalloc'ed array of usage counts */
> > > > +     unsigned long *zeromap;         /* vmalloc'ed bitmap to track zero pages */
> > >
> > > One bit per swap slot, so 1 / (4096 * 8) = 0.003% static memory
> > > overhead for configured swap space. That seems reasonable for what
> > > appears to be a fairly universal 10% reduction in swap IO.
> > >
> > > An alternative implementation would be to reserve a bit in
> > > swap_map. This would be no overhead at idle, but would force
> > > continuation counts earlier on heavily shared page tables, and AFAICS
> > > would get complicated in terms of locking, whereas this one is pretty
> > > simple (atomic ops protect the map, swapcache lock protects the bit).
> > >
> > > So I prefer this version. But a few comments below:
> >
> > I am wondering if it's even possible to take this one step further and
> > avoid reclaiming zero-filled pages in the first place. Can we just
> > unmap them and let the first read fault allocate a zero'd page like
> > uninitialized memory, or point them at the zero page and make them
> > read-only, or something? Then we could free them directly without
> > going into the swap code to begin with.
> >
> > That's how I thought about it initially when I attempted to support
> > only zero-filled pages in zswap. It could be a more complex
> > implementation though.
>
> We can aim for this eventually, but yeah the implementation will be
> more complex. We'll need to be careful in handling shared zero pages,
> synchronizing accesses and maintaining reference counts. I think we
> will need to special-case swap cache and swap map for these zero pages
> (a ghost zero swap device perhaps), or reinvent the wheel to manage
> these pieces of information.

Isn't there an existing mechanism to have read-only mappings pointing
at the shared zero page, and do COW? Can't we just use that?

I think this is already what we do for mapped areas that were never
written in some cases (see do_anonymous_page()), so it would be just
like that (i.e. as if the mappings were never written). Someone with
more familiarity with this would know better though.

>
> Not impossible, but annoying :) For now, I think Usama's approach is
> clean enough and does the job.

Yeah, I am not against Usama's approach at all. I just want us to
consider both options before we commit to one. If they are close
enough in complexity, it may be worth avoiding swap completely.
Andrew Morton May 30, 2024, 7:58 p.m. UTC | #6
On Thu, 30 May 2024 11:19:07 +0100 Usama Arif <usamaarif642@gmail.com> wrote:

> Approximately 10-20% of pages to be swapped out are zero pages [1].
> Rather than reading/writing these pages to flash resulting
> in increased I/O and flash wear, a bitmap can be used to mark these
> pages as zero at write time, and the pages can be filled at
> read time if the bit corresponding to the page is set.
> With this patch, NVMe writes in Meta server fleet decreased
> by almost 10% with conventional swap setup (zswap disabled).

A little nitlet as you'll be altering the code...

> --- a/mm/page_io.c
> +++ b/mm/page_io.c
> @@ -172,6 +172,77 @@ int generic_swapfile_activate(struct swap_info_struct *sis,
>  	goto out;
>  }
>  
> +static bool is_folio_page_zero_filled(struct folio *folio, int i)
> +{
> +	unsigned long *page;
> +	unsigned int pos;
> +	bool ret = false;
> +
> +	page = kmap_local_folio(folio, i * PAGE_SIZE);

It's rather expected that a local variable called `page' has type
`struct page *'.  Can we use something like `addr' here?

> +	for (pos = 0; pos < PAGE_SIZE / sizeof(*page); pos++) {
> +		if (page[pos] != 0)
> +			goto out;
> +	}
> +	ret = true;
> +out:
> +	kunmap_local(page);
> +	return ret;
> +}
Matthew Wilcox May 30, 2024, 8:04 p.m. UTC | #7
On Thu, May 30, 2024 at 09:24:20AM -0700, Yosry Ahmed wrote:
> I am wondering if it's even possible to take this one step further and
> avoid reclaiming zero-filled pages in the first place. Can we just
> unmap them and let the first read fault allocate a zero'd page like
> uninitialized memory, or point them at the zero page and make them
> read-only, or something? Then we could free them directly without
> going into the swap code to begin with.

I was having similar thoughts.  You can see in do_anonymous_page() that
we simply map the shared zero page when we take a read fault on
unallocated anon memory.

So my question is where are all these zero pages coming from in the Meta
fleet?  Obviously we never try to swap out the shared zero page (it's
not on any LRU list).  So I see three possibilities:

 - Userspace wrote to it, but it wrote zeroes.  Then we did a memcmp(),
   discovered it was zeroes and fall into this path.  It would be safe
   to just discard this page.
 - We allocated it as part of a THP.  We never wrote to this particular
   page of the THP, so it's zero-filled.  While it's safe to just
   discard this page, we might want to write it for better swap-in
   performance.
 - Userspace wrote non-zeroes to it, then wrote zeroes to it before
   abandoning use of this page, and so it eventually got swapped out.
   Perhaps we could teach userspace to MADV_DONTNEED the page instead?

Has any data been gathered on this?  Maybe there are other sources of
zeroed pages that I'm missing.  I do remember a presentation at LSFMM
in 2022 from Google about very sparsely used THPs.
Yosry Ahmed May 30, 2024, 8:16 p.m. UTC | #8
On Thu, May 30, 2024 at 1:04 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Thu, May 30, 2024 at 09:24:20AM -0700, Yosry Ahmed wrote:
> > I am wondering if it's even possible to take this one step further and
> > avoid reclaiming zero-filled pages in the first place. Can we just
> > unmap them and let the first read fault allocate a zero'd page like
> > uninitialized memory, or point them at the zero page and make them
> > read-only, or something? Then we could free them directly without
> > going into the swap code to begin with.
>
> I was having similar thoughts.  You can see in do_anonymous_page() that
> we simply map the shared zero page when we take a read fault on
> unallocated anon memory.
>
> So my question is where are all these zero pages coming from in the Meta
> fleet?  Obviously we never try to swap out the shared zero page (it's
> not on any LRU list).  So I see three possibilities:
>
>  - Userspace wrote to it, but it wrote zeroes.  Then we did a memcmp(),
>    discovered it was zeroes and fall into this path.  It would be safe
>    to just discard this page.
>  - We allocated it as part of a THP.  We never wrote to this particular
>    page of the THP, so it's zero-filled.  While it's safe to just
>    discard this page, we might want to write it for better swap-in
>    performance.

My understanding is that here we check if the entire folio is
zero-filled. If the THP is still intact as a single folio, we will
only apply the optimization if the entire THP is zero-filled. If we
are checking a page that used to be part of a THP, then I think the
THP is already split and swap-in performance would not be affected.

Did I miss anything here?

>  - Userspace wrote non-zeroes to it, then wrote zeroes to it before
>    abandoning use of this page, and so it eventually got swapped out.
>    Perhaps we could teach userspace to MADV_DONTNEED the page instead?

Why wouldn't it be safe to discard the page in this case as well?

>
> Has any data been gathered on this?  Maybe there are other sources of
> zeroed pages that I'm missing.  I do remember a presentation at LSFMM
> in 2022 from Google about very sparsely used THPs.

Apart from that, we may also want to think about shmem if we want a
general approach to avoid swapping out zero pages.
Usama Arif May 31, 2024, 6:18 p.m. UTC | #9
On 30/05/2024 21:04, Matthew Wilcox wrote:
> On Thu, May 30, 2024 at 09:24:20AM -0700, Yosry Ahmed wrote:
>> I am wondering if it's even possible to take this one step further and
>> avoid reclaiming zero-filled pages in the first place. Can we just
>> unmap them and let the first read fault allocate a zero'd page like
>> uninitialized memory, or point them at the zero page and make them
>> read-only, or something? Then we could free them directly without
>> going into the swap code to begin with.
> I was having similar thoughts.  You can see in do_anonymous_page() that
> we simply map the shared zero page when we take a read fault on
> unallocated anon memory.

Thanks Yosry and Matthew. I'm currently prototyping this to see how it
might look; hopefully I'll have an update next week.

> So my question is where are all these zero pages coming from in the Meta
> fleet?  Obviously we never try to swap out the shared zero page (it's
> not on any LRU list).  So I see three possibilities:
>
>   - Userspace wrote to it, but it wrote zeroes.  Then we did a memcmp(),
>     discovered it was zeroes and fall into this path.  It would be safe
>     to just discard this page.
>   - We allocated it as part of a THP.  We never wrote to this particular
>     page of the THP, so it's zero-filled.  While it's safe to just
>     discard this page, we might want to write it for better swap-in
>     performance.

It's mostly THP. Alex presented the numbers well in his THP series
https://lore.kernel.org/lkml/cover.1661461643.git.alexlzhu@fb.com/


>   - Userspace wrote non-zeroes to it, then wrote zeroes to it before
>     abandoning use of this page, and so it eventually got swapped out.
>     Perhaps we could teach userspace to MADV_DONTNEED the page instead?
>
> Has any data been gathered on this?  Maybe there are other sources of
> zeroed pages that I'm missing.  I do remember a presentation at LSFMM
> in 2022 from Google about very sparsely used THPs.

Patch

diff --git a/include/linux/swap.h b/include/linux/swap.h
index a11c75e897ec..e88563978441 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -299,6 +299,7 @@  struct swap_info_struct {
 	signed char	type;		/* strange name for an index */
 	unsigned int	max;		/* extent of the swap_map */
 	unsigned char *swap_map;	/* vmalloc'ed array of usage counts */
+	unsigned long *zeromap;		/* vmalloc'ed bitmap to track zero pages */
 	struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
 	struct swap_cluster_list free_clusters; /* free clusters list */
 	unsigned int lowest_bit;	/* index of first free in swap_map */
diff --git a/mm/page_io.c b/mm/page_io.c
index a360857cf75d..ab043b4ad577 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -172,6 +172,77 @@  int generic_swapfile_activate(struct swap_info_struct *sis,
 	goto out;
 }
 
+static bool is_folio_page_zero_filled(struct folio *folio, int i)
+{
+	unsigned long *page;
+	unsigned int pos;
+	bool ret = false;
+
+	page = kmap_local_folio(folio, i * PAGE_SIZE);
+	for (pos = 0; pos < PAGE_SIZE / sizeof(*page); pos++) {
+		if (page[pos] != 0)
+			goto out;
+	}
+	ret = true;
+out:
+	kunmap_local(page);
+	return ret;
+}
+
+static bool is_folio_zero_filled(struct folio *folio)
+{
+	unsigned int i;
+
+	for (i = 0; i < folio_nr_pages(folio); i++) {
+		if (!is_folio_page_zero_filled(folio, i))
+			return false;
+	}
+	return true;
+}
+
+static void folio_page_zero_fill(struct folio *folio, int i)
+{
+	unsigned long *page;
+
+	page = kmap_local_folio(folio, i * PAGE_SIZE);
+	memset_l(page, 0, PAGE_SIZE / sizeof(unsigned long));
+	kunmap_local(page);
+}
+
+static void folio_zero_fill(struct folio *folio)
+{
+	unsigned int i;
+
+	for (i = 0; i < folio_nr_pages(folio); i++)
+		folio_page_zero_fill(folio, i);
+}
+
+static void swap_zeromap_folio_set(struct folio *folio)
+{
+	struct swap_info_struct *sis = swp_swap_info(folio->swap);
+	swp_entry_t entry;
+	unsigned int i;
+
+	for (i = 0; i < folio_nr_pages(folio); i++) {
+		entry = page_swap_entry(folio_page(folio, i));
+		bitmap_set(sis->zeromap, swp_offset(entry), 1);
+	}
+}
+
+static bool swap_zeromap_folio_test(struct folio *folio)
+{
+	struct swap_info_struct *sis = swp_swap_info(folio->swap);
+	swp_entry_t entry;
+	unsigned int i;
+
+	for (i = 0; i < folio_nr_pages(folio); i++) {
+		entry = page_swap_entry(folio_page(folio, i));
+		if (!test_bit(swp_offset(entry), sis->zeromap))
+			return false;
+	}
+	return true;
+}
+
 /*
  * We may have stale swap cache pages in memory: notice
  * them here and get rid of the unnecessary final write.
@@ -195,6 +266,14 @@  int swap_writepage(struct page *page, struct writeback_control *wbc)
 		folio_unlock(folio);
 		return ret;
 	}
+
+	if (is_folio_zero_filled(folio)) {
+		swap_zeromap_folio_set(folio);
+		folio_start_writeback(folio);
+		folio_unlock(folio);
+		folio_end_writeback(folio);
+		return 0;
+	}
 	if (zswap_store(folio)) {
 		folio_start_writeback(folio);
 		folio_unlock(folio);
@@ -515,8 +594,11 @@  void swap_read_folio(struct folio *folio, bool synchronous,
 		psi_memstall_enter(&pflags);
 	}
 	delayacct_swapin_start();
-
-	if (zswap_load(folio)) {
+	if (swap_zeromap_folio_test(folio)) {
+		folio_zero_fill(folio);
+		folio_mark_uptodate(folio);
+		folio_unlock(folio);
+	} else if (zswap_load(folio)) {
 		folio_mark_uptodate(folio);
 		folio_unlock(folio);
 	} else if (data_race(sis->flags & SWP_FS_OPS)) {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f1e559e216bd..3f00a1cce5ba 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -461,6 +461,7 @@  static void swap_cluster_schedule_discard(struct swap_info_struct *si,
 	 */
 	memset(si->swap_map + idx * SWAPFILE_CLUSTER,
 			SWAP_MAP_BAD, SWAPFILE_CLUSTER);
+	bitmap_clear(si->zeromap, idx * SWAPFILE_CLUSTER, SWAPFILE_CLUSTER);
 
 	cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
 
@@ -498,6 +499,7 @@  static void swap_do_scheduled_discard(struct swap_info_struct *si)
 		__free_cluster(si, idx);
 		memset(si->swap_map + idx * SWAPFILE_CLUSTER,
 				0, SWAPFILE_CLUSTER);
+		bitmap_clear(si->zeromap, idx * SWAPFILE_CLUSTER, SWAPFILE_CLUSTER);
 		unlock_cluster(ci);
 	}
 }
@@ -1336,6 +1338,7 @@  static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
 	count = p->swap_map[offset];
 	VM_BUG_ON(count != SWAP_HAS_CACHE);
 	p->swap_map[offset] = 0;
+	bitmap_clear(p->zeromap, offset, 1);
 	dec_cluster_info_page(p, p->cluster_info, offset);
 	unlock_cluster(ci);
 
@@ -2597,6 +2600,7 @@  SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	free_percpu(p->cluster_next_cpu);
 	p->cluster_next_cpu = NULL;
 	vfree(swap_map);
+	bitmap_free(p->zeromap);
 	kvfree(cluster_info);
 	/* Destroy swap account information */
 	swap_cgroup_swapoff(p->type);
@@ -3123,6 +3127,12 @@  SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 		goto bad_swap_unlock_inode;
 	}
 
+	p->zeromap = bitmap_zalloc(maxpages, GFP_KERNEL);
+	if (!p->zeromap) {
+		error = -ENOMEM;
+		goto bad_swap_unlock_inode;
+	}
+
 	if (p->bdev && bdev_stable_writes(p->bdev))
 		p->flags |= SWP_STABLE_WRITES;