[RFC,0/4] mm: zswap: add support for zswapin of large folios

Message ID 20241018105026.2521366-1-usamaarif642@gmail.com (mailing list archive)
State New

Commit Message

Usama Arif Oct. 18, 2024, 10:48 a.m. UTC
After large folio zswapout support was added in [1], this series adds
support for zswapin of large folios to bring zswap on par with zram.
It makes sure that the benefits of large folios (fewer page faults,
batched PTE and rmap manipulation, shorter LRU lists, TLB coalescing
on arm64 and AMD) are not lost when memory is swapped out to, and
swapped back in from, zswap.

It builds on top of [2], which added large folio swapin support for
zram, and provides the same level of large folio swapin support as
zram, i.e. only swap count == 1 is supported.

Patch 1 skips the swapcache when swapping in zswap pages; this should
improve swapin performance in the no-readahead case [3], and it also
allows us to build on the large folio swapin support added in [2],
hence it is a prerequisite for patch 3.
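
As a rough sketch of the patch 1 idea (illustrative only, not the
actual patch; can_skip_swapcache() and zswap_present_test() are
hypothetical names used just for this sketch): the swapin fast path
bypasses the swapcache only when the entry is exclusively owned and
the backend can complete the read synchronously, and the idea here is
to let zswap-backed entries qualify as well:

/*
 * Sketch only. zswap_present_test() is a hypothetical helper that
 * reports whether the entry is currently held in zswap; the remaining
 * identifiers are existing kernel ones.
 */
static bool can_skip_swapcache(struct swap_info_struct *si, swp_entry_t entry)
{
	/* Synchronous: a SWP_SYNCHRONOUS_IO device, or data held in zswap. */
	bool synchronous = data_race(si->flags & SWP_SYNCHRONOUS_IO) ||
			   zswap_present_test(si, entry);

	/* Only safe when nothing else references this swap entry. */
	return synchronous && __swap_count(entry) == 1;
}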

Patch 3 adds support for large folio zswapin. It does not add support
for hybrid backends (i.e. folios that are partly in the backing swap
device and partly in zswap).
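
A rough sketch of the patch 3 idea (illustrative only, not the actual
code; zswap_load_large_folio() is a name made up for this sketch): each
subpage of the large folio is looked up and decompressed individually,
and if any entry is missing from zswap the load fails so the caller can
fall back to order-0 swapin:

/* Sketch only: load an order > 0 folio entirely from zswap. */
static bool zswap_load_large_folio(struct xarray *tree, pgoff_t offset,
				   struct folio *folio)
{
	for (long i = 0; i < folio_nr_pages(folio); i++) {
		struct zswap_entry *entry = xa_load(tree, offset + i);

		/* Hybrid folios (part swap, part zswap) are not supported. */
		if (!entry)
			return false;

		/*
		 * Patch 2 changes zswap_decompress() to take a page, so each
		 * subpage of the folio can be filled individually.
		 */
		zswap_decompress(entry, folio_page(folio, i));
	}
	return true;
}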

The main performance benefit comes from maintaining large folios
*after* swapin; large folio performance improvements have already been
covered in previous series [2], [4], so they are not repeated here.
Below is a simple microbenchmark measuring the time needed to zswpin
1G of memory (along with a memory integrity check); it is the
test_zswap.c selftest added in the diff below.

                                |  no mTHP (ms) | 1M mTHP enabled (ms)
Base kernel                     |   1165        |    1163
Kernel with mTHP zswpin series  |   1203        |     738

The measured time was fairly consistent between runs (~1-2% variation).
With 1M folios, zswapin time improves by roughly 36% over the base
kernel (1163 ms -> 738 ms). The improvement would likely be larger if
the memcmp were removed.

[1] https://lore.kernel.org/all/20241001053222.6944-1-kanchana.p.sridhar@intel.com/
[2] https://lore.kernel.org/all/20240821074541.516249-1-hanchuanhua@oppo.com/
[3] https://lore.kernel.org/all/1505886205-9671-5-git-send-email-minchan@kernel.org/T/#u
[4] https://lwn.net/Articles/955575/

Usama Arif (4):
  mm/zswap: skip swapcache for swapping in zswap pages
  mm/zswap: modify zswap_decompress to accept page instead of folio
  mm/zswap: add support for large folio zswapin
  mm/zswap: count successful large folio zswap loads

 Documentation/admin-guide/mm/transhuge.rst |   3 +
 include/linux/huge_mm.h                    |   1 +
 include/linux/zswap.h                      |   6 ++
 mm/huge_memory.c                           |   3 +
 mm/memory.c                                |  16 +--
 mm/page_io.c                               |   2 +-
 mm/zswap.c                                 | 120 ++++++++++++++-------
 7 files changed, 99 insertions(+), 52 deletions(-)

Patch

diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c
index 40de679248b8..77068c577c86 100644
--- a/tools/testing/selftests/cgroup/test_zswap.c
+++ b/tools/testing/selftests/cgroup/test_zswap.c
@@ -9,6 +9,8 @@ 
 #include <string.h>
 #include <sys/wait.h>
 #include <sys/mman.h>
+#include <sys/time.h>
+#include <malloc.h>
 
 #include "../kselftest.h"
 #include "cgroup_util.h"
@@ -407,6 +409,74 @@  static int test_zswap_writeback_disabled(const char *root)
        return test_zswap_writeback(root, false);
 }
 
+static int zswapin_perf(const char *cgroup, void *arg)
+{
+       long pagesize = sysconf(_SC_PAGESIZE);
+       size_t memsize = MB(1*1024);
+       char buf[pagesize];
+       int ret = -1;
+       char *mem;
+       struct timeval start, end;
+
+       mem = (char *)memalign(2*1024*1024, memsize); /* 2M-aligned so (m)THP can back it */
+       if (!mem)
+               return ret;
+
+       /*
+        * Fill half of each page with increasing data and keep the other
+        * half empty. This keeps the pages compressible, so they end up
+        * in zswap with a meaningful zswap footprint.
+        */
+       for (int i = 0; i < pagesize; i++)
+               buf[i] = i < pagesize/2 ? (char) i : 0;
+
+       for (int i = 0; i < memsize; i += pagesize)
+               memcpy(&mem[i], buf, pagesize);
+
+       /* Try to reclaim the allocated memory into zswap */
+       if (cg_write_numeric(cgroup, "memory.reclaim", memsize)) {
+               ksft_print_msg("Failed to reclaim all of the requested memory\n");
+               goto out;
+       }
+
+       gettimeofday(&start, NULL);
+       /* Fault each page back in from zswap (zswpin) and verify its contents */
+       for (int i = 0; i < memsize; i += pagesize) {
+               if (memcmp(&mem[i], buf, pagesize)) {
+                       ksft_print_msg("invalid memory\n");
+                       goto out;
+               }
+       }
+       gettimeofday(&end, NULL);
+       printf("zswapin took %f ms to run.\n", (end.tv_sec - start.tv_sec)*1000 + (double)(end.tv_usec - start.tv_usec) / 1000);
+       ret = 0;
+out:
+       free(mem);
+       return ret;
+}
+
+static int test_zswapin_perf(const char *root)
+{
+       int ret = KSFT_FAIL;
+       char *test_group;
+
+       test_group = cg_name(root, "zswapin_perf_test");
+       if (!test_group)
+               goto out;
+       if (cg_create(test_group))
+               goto out;
+
+       if (cg_run(test_group, zswapin_perf, NULL))
+               goto out;
+
+       ret = KSFT_PASS;
+out:
+       cg_destroy(test_group);
+       free(test_group);
+       return ret;
+}
+
 /*
  * When trying to store a memcg page in zswap, if the memcg hits its memory
  * limit in zswap, writeback should affect only the zswapped pages of that
@@ -584,6 +654,7 @@  struct zswap_test {
        T(test_zswapin),
        T(test_zswap_writeback_enabled),
        T(test_zswap_writeback_disabled),
+       T(test_zswapin_perf),
        T(test_no_kmem_bypass),
        T(test_no_invasive_cgroup_shrink),
 };